Code and Stuff

I am just learning Python, but I will post some stuff here until I have my GitHub up and running … still learning, so the code might be full of errors 🙂 All code was made in JupyterLab and exported as running code.

Google Fonts Crawler
#!/usr/bin/env python
# coding: utf-8

# ## Code to check for Google Fonts on your page

# This is just some web-scraping code that checks your webpage for links,
# visits those pages, removes duplicates, and checks each one for Google Fonts!
# I take no responsibility for the code; this is just for my personal use, but you are free to use it for yourself.
# Don't pay money to check your webpage for Google Fonts. Nah, no one wants that!
# 
# Published under:
# CC BY-NC 4.0 Deed
# Attribution-NonCommercial 4.0 International
# https://creativecommons.org/licenses/by-nc/4.0/legalcode.txt
# 
# °love°peace°python°
# 
# °tofoo°
# 
# 
# ..... I should set up a GitHub account but am too lazy for that ....
# .............maybe in the future when my coding gets better :)  .....

# In[ ]:

import requests
import re
import time

link_list = []           # links found on the starting page
link_list_cleared = []   # not used in the current version
complete_list = []       # all same-site links collected while crawling


# ## Enter your starting point:
# 

# In[1]:

starting_link = input("Just tell me a link where to start! : ")
if not starting_link:
    starting_link = 'https://www.froschau.at'
print("I'll start here -> " + starting_link)

# In[ ]:

def check_for_google(link):
    # download the page and grep the raw HTML for the two Google Fonts hosts
    res = requests.get(link)

    suchstring1 = re.compile(r'fonts\.googleapis\.com')
    suchstring2 = re.compile(r'fonts\.gstatic\.com')
    mo1 = suchstring1.findall(res.text)
    mo2 = suchstring2.findall(res.text)

    for i in mo1:
        print(i)
    if not mo1:
        print(f"couldn't find fonts.googleapis.com in {link}")
    for i in mo2:
        print(i)
    if not mo2:
        print(f"couldn't find fonts.gstatic.com in {link}")


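# Quick offline sketch (my own addition, not part of the crawl): the check above
# just greps the raw HTML for the two Google Fonts host names, so a tiny made-up
# snippet is enough to see what it matches.

# In[ ]:


sample_html = '<link href="https://fonts.googleapis.com/css2?family=Roboto" rel="stylesheet">'
print(re.findall(r'fonts\.googleapis\.com', sample_html))  # -> ['fonts.googleapis.com']
print(re.findall(r'fonts\.gstatic\.com', sample_html))     # -> []
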
# In[ ]:


def check_link(link):
    # download the page, pull every URL-looking string out of the HTML and
    # collect the ones that belong to the site we started from
    res = requests.get(link)
    suchstring5 = re.compile(r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b')
    mo5 = suchstring5.findall(res.text)
    for i in mo5:
        if starting_link in i:
            if i in complete_list:
                print(f"{i} link already in list")
            else:
                complete_list.append(i)
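# Tiny sketch of the filtering idea in check_link() (the URLs below are made up
# for illustration): only URLs that contain the starting link are collected,
# everything external is ignored.

# In[ ]:


demo_urls = [starting_link + "/kontakt/", "https://www.example.org/"]
for url in demo_urls:
    print(url, "-> kept" if starting_link in url else "-> skipped")
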


# # Get the first page and extract links

# In[ ]:


# fetch the starting page and pull every URL-looking string out of its HTML;
# only links that point back to the starting site are kept for the crawl
res = requests.get(starting_link)

# same URL pattern as in check_link()
suchstring5 = re.compile(r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b')
mo5 = suchstring5.findall(res.text)

for i in mo5:
    if starting_link in i:
        print(i)
        link_list.append(i)


# # Check the list and go through every link

# In[ ]:


# visit every link found on the starting page and collect more same-site links
for i in link_list:
    check_link(i)
    time.sleep(1)  # be polite: wait a second between requests
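# Sketch of a more defensive variant of the loop above (my own suggestion, not
# part of the original flow): one failing request gets reported and skipped
# instead of stopping the whole crawl.

# In[ ]:


def crawl_politely(links):
    for link in links:
        try:
            check_link(link)
        except requests.RequestException as err:
            print(f"skipping {link}: {err}")
        time.sleep(1)  # stay polite: one request per second
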


# # Remove duplicates

# In[ ]:


complete_list = list(dict.fromkeys(complete_list))
for item in complete_list:
    print(item)
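# Side note: dict.fromkeys() works here as an order-preserving set. It keeps the
# first occurrence of every link and drops the rest. A tiny demo:

# In[ ]:


print(list(dict.fromkeys(["a", "b", "a", "c"])))  # -> ['a', 'b', 'c']
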


# # Check for Google Fonts

# In[ ]:


for i in complete_list:
    check_for_google(i)
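
# Optional extra (my own idea, not part of the original flow): dump the crawled
# link list to a text file so the result can be looked at again later. The file
# name is just a placeholder.

# In[ ]:


with open("checked_links.txt", "w") as f:
    f.write("\n".join(complete_list))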

https://github.com/ozzoholla