I am just learning Python, but I will post some things here until I have my GitHub up and running. Still learning, so the code might contain errors 🙂 All code was made with JupyterLab and exported as runnable scripts.
Google Fonts Crawler
#!/usr/bin/env python
# coding: utf-8
# ## Code to check for google Fonts on your page
# This is just some web-scraping code that checks your web page for links,
# checks those linked pages in turn, removes duplicates, and checks each one for Google Fonts!<br>
# I take no responsibility for the code; this is just for my personal use, but you are free to use it yourself.<br>
# Don't pay money to check your web page for Google Fonts — nobody wants that!<br>
#
# Published under :<br>
# CC BY-NC 4.0 Deed<br>
# Attribution-NonCommercial 4.0 International <br>
# https://creativecommons.org/licenses/by-nc/4.0/legalcode.txt
#
# °love°peace°python°<br>
# <br>
# °tofoo°
# <br>
# <br>
# ..... i should set up a git hub account but am too lazy for that ....<br>
# .............maybe in the future when my coding gets better :) .....<br>
# In[ ]:
import requests
import re
import time
# Shared crawl state: links harvested from the starting page, a (currently
# unused) cleared list, and the de-duplicated same-site links found overall.
link_list, link_list_cleared, complete_list = [], [], []
# ## Enter your starting point:
#
# In[1]:
# Ask the user where to start crawling. The input is stripped so that a
# whitespace-only answer also falls back to the default (the original
# empty-check let "   " through as a bogus URL).
starting_link = input("just tell me a link where to start ! :").strip()
if not starting_link:
    # Development default — handy for a quick smoke test.
    starting_link = 'https://www.froschau.at'
print("ill start here ->" + starting_link)
# In[ ]:
def check_for_google(link):
    """Fetch *link* and report whether its HTML references Google Fonts.

    Searches the raw response text for the two Google Fonts hostnames
    (fonts.googleapis.com and fonts.gstatic.com), prints every hit, and
    prints a "couldnt find" message per hostname when there is none.

    Parameters
    ----------
    link : str
        Absolute URL of the page to check.
    """
    # Timeout so one dead host cannot hang the whole crawl.
    res = requests.get(link, timeout=10)
    # Dots are escaped: the original patterns used bare '.', which matches
    # ANY character (e.g. "fontsXgoogleapis.com" would have counted as a hit).
    suchstring1 = re.compile(r'fonts\.googleapis\.com')
    suchstring2 = re.compile(r'fonts\.gstatic\.com')
    mo1 = suchstring1.findall(res.text)
    mo2 = suchstring2.findall(res.text)
    for i in mo1:
        print(i)
    if not mo1:
        print(f"couldnt find fonts.googleapis.com in {link}")
    for i in mo2:
        print(i)
    if not mo2:
        print(f"couldnt find fonts.gstatic.com in {link}")
# In[ ]:
def check_link(link):
    """Fetch *link* and collect same-site URLs into the global complete_list.

    Every URL found in the page that contains the global ``starting_link``
    substring is appended to the global ``complete_list`` (skipping ones
    already present, with a console note).

    Parameters
    ----------
    link : str
        Absolute URL of the page to scan for further links.
    """
    res = requests.get(link)
    # Catch-all URL matcher: optional http(s) scheme, then a hostname,
    # an IPv4 address, or an IPv6 address, an optional port, and an
    # optional path. Group 1 captures the whole URL, so findall() yields
    # plain strings.
    suchstring5 = re.compile(r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b')
    mo5 = suchstring5.findall(res.text)
    for i in mo5:
        # Old character-by-character prefix check, kept for reference:
        #if(i[0]=="h" and i[1]=="t" and i[2]=="t" and i[3]=="p" and i[4]=="s" and i[5]==":" and i[6]=="/" and i[7]=="/" and i[8]=="w"and i[9]=="w"and i[10]=="w"and i[11]=="."and i[12]=="f"):
        # Only keep links that belong to the site we started from.
        if starting_link in i:
            if i in complete_list:
                print(f"{i} link already in list")
            else:
                complete_list.append(i)
                #print(i)
                #link_list.append(i)
# # get the first page and extract links
# In[ ]:
# Fetch the starting page and extract every same-site link into link_list.
res = requests.get(starting_link)
# Catch-all URL matcher: optional http(s) scheme, then a hostname, an IPv4
# address, or an IPv6 address, an optional port, and an optional path.
# Group 1 captures the whole URL, so findall() yields plain strings.
# (The original also compiled five other experimental patterns —
# suchstring/suchstring1..4 — whose results were never used; that dead
# code has been removed.)
suchstring5 = re.compile(r'\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b')
mo5 = suchstring5.findall(res.text)
for i in mo5:
    # Only keep links that belong to the site we started from.
    if starting_link in i:
        print(i)
        link_list.append(i)
# # check list and go through every link
# In[ ]:
# Visit every link harvested from the starting page; check_link() appends
# the same-site URLs it finds on each page to complete_list.
for i in link_list:
    check_link(i)
    time.sleep(1)  # one request per second — be polite to the server
# # remove doubles
# In[ ]:
# Drop duplicate entries while keeping first-seen order, then show the result.
_seen = set()
_unique = []
for _entry in complete_list:
    if _entry not in _seen:
        _seen.add(_entry)
        _unique.append(_entry)
complete_list = _unique
for item in complete_list:
    print(item)
# # check for google fonts
# In[ ]:
# Finally, scan every unique same-site page for Google Fonts references.
for i in complete_list:
    check_for_google(i)