Thanks for looking into my problem. I built this web crawler, and all I want it to do is crawl the homepage of a given website, gather every link on that page, visit each of those links in turn, and finally write everything it visited into a crawled.txt file. What it actually does when run is visit only the homepage: it never gathers the links from it or visits them, so crawled.txt ends up containing just the homepage and none of the site's other links. There are five files to this crawler; they are listed in full below, after a short sketch of the behaviour I'm after.
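To make the intended behaviour concrete, here is a rough standalone illustration of the flow (the fake_site dict and the get_links stub are made up purely for this example; this is not code from my project):

fake_site = {
    'https://example.com/':      {'https://example.com/about', 'https://example.com/contact'},
    'https://example.com/about': {'https://example.com/'},
}

def get_links(url):                  # stands in for "download the page and extract every <a href>"
    return fake_site.get(url, set())

to_visit, crawled = {'https://example.com/'}, set()
while to_visit:
    url = to_visit.pop()                    # take any not-yet-visited page
    crawled.add(url)
    to_visit |= get_links(url) - crawled    # queue its links, minus pages already crawled

with open('crawled.txt', 'w') as f:         # every visited page should end up here
    f.write('\n'.join(sorted(crawled)))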
main.py
import queue
import threading
from queue import Queue
from Spider import spider
from SetupCrawler import *
from domain import *
PROJECT_NAME = input('Enter Project Name ')
HOMEPAGE = input('Enter HomePage ')
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME+'/queue.txt'
CRAWLED_FILE = PROJECT_NAME+'/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
spider(PROJECT_NAME,HOMEPAGE,DOMAIN_NAME)
SetupCrawler.py
import os
from os import write

LinkDir = 'LinkDirectory'

def create_project_dir(dire):
    if not os.path.exists(dire):
        print('Creating Project...', dire)
        os.makedirs(dire)

def write_file(file_name, data):
    f = open(file_name, 'w')
    f.write(data)
    f.close()

# queue and crawled files
def create_data_files(project_name, seedLink):
    queue = project_name + '/queue.txt'
    crawled = project_name + '/crawled.txt'
    if not os.path.isfile(queue):
        write_file(queue, seedLink)
    if not os.path.isfile(crawled):
        write_file(crawled, '')

# add data to an existing file
def appendToFile(path, data):
    open(path, 'a').write(data + '\n')

# delete the contents of a file
def deleteFileContents(path):
    with open(path, 'w'):
        pass

# read a file and turn each line into an item in a set
def file_to_set(file_name):
    results = set()
    file = open(file_name, 'rt')
    for line in file:
        results.add(line.replace('\n', ''))
    return results

# iterate through a set; each item becomes a new line in the file
def set_to_file(links, file):
    deleteFileContents(file)
    for link in sorted(links):  # sorting adds a little extra time
        appendToFile(file, link)
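These helpers just mirror a set of URLs to and from a plain text file, one URL per line. A quick round-trip illustration (the file name and URLs are made up):

from SetupCrawler import set_to_file, file_to_set

links = {'https://example.com/a', 'https://example.com/b'}
set_to_file(links, 'demo.txt')             # writes one link per line, sorted
print(file_to_set('demo.txt') == links)    # True: the round trip preserves the set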
Spider.py
from os import link, set_inheritable
from urllib.request import urlopen
from linkFinder import LinkFinder
from SetupCrawler import *

class spider:
    # class vars / shared across all instances
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        spider.domain_name = domain_name
        spider.base_url = base_url
        spider.project_name = project_name
        spider.queue_file = spider.project_name + '/queue.txt'
        spider.crawled_file = spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First Spider', spider.base_url)

    @staticmethod
    def boot():
        create_project_dir(spider.project_name)
        create_data_files(spider.project_name, spider.base_url)
        spider.queue = file_to_set(spider.queue_file)
        spider.crawled = file_to_set(spider.crawled_file)

    @staticmethod
    def update_files():
        set_to_file(spider.queue, spider.queue_file)
        set_to_file(spider.crawled, spider.crawled_file)

    @staticmethod
    def add_Links_To_Queue(links):
        for url in links:
            if url in spider.queue:
                continue
            if url in spider.crawled:
                continue
            if spider.domain_name not in url:
                # crawl only the designated site, not the entire web: a page may carry
                # social links (Google, Facebook, Instagram), and without this check the
                # crawler would wander off to those sites as well
                continue
            spider.queue.add(url)

    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            respnse = urlopen(page_url)
            if respnse.getheader('Content-Type') == 'text/html':
                html_bytes = respnse.read()
                html_string = html_bytes.decode('utf-8')
            finder = LinkFinder(spider.base_url, page_url)
            finder.feed(html_string)
        except:
            print('Error can not crawl page')
            return set()
        return finder.page_links()

    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue : ' + str(len(spider.queue)) + ' | ' + ' Crawled : ' + str(len(spider.crawled)))
            spider.add_Links_To_Queue(spider.gather_links(page_url))
            spider.queue.remove(page_url)
            spider.crawled.add(page_url)
            spider.update_files()
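Because everything on spider is a class attribute used from @staticmethods, the state is shared rather than per-instance: constructing the class once crawls the homepage, and whatever links it found sit in spider.queue (and queue.txt). This is how I understand it is meant to be driven by hand (project name and URLs are placeholders):

from Spider import spider

spider('demo_project', 'https://example.com/', 'example.com')
print('crawled:', len(spider.crawled))    # 1, just the homepage so far
print('queued :', len(spider.queue))      # same-domain links found on the homepage
if spider.queue:
    # crawling one queued page by hand adds that page's links to the queue too
    spider.crawl_page('manual', next(iter(spider.queue)))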
linkFinder.py
from html.parser import HTMLParser
from urllib import parse

class LinkFinder(HTMLParser):
    def __init__(self, base_url, curr_page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = curr_page_url
        self.links = set()

    def error(self, message):  # has to be implemented when subclassing HTMLParser
        pass

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (att, val) in attrs:
                if att == 'href':
                    url = parse.urljoin(self.base_url, val)  # build an absolute URL in case the href is a relative one
                    self.links.add(url)

    def page_links(self):
        return self.links
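LinkFinder just collects the href of every <a> tag it is fed and resolves relative URLs against base_url, e.g. (HTML string made up):

from linkFinder import LinkFinder

finder = LinkFinder('https://example.com/', 'https://example.com/')
finder.feed('<a href="/about">About</a><a href="https://example.com/contact">Contact</a>')
print(finder.page_links())
# {'https://example.com/about', 'https://example.com/contact'}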
domain.py
from urllib.parse import urlparse

# get the domain name (example.com)
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except:
        return ''

# get the sub-domain name (name.example.com)
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''
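These two just pull the host out of a URL, for example:

from domain import get_domain_name, get_sub_domain_name

print(get_sub_domain_name('https://blog.example.com/some/page'))   # blog.example.com
print(get_domain_name('https://blog.example.com/some/page'))       # example.com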