Sunday, October 4, 2020

The web crawler I built using Python only crawls the homepage and does not follow the links on the homepage

Thanks for looking into my problem. I built this web crawler to crawl the homepage of a given website, collect every link it finds there, visit those links in turn, and finally write everything it has visited to the crawled.txt file. When I run it, however, it only visits the homepage: it never gathers and follows the links on that page, so crawled.txt ends up containing only the homepage and none of the other links on the site. The crawler consists of five files:

main.py

import threading 
from queue import Queue
from Spider import spider
from SetupCrawler import *
from domain import *

PROJECT_NAME  = input('Enter Project Name ')
HOMEPAGE = input('Enter HomePage ')
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME+'/queue.txt'
CRAWLED_FILE = PROJECT_NAME+'/crawled.txt'
NUMBER_OF_THREADS = 8

queue = Queue()
spider(PROJECT_NAME,HOMEPAGE,DOMAIN_NAME)
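
For reference, this is the kind of worker loop that the Queue and NUMBER_OF_THREADS set up above were meant for; the helper names create_workers, work and crawl below are only placeholders of mine and do not exist anywhere in the project yet:

import threading

from Spider import spider
from SetupCrawler import file_to_set

def create_workers(thread_queue, number_of_threads):
    # start daemon threads that take URLs from the in-memory queue and crawl them
    for _ in range(number_of_threads):
        t = threading.Thread(target=work, args=(thread_queue,))
        t.daemon = True
        t.start()

def work(thread_queue):
    while True:
        url = thread_queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        thread_queue.task_done()

def crawl(thread_queue, queue_file):
    # push every link currently stored in queue.txt onto the in-memory queue,
    # wait for the workers to finish, then repeat until queue.txt stays empty
    queued_links = file_to_set(queue_file)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        for link in queued_links:
            thread_queue.put(link)
        thread_queue.join()
        crawl(thread_queue, queue_file)

With helpers like these, main.py would end with create_workers(queue, NUMBER_OF_THREADS) followed by crawl(queue, QUEUE_FILE) after the spider(...) call instead of stopping there.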

SetupCrawler.py

import os

LinkDir = 'LinkDirectory'

def create_project_dir(dire):
    if not os.path.exists(dire):
        print('Creating Project...',dire)
        os.makedirs(dire)

def write_file(file_name,data):
    with open(file_name,'w') as f:
        f.write(data)


#queue and crawled files
def create_data_files(project_name,seedLink):
    queue = project_name+'/queue.txt'
    crawled = project_name+'/crawled.txt'
    if not os.path.isfile(queue):
        write_file(queue,seedLink)
    if not os.path.isfile(crawled):
        write_file(crawled,'')

#append data to an existing file
def appendToFile(path,data):
    with open(path,'a') as f:
        f.write(data+'\n')

#delete the contents of a file
def deleteFileContents(path):
    with open(path,'w'):
        pass

#read a file and turn each line into an item in a set
def file_to_set(file_name):
    results = set()
    with open(file_name,'rt') as file:
        for line in file:
            results.add(line.replace('\n',''))
    return results

#iterate through a set; each item becomes a new line in the file
def set_to_file(links,file):
    deleteFileContents(file)
    for link in sorted(links): #sorting adds overhead but keeps the file readable
        appendToFile(file,link)
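
The helpers above seem to behave as intended when exercised on their own, for example (the project name and URLs here are made up):

from SetupCrawler import *

create_project_dir('demo_project')
create_data_files('demo_project', 'https://example.com')

links = file_to_set('demo_project/queue.txt')
links.add('https://example.com/about')
set_to_file(links, 'demo_project/queue.txt')

print(file_to_set('demo_project/queue.txt'))
# {'https://example.com', 'https://example.com/about'}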



Spider.py

from urllib.request import urlopen
from linkFinder import LinkFinder
from SetupCrawler import *

class spider:
    #class vars / shared for all instances
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file =''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self,project_name,base_url,domain_name):
        spider.domain_name  = domain_name
        spider.base_url = base_url
        spider.project_name = project_name
        spider.queue_file = spider.project_name+'/queue.txt'
        spider.crawled_file = spider.project_name+'/crawled.txt'
        self.boot()
        self.crawl_page('First Spider',spider.base_url)

    @staticmethod
    def boot():
        create_project_dir(spider.project_name)
        create_data_files(spider.project_name,spider.base_url)
        spider.queue = file_to_set(spider.queue_file)
        spider.crawled = file_to_set(spider.crawled_file)
    
    @staticmethod 
    def update_files():
        set_to_file(spider.queue,spider.queue_file)
        set_to_file(spider.crawled,spider.crawled_file)


    @staticmethod
    def add_Links_To_Queue(links):
        for url in links:
            if url in spider.queue:
                continue
            if url in spider.crawled:
                continue
            if spider.domain_name not in url: # crawl only the designated site, not the entire web
                # a page might carry social links (Google, Facebook, Instagram, ...), and without
                # this check the crawler would wander off to those sites as well
                continue
            spider.queue.add(url)


    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if response.getheader('Content-Type')=='text/html':
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')
            finder = LinkFinder(spider.base_url,page_url)
            finder.feed(html_string)
        except Exception as e:
            print('Error: can not crawl page '+page_url+' ('+str(e)+')')
            return set()
        return finder.page_links()

    @staticmethod
    def crawl_page(thread_name,page_url):
        if page_url not in spider.crawled:
            print(thread_name+' now crawling '+page_url)
            print('Queue :  '+str(len(spider.queue))+' | '+' Crawled : '+str(len(spider.crawled)))
            spider.add_Links_To_Queue(spider.gather_links(page_url))
            spider.queue.remove(page_url)
            spider.crawled.add(page_url)
            spider.update_files()
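
One detail worth noting while reading gather_links above: getheader('Content-Type') frequently comes back as something like 'text/html; charset=utf-8' rather than exactly 'text/html'. A small standalone sketch of a more tolerant fetch, where fetch_html is a hypothetical helper and not part of the project:

from urllib.request import urlopen

def fetch_html(page_url):
    # return the decoded HTML only when the Content-Type header mentions text/html,
    # e.g. 'text/html' or 'text/html; charset=utf-8'
    response = urlopen(page_url)
    content_type = response.getheader('Content-Type') or ''
    if 'text/html' in content_type:
        return response.read().decode('utf-8')
    return ''

# html = fetch_html('https://example.com')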

          

linkFinder.py

from html.parser import HTMLParser
from urllib import parse

class LinkFinder(HTMLParser):

    def __init__(self,base_url,curr_page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = curr_page_url
        self.links = set()

    def error(self, message): # has to be implemented when subclassing HTMLParser
        pass
    def handle_starttag(self, tag,attrs):
        if tag=='a':
            for (att,val) in attrs:
                if att=='href':
                    url = parse.urljoin(self.base_url,val) # resolve relative URLs against the base URL; absolute URLs pass through unchanged
                    self.links.add(url)
    def page_links(self):
        return self.links
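
LinkFinder can also be checked on its own, which at least confirms that the href extraction and urljoin resolution work (the HTML snippet below is made up):

from linkFinder import LinkFinder

finder = LinkFinder('https://example.com', 'https://example.com/index.html')
finder.feed('<a href="/about">About</a> <a href="https://example.com/contact">Contact</a>')
print(finder.page_links())
# {'https://example.com/about', 'https://example.com/contact'}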
    



domain.py

from urllib.parse import urlparse


#get domain name (example.com)
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2]+'.'+results[-1]
    except:
        return ''

# get sub-domain name (name.example.com)
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''
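
For reference, this is what the two helpers return for a typical URL (the URL below is made up):

from domain import get_domain_name, get_sub_domain_name

print(get_sub_domain_name('https://blog.example.com/some/page'))  # blog.example.com
print(get_domain_name('https://blog.example.com/some/page'))      # example.com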





