jeudi 23 mai 2019

Web Crawling : Search Words

I'm preparing a Web Crawler.

More precisely, it is about indexing (= taking all the words) of a main page, and finding all the links of this page, to then look for the words (of the main page) in the links extracted.

My problem is in the function indexe (): when I try to recover all the secondary pages, after checking that the word is not in the stoplist (= pronouns, articles, ... etc) and that these words are in the secondary pages. So that I can search all the words (from the main page) in the secondary pages

Here's what I did for now:

import requests 
from bs4 import BeautifulSoup

def extract(links) : 
    page = requests.get(links).text
    soup = BeautifulSoup(page)
    for link in soup.find_all('a', href=True):
        print(link['href'])

def clean_html(page) :
   cleanr = re.compile('<.*?>')
   cleantext = re.sub(cleanr, '', page)
   return cleantext 

def indexer(dex, words, url):
    for x in url : 
        x = requests.get(x).text
        x = clean_html(x)
        x = x.lower()
        x = x.split()                 

    for word in words:
        word = word.lower()
        word = clean(word)

        if word not in stoplist :
           if word in x : 
            # print(x) THE PROBLEM: I'm trying to retrieve the secondary pages, but I get only the last link, (36 times)
            add(dex, word, url)

def add(dex, word, url):
    try:
        dex[word].append(url)
    except KeyError:
        dex[word] = [url]


def main(url, idx) :     
    list_urls = extract(url) 
    main_page = requests.get(url).text
    main_page = clean_html(main_page)
    main_page = main_page.split()

    idx = {}
    indexe(idx, main_page, list_urls)
    prd(idx)

def prd(d) :  
   for c in sorted(d) : 
         print( '\t', c, ':', d[c])


stoplist = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

I want to find the words in the secondary pages, hopefully the output will be something like this

Word1 : [url1, url2]
Word2 : [url1, url3, ...]
... 




Aucun commentaire:

Enregistrer un commentaire