Monday, October 3, 2016

Why are my links not being written to my file?

import time

import requests
from bs4 import BeautifulSoup

seed_url = "http://ift.tt/1TeO2wc"
root_url = "http://ift.tt/WJGOSh"
max_limit = 5

def get_urls(seed_url):
    # Fetch the page and collect wiki links, skipping anchors, files and special pages.
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.find_all('a', href=True)
    valid_links = []
    for link in links:
        href = link['href']
        if 'wiki' in href and '.' not in href and ':' not in href and '#' not in href:
            valid_links.append(root_url + href)
    return valid_links


visited = []
# Open the output file once; every recursive call shares this handle.
file1 = open("file_crawled.txt", "w")

def crawl_dfs(seed_url, max_depth):
    # Depth-first crawl: write each unvisited link, then recurse one level deeper.
    if max_depth <= 0:
        return
    children = get_urls(seed_url)
    for child in children:
        if child not in visited:
            file1.write(child + "\n")  # newline keeps one link per line
            time.sleep(1)
            visited.append(child)
            crawl_dfs(child, max_depth - 1)

visited.append(root_url)
crawl_dfs(seed_url, max_limit)
file1.close()

This is DFS crawling using Python 3.6. Please help me with the code and correct me where I am wrong: my crawled links are not being written to the file file_crawled.txt (file1). I don't know why; I have tried everything at my end.
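If recursion depth ever becomes a concern on a large wiki, the same crawl can also be written iteratively with an explicit stack. The sketch below is only an illustration of that idea; it reuses get_urls, root_url, seed_url, and max_limit from the code above, and the (url, remaining depth) pairing on the stack is a hypothetical way of tracking levels, not something from the original question.

def crawl_dfs_iterative(seed_url, max_depth, out_path="file_crawled.txt"):
    # Same depth-first idea as the recursive version, but with an explicit stack,
    # so a deep wiki cannot exhaust Python's recursion limit.
    visited = {root_url}
    stack = [(seed_url, max_depth)]        # each entry: (url, remaining depth)
    with open(out_path, "w") as out_file:  # opened once for the whole crawl
        while stack:
            url, depth = stack.pop()
            if depth <= 0:
                continue
            for child in get_urls(url):
                if child not in visited:
                    out_file.write(child + "\n")
                    visited.add(child)
                    stack.append((child, depth - 1))
            time.sleep(1)                  # be polite between page fetches

crawl_dfs_iterative(seed_url, max_limit)

Either way, the key design choice is to open the output file once for the whole crawl: opening it in "w" mode inside every recursive call truncates whatever was written before, which matches the symptom of an empty output file described above.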



