lundi 26 octobre 2015

Web Scraping: The code returns an error Toomanyredirects: exceeded 30 redirects

website_crawl = list()

with open('1-1013.csv', 'rb') as f: mycsv = csv.reader(f) for row in mycsv: website_crawl.append(row[0]) f.close()

new_urls = deque(website_crawl)

processed_urls = set()

emails = list()

while len(new_urls):

url = new_urls.popleft()
processed_urls.add(url)

parts = urlsplit(url)
base_url = "{0.scheme}://{0.netloc}".format(parts)
path = url[:url.rfind('/')+1] if '/' in parts.path else url

print("Processing %s" % url)
try:
    response = requests.get(url)
except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
    continue

new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.['a-z','.com','.org']+", response.text, re.I))

email = ','.join(list(new_emails))
email = email.strip()
if email:
    emails.append(email)
else:
    emails.append(None)

line = list()
out_file = open("1-1013emails.csv", "ab")
writer = csv.writer(out_file)
line.append(url)
line.append(email)
writer.writerow(line)
out_file.close()




Aucun commentaire:

Enregistrer un commentaire