import csv
import re
from collections import deque
from urllib.parse import urlsplit

import requests

# Read the seed URLs (first column of the CSV) into a list.
website_crawl = list()
with open('1-1013.csv', 'r', newline='') as f:
    mycsv = csv.reader(f)
    for row in mycsv:
        website_crawl.append(row[0])
# Work queue of URLs to visit, the set of URLs already seen,
# and the list of extracted addresses.
new_urls = deque(website_crawl)
processed_urls = set()
emails = list()
while len(new_urls):
    url = new_urls.popleft()
    processed_urls.add(url)

    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    # Directory part of the URL, kept around for resolving relative links.
    path = url[:url.rfind('/') + 1] if '/' in parts.path else url

    print("Processing %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        continue

    # Grab everything in the page body that looks like an email address.
    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
                                response.text, re.I))
    email = ','.join(new_emails).strip()
    if email:
        emails.append(email)
    else:
        emails.append(None)

    # Append this URL and whatever was found to the results file.
    with open("1-1013emails.csv", "a", newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow([url, email])
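The pattern used above is a loose heuristic, not a full RFC 5322 validator: it only requires something shaped like name@host.tld. A minimal check of what it does and does not catch (the sample strings are invented for illustration):

import re

EMAIL_RE = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", re.I)

# Invented sample text: two plausible addresses and one near-miss.
sample = "Contact sales@example.com or Support@Example.org, not me@localhost"
print(EMAIL_RE.findall(sample))
# ['sales@example.com', 'Support@Example.org'] -- 'me@localhost' lacks a dot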
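One more note: processed_urls is filled in but never consulted, so a duplicate seed URL would be fetched twice. If that matters, the usual visited-set guard (a small sketch reusing the script's own names; the example URLs are invented) looks like:

from collections import deque

new_urls = deque(["http://example.com", "http://example.com", "http://example.org"])
processed_urls = set()

while len(new_urls):
    url = new_urls.popleft()
    if url in processed_urls:
        continue  # already handled, skip it
    processed_urls.add(url)
    print("Processing %s" % url)  # example.com is printed only once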