I am attempting to create a web crawler using the asyncio, aiohttp, and BeautifulSoup libraries that finds a specific download URL on a webpage and downloads its contents. Currently, I am appending around 1000 tasks to the task queue but only receiving around 500 outputs. Each time I run the file, I receive a different number of outputs, which makes me believe that asyncio is skipping some of the tasks. Furthermore, every download URL appended to the list of download URLs (mcif_urls) seems to be downloaded and written correctly by my download_data and write_to_file functions. Thus, the issue appears to stem from my identify_structures function. I don't have much experience with these libraries, so I am probably just missing something, but does anyone have any ideas as to why this is occurring?
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 18 10:11:19 2020
@author: hgheiberger
"""
import asyncio
import time
import aiohttp
import nest_asyncio
import requests
from bs4 import BeautifulSoup
# HTTP headers sent with the synchronous homepage request made in batch_indexes
headers = {"Accept-Language": "en-US, en;q=0.5"}
# Structure indexes to scrape; seeded with a few values and extended by batch_indexes
indexes = {"2.1.1", "3.4", "0.4", "0.5", "0.1"}
# Collects every mcif download URL found by identify_structures (pre-seeded with one URL)
mcif_urls = ["http://webbdcrista1.ehu.es/magndata/tmp/0.409_TmNi.mcif"]
# Adds asyncio support for IDEs: lets asyncio.run be called even when the
# IDE console already has an event loop running
nest_asyncio.apply()
def batch_indexes():
    """
    Collects mcif structure index values from the MAGNDATA homepage

    Fetches the database listing page with a blocking GET request, inspects
    the href of every anchor tag and adds the index taken from each href that
    contains "index=" to the module-level ``indexes`` set.

    Returns
    -------
    None.

    """
    # Download the database homepage (synchronous request)
    homepage_url = "http://webbdcrista1.ehu.es/magndata/index.php?show_db=1"
    response = requests.get(
        homepage_url, headers=headers, timeout=10.00, allow_redirects=True
    )

    # Parse the HTML and stringify each anchor's href (a missing href becomes "None")
    soup = BeautifulSoup(response.text, "lxml")
    hrefs = (str(anchor.get("href")) for anchor in soup.find_all("a"))

    # Keep only index links, removing "?index=" so the bare index value remains
    indexes.update(
        href.replace("?index=", "") for href in hrefs if "index=" in href
    )
async def identify_structures(structure_index: str):
    """
    Scrapes an individual structure database entry for its mcif download link

    The first anchor whose text contains "mcif" and that carries an href is
    used: its absolute URL is appended to the module-level ``mcif_urls`` list
    and returned.

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure

    Returns
    -------
    link : str or None
        Mcif download link of individual magnetic structure, or None when the
        entry page contains no usable mcif link

    """
    base_url = "http://webbdcrista1.ehu.es/magndata/"
    url = f"{base_url}index_incomm.php?index={structure_index}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            page = await resp.text()

    # Parsing needs only the page text, so it happens after the connection is released
    parsed_page = BeautifulSoup(page, "lxml")

    # Finds and appends the first mcif download link
    for anchor in parsed_page.find_all("a"):
        if "mcif" not in anchor.text:
            continue
        href = anchor.get("href")
        if href is None:
            # An anchor without an href cannot be downloaded; keep looking
            continue
        link = base_url + href
        mcif_urls.append(link)
        return link

    # No mcif link on this page: say so explicitly so the caller can skip the
    # structure instead of passing None on as a download URL
    return None
async def download_data(structure_index: str, link: str):
    """
    Downloads the file behind an mcif download link and returns its raw bytes

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure (not used by
        the download itself)
    link : str
        Mcif download link of individual magnetic structure

    Returns
    -------
    file_data : bytes
        Mcif file data

    """
    # NOTE(review): the timeout is a bare number, which aiohttp reads as
    # seconds; 1000*60 may have been meant as 60 s in milliseconds - confirm
    async with aiohttp.ClientSession() as http_session:
        async with http_session.get(link, timeout=1000*60) as response:
            return await response.read()
async def write_to_file(structure_index: str, file_data: bytes):
    """
    Saves mcif file data to ``structure_<structure_index>.mcif``

    The file is created (or overwritten) in binary mode, with the name taken
    relative to the current working directory, and a confirmation line is
    printed once the data has been written.

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure
    file_data : bytes
        Raw contents to store in the file

    Returns
    -------
    None.

    """
    filename = f"structure_{structure_index}.mcif"

    # Binary mode: the bytes are stored exactly as received
    with open(filename, "wb") as output_handle:
        output_handle.write(file_data)

    print(f"Finished writing {filename}")
async def web_scrape_task(structure_index: str):
    """
    Runs the scrape pipeline for one structure: find link, download, save

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure

    Returns
    -------
    None.

    """
    link = await identify_structures(structure_index)
    if link is None:
        # identify_structures yields None when the entry page has no mcif
        # link; passing None on as a download URL would make the request
        # fail, so report the structure and skip it instead
        print(f"No mcif link found for structure {structure_index}; skipping")
        return

    file_data = await download_data(structure_index, link)
    await write_to_file(structure_index, file_data)
async def main():
    """
    Runs one scrape task per structure index concurrently and reports failures

    Every index in the module-level ``indexes`` set gets its own
    ``web_scrape_task``. A task that raises does not stop the others; each
    failure is printed together with the index it belongs to.

    Returns
    -------
    None.

    """
    # Snapshot the set so results can be paired with their indexes by position
    pending = list(indexes)
    if not pending:
        return

    # gather accepts bare coroutines (asyncio.wait rejects them from Python
    # 3.11 on) and, with return_exceptions=True, hands back every exception
    # instead of leaving it unretrieved on a task
    results = await asyncio.gather(
        *(web_scrape_task(index) for index in pending),
        return_exceptions=True,
    )

    failures = [
        (index, result)
        for index, result in zip(pending, results)
        if isinstance(result, BaseException)
    ]
    for index, error in failures:
        print(f"Structure {index} failed: {error!r}")
    if failures:
        print(f"{len(failures)} of {len(pending)} structures failed.")
if __name__ == "__main__":
    # Gather the structure indexes first (blocking request, not timed)
    batch_indexes()

    # Time only the asynchronous scrape/download phase
    started_at = time.perf_counter()
    asyncio.run(main())
    elapsed = time.perf_counter() - started_at

    print(f"Execution time: {elapsed:0.2f} seconds.")
Aucun commentaire:
Enregistrer un commentaire