mercredi 21 octobre 2020

Asyncio skipping added tasks

I am attempting to create a web crawler using the asyncio, aiohttp, and BeautifulSoup libraries that finds a specific download URL on a webpage and downloads its contents. Currently, I am appending around 1000 tasks to the task queue but only receiving around 500 outputs. Each time I run the file, I receive a different number of outputs, which makes me believe it has something to do with asyncio skipping some of the tasks. Furthermore, all download URLs appended to the list of download URLs (mcif_urls) seem to be downloaded by my download_data and write_to_file functions. Thus, it seems as if the issue might be stemming from my identify_structures function. I don't have much experience with these libraries, so I am probably just missing something, but does anyone have any ideas as to why this is occurring?

# -*- coding: utf-8 -*-
"""
Created on Sun Oct 18 10:11:19 2020

@author: hgheiberger
"""

import asyncio
import time
import aiohttp
import nest_asyncio
import requests
from bs4 import BeautifulSoup

# HTTP headers sent with the blocking homepage request in batch_indexes().
headers = {"Accept-Language": "en-US, en;q=0.5"}
# Structure index values to crawl: seeded with a few entries here and
# extended at runtime by batch_indexes(). A set, so duplicates collapse.
indexes = {"2.1.1", "3.4", "0.4", "0.5", "0.1"} 
# Mcif download links: seeded with one URL here and appended to by
# identify_structures() each time it finds a link.
mcif_urls = ["http://webbdcrista1.ehu.es/magndata/tmp/0.409_TmNi.mcif"]


# Patches asyncio so the script can be run from an IDE console
# (presumably one that already has an event loop running -- confirm).
nest_asyncio.apply()

def batch_indexes():
    """
    Collect structure index values from the MAGNDATA homepage.

    Every anchor whose href contains "index=" contributes one value (the
    href with "?index=" removed) to the module-level ``indexes`` set.

    Returns
    -------
    None.

    """

    # Fetch the database homepage with a blocking HTTP GET request
    homepage = requests.get(
        "http://webbdcrista1.ehu.es/magndata/index.php?show_db=1",
        headers=headers,
        timeout=10.00,
        allow_redirects=True,
    )

    # Parse the received HTML content
    soup = BeautifulSoup(homepage.text, "lxml")

    # Stringify the href of every anchor (a missing href becomes "None",
    # which never matches the filter below)
    hrefs = (str(anchor.get('href')) for anchor in soup.find_all('a'))

    # Keep only hrefs carrying an index value and store the bare index
    indexes.update(
        href.replace("?index=", "") for href in hrefs if "index=" in href
    )


async def identify_structures(structure_index: str):
    """
    Scrapes an individual structure database entry for its mcif download link

    The first anchor whose text contains "mcif" and that carries an href is
    used. The resulting absolute URL is appended to the module-level
    ``mcif_urls`` list and returned.

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure

    Returns
    -------
    link : str or None
        Mcif download link of individual magnetic structure, or None when
        the entry page contains no usable mcif link

    Raises
    ------
    aiohttp.ClientResponseError
        If the server answers the entry page request with an HTTP error
        status (4xx/5xx)

    """

    base_url = "http://webbdcrista1.ehu.es/magndata/"
    url = f"{base_url}index_incomm.php?index={structure_index}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Surface HTTP errors instead of parsing an error page and
            # silently finding no link in it
            resp.raise_for_status()
            page = await resp.text()

    # Parsing needs only the text, so the connection is already released
    parsed_page = BeautifulSoup(page, "lxml")

    # Finds and appends the first mcif download link
    for anchor in parsed_page.find_all('a'):
        href = anchor.get("href")
        # An anchor without an href cannot be turned into a URL; skip it
        # rather than failing on the string concatenation below
        if href and "mcif" in anchor.text:
            mcif_link = base_url + href
            mcif_urls.append(mcif_link)
            return mcif_link

    # No mcif link on this page; callers must handle the None result
    return None

async def download_data(structure_index: str, link: str):
    """
    Downloads the content behind an individual mcif download link

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure (not used by
        the download itself; kept for a uniform task interface)
    link : str
        Mcif download link of individual magnetic structure

    Returns
    -------
    file_data : bytes
        Mcif file data

    Raises
    ------
    aiohttp.ClientResponseError
        If the server answers the download request with an HTTP error
        status (4xx/5xx)

    """

    # Same overall limit as before (1000 * 60 seconds), expressed as a
    # ClientTimeout because passing a bare number is deprecated in aiohttp
    request_timeout = aiohttp.ClientTimeout(total=1000 * 60)
    async with aiohttp.ClientSession() as session:
        async with session.get(link, timeout=request_timeout) as resp:
            # Fail loudly on HTTP errors so an error page body is never
            # handed back as if it were mcif data
            resp.raise_for_status()
            return await resp.read()


async def write_to_file(structure_index: str, file_data: bytes):
    """
    Saves downloaded mcif data to a file in the current working directory

    The file is named ``structure_<structure_index>.mcif`` and is opened in
    binary write mode, so an existing file of that name is overwritten. A
    confirmation line is printed once the data has been written.

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure
    file_data : bytes
        Mcif file data to store

    Returns
    -------
    None.

    """

    filename = f"structure_{structure_index}.mcif"
    # Plain blocking write; the function contains no await
    with open(filename, mode="wb") as output:
        output.write(file_data)
        print(f"Finished writing {filename}")
        


async def web_scrape_task(structure_index: str):
    """
    Runs the full scrape pipeline for one structure

    Looks up the mcif download link of the structure, downloads the file
    data and writes it to disk. When no download link is found, the
    structure is reported and skipped.

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure

    Returns
    -------
    None.

    """
    link = await identify_structures(structure_index)
    if link is None:
        # Without this guard, None is passed on as the download URL and
        # the task dies with an unrelated error
        print(f"No mcif download link found for structure {structure_index}")
        return

    file_data = await download_data(structure_index, link)
    await write_to_file(structure_index, file_data)


async def main():
    """
    Runs one scrape task per known structure index concurrently

    All tasks are awaited together. A task that raises does not stop the
    others; its exception is collected and reported once every task has
    finished, so failed structures are visible instead of silently missing
    from the output.

    Returns
    -------
    None.

    """
    # Snapshot the set so results can be matched back to their index
    pending_indexes = list(indexes)

    # gather() accepts coroutines directly and also copes with an empty
    # list. asyncio.wait() does neither on current Python versions and
    # never retrieves task exceptions
    results = await asyncio.gather(
        *(web_scrape_task(index) for index in pending_indexes),
        return_exceptions=True,
    )

    # With return_exceptions=True failures come back as result values
    failures = [
        (index, result)
        for index, result in zip(pending_indexes, results)
        if isinstance(result, BaseException)
    ]
    for index, error in failures:
        print(f"Structure {index} failed: {error!r}")
    if failures:
        print(f"{len(failures)} of {len(pending_indexes)} tasks failed")


if __name__ == "__main__":

    # Fill the index set from the homepage before the timed crawl starts
    batch_indexes()

    # Time only the asynchronous crawl, not the index collection above
    started_at = time.perf_counter()
    asyncio.run(main())
    elapsed = time.perf_counter() - started_at

    print(f"Execution time: {elapsed:0.2f} seconds.")



Aucun commentaire:

Enregistrer un commentaire