mardi 28 novembre 2017

Looping through Multiple pages while scraping with python

I am trying to parse multiple page on IDMb. The parser is stuck on gathering information from one page. I have tried many forums to solve this to no avail. I suspect it has something to do with not setting up my embedded loop correctly or my initial request. Please help. Thanks.

Problems with this script: Loops on one page.

#Basic libraries

from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from random import randint

#More advanced libraries
from time import sleep
from time import time
from IPython.core.display import clear_output
from warnings import warn

base_url = 'http://ift.tt/2hZ3Usi'
response = get(base_url)

soup = BeautifulSoup(response.text, 'lxml')

#data lists to append

names           = []
years           = []
imdb_ratings    = []
metascores      = []
votes           = []

#preparing the monitoring loop

pages =  str(range(1,5))

start_time = time()
requests = 0

#for every page in the interval 1-4
for url in urls:

#make a get request
    response = get(base_url + page)

#pause the loop
    sleep(randint(8,15))

#Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn:('Number of requests was greater than expected.')
        break

    elif response.status_code != 200:
        warn('Request: {}; Frequency: {} requests/s'.format(requests, response.status_code))

    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

    page_html = BeautifulSoup(response.text, 'lxml')  

#root
    movie_containers = soup.find_all('div', class_= 'lister-item mode-advanced')

#looping through containers
for container in movie_containers:
    if container.find('div', class_ = 'ratings-metascore') is not None:

        #The name
        name = container.h3.a.text
        #print(name)
        names.append(name)

        #The Year
        year = container.find('span', class_ = 'lister-item-year').text
        #print(year)
        years.append(year)

        #IDMb rating             
        imdb = container.strong.text
        #print(imdb)
        imdb_ratings.append(imdb)

        #Metascore              
        metascore = container.find('span', class_= 'metascore').text
        #print(metascore)
        metascores.append(int(metascore))

        #Number of Votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        #print(vote)
        votes.append(int(vote))


#keeping track of data        
test_df= pd.DataFrame({'Movie': names,
                        'Year': years,
                        'IMDb': imdb_ratings,
                        'Metascore': metascores,
                        'Votes': votes})
print(test_df.info())
test_df

web

mardi 28 novembre 2017

Looping through Multiple pages while scraping with python

Problems with this script: Loops on one page.

Aucun commentaire:

Enregistrer un commentaire