samedi 25 juillet 2020

I am having issue in range 10 because of which all pages are not scrapped Please help me to correct the code so that all pages can be scrapped?

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

if __name__ == '__main__':
    today = datetime.date.today()
    days_to_scrap = 365 * 5  # 5 years...

    for i in range(1, days_to_scrap):
        time.sleep(5)
        scrap_date = today - datetime.timedelta(days=i)
        response = 'https://www.brecorder.com/markets/stocks/' + scrap_date.strftime('%Y-%m-%d')
        r = requests.get(response)

        if r.status_code == 200:
            html = r.text
            soup = BeautifulSoup(html, 'lxml')

        news_date = soup.find_all("span", {"class": "story__time"})

        news_title = soup.find_all("a", {"class": "story__link"})
        # news_title = soup.find_all("div",{"class":"story  cat-0 group-0 position-14 sub-story clearfix"})

        # news_description = soup.find_all("p", {"class": "excerpt"})
        date_list = []
        title_list = []
        # description_list = []
        # print("Hello world")
        for date in news_date:
            date_list.append(date.text.replace('Published', ' ').replace('Updated',' '))
        for title in news_title:
            title_list.append(title.text)
        # for description in news_description:
        #     description_list.append(description.text)
        for i in range(10):
            print("Date: ", date_list[i])
            print("Title: ", title_list[i])
            # print("Description: ", description_list[i])
            print()

        s1 = pd.Series(date_list[0:10], name='News Date')
        s2 = pd.Series(title_list[0:10], name='News Heading')
        # s3 = pd.Series(description_list[0:10], name='News Description')
        df = pd.concat([s1, s2], axis=1)
        df = df.fillna("None")
        # d = {'Phone name': phone_name_list, 'Price per month': price_per_month_list,'Interest List': interest_list, 'Total Price List': total_price_list,'Unlimited Offer List': unlimited_offer_list}
        # df = pd.DataFrame(data=d)
        print(df)
        df.to_csv('br1.csv', mode='a', encoding='utf-8')

This is the news scrapper of business recorder and the problem is that as I have given range 10 but in the website, few pages have different size of news in the pages such as 10, 20, 4, 15, 2 news in a page so how can I make a code that can scrap all the news that are in a certain page??




Aucun commentaire:

Enregistrer un commentaire