vendredi 25 décembre 2020

My Scrapy code in Python is inconsistent when scraping multiple URLs

My Scrapy code in Python is inconsistent when scraping multiple URLs. Below is a screenshot where you can see that some URLs fail to scrape while others work. [enter image description here]

Here is my code below:

import scrapy
from scrapy_splash import SplashRequest
from scrapy_spider.items import ScrapySpiderItem

# Highest search-result page to request (pages are numbered from 1).
LAST_PAGE = 667

# One search-result URL per page. Each URL must be a single unbroken string:
# a space or line break after "?" would change the query that is sent.
urllist = [
    f"https://www.systembolaget.se/sok/?categoryLevel1=Vin&page={page}"
    for page in range(1, LAST_PAGE + 1)
]


class SystemspiderSpider(scrapy.Spider):
    """Crawl the paginated wine search results and yield one item per listing.

    Every page in ``start_urls`` is fetched through Splash's ``render.html``
    endpoint and handed to :meth:`parse`.
    """

    name = "systemspider"

    # Single-category alternative:
    # start_urls = ["https://www.systembolaget.se/sok/?categoryLevel1=Vin&categoryLevel2=Ros%C3%A9vin&page=1"]
    start_urls = urllist

    # Seconds Splash waits after the page has loaded before returning the HTML.
    # NOTE(review): assumes the listings are rendered client-side, so that a
    # zero wait sometimes returns a page without them -- tune as needed.
    splash_wait = 2

    def start_requests(self):
        """Yield one SplashRequest per URL in ``start_urls``."""
        for url in self.start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                endpoint='render.html',
                args={'wait': self.splash_wait},
            )

    ## div > div > a > div.css-6ktubd > div.css-1rfq2pv > div.css-1mhygto > div.css-1eo7gke > div.css-1iyoj2o > h4

    def parse(self, response):
        """Yield a ScrapySpiderItem for every listing container on the page.

        Each item stores the text of the listing's ``span.css-uiubfo`` element
        under ``"author"`` and the page URL under ``"quote"``.
        """
        print("Response")

        if response.css('#captchacharacters').extract_first():
            print("-------------------------------------Captcha found------------------------------------------------------------")

        found_any = False
        for q in response.css("div > div > a > div.css-6ktubd > div.css-1rfq2pv > div.css-1mhygto > div.css-1eo7gke > div.css-1iyoj2o"):
            found_any = True
            quote = ScrapySpiderItem()
            quote["author"] = q.css("div > h3 > span.css-uiubfo::text").extract_first()
            quote["quote"] = response.url
            yield quote

        if not found_any:
            # Make empty pages visible in the log instead of failing silently.
            self.logger.warning("No listings matched on %s", response.url)



Aucun commentaire:

Enregistrer un commentaire