My Scrapy code in Python behaves inconsistently when scraping multiple URLs. Below is a screenshot where you can see that some URLs fail to scrape while others work.
Here is my code below:
import scrapy
from scrapy_splash import SplashRequest
from scrapy_spider.items import ScrapySpiderItem
# Module-level state: a page counter starting at zero and an initially
# empty list that collects the URLs to crawl.
pages = 0
urllist = list()
class SystemspiderSpider(scrapy.Spider):
    """Spider that renders Systembolaget wine search pages through Splash.

    Every search result page from 1 to ``pages`` is requested through
    Splash's ``render.html`` endpoint, and one item is yielded for each
    product card matched in the rendered HTML.
    """

    name = "systemspider"

    # Narrower example query (rosé wines only), kept for reference:
    # start_urls = ["https://www.systembolaget.se/sok/?categoryLevel1=Vin&categoryLevel2=Ros%C3%A9vin&page=1"]

    # Total number of result pages to request (pages 1..667 inclusive).
    pages = 667

    # Seconds Splash waits after page load before returning the HTML.
    # NOTE(review): without a wait, Splash can return the page before its
    # JavaScript has filled in the product list, which presumably explains
    # why some URLs yield nothing -- tune this value against the real site.
    splash_wait = 2

    # Built once, without touching any module-level state.
    start_urls = [
        f"https://www.systembolaget.se/sok/?categoryLevel1=Vin&page={page}"
        for page in range(1, pages + 1)
    ]

    def start_requests(self):
        """Yield one Splash-rendered request for every URL in ``start_urls``."""
        for url in self.start_urls:
            yield SplashRequest(
                url=url,
                callback=self.parse,
                endpoint='render.html',
                args={'wait': self.splash_wait},
            )

    # Selector note from the original author:
    # div > div > a > div.css-6ktubd > div.css-1rfq2pv > div.css-1mhygto > div.css-1eo7gke > div.css-1iyoj2o > h4
    def parse(self, response):
        """Yield one ``ScrapySpiderItem`` per product card in ``response``.

        Each item stores the text of the card's ``span.css-uiubfo`` element
        under ``"author"`` and the page URL under ``"quote"``.
        """
        self.logger.debug("Response")
        if response.css('#captchacharacters').extract_first():
            self.logger.warning("-------------------------------------Captcha found------------------------------------------------------------")

        cards = response.css("div > div > a > div.css-6ktubd > div.css-1rfq2pv > div.css-1mhygto > div.css-1eo7gke > div.css-1iyoj2o")
        if not cards:
            # Makes pages that rendered without any products visible in the log.
            self.logger.warning("No product cards found on %s", response.url)

        for card in cards:
            item = ScrapySpiderItem()
            item["author"] = card.css("div > h3 > span.css-uiubfo::text").extract_first()
            item["quote"] = response.url
            yield item
Aucun commentaire:
Enregistrer un commentaire