Tuesday, August 27, 2019

Bizarre, randomly timed scraping errors

So I've got a functioning Scrapy web crawler that searches a given URL ("amazon.ca" + a SKU read from a CSV) and then returns the title, the image URL from the second-level page, etc... but it stops working after 300 or so URL crawls, giving this error in the terminal output:

 TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
TypeError: Request url must be str or unicode, got NoneType:

I know I'm doing some pretty ad-hoc stuff to narrow in on my desired content before saving it as items, but the real issue here is that the crawl stops working, seemingly at random, after a certain number of requests.
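
As far as I can tell, that TypeError comes straight out of the scrapy.Request constructor whenever the url it receives is None, which is exactly what extract_first() returns when a selector matches nothing (for instance, if Amazon serves a robot-check page instead of a normal one). A two-line reproduction of the message:

import scrapy

url = None  # what extract_first() hands back when a selector finds nothing
scrapy.Request(url)  # TypeError: Request url must be str or unicode, got NoneType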

Here's my crawler:

import scrapy
import csv
from scrapy2.items import Scrapy2Item
  
class spider1(scrapy.Spider):
    name = "spider1"
    domain = "https://www.amazon.ca/s?k="

    with open("C:/Users/Tyler/Desktop/scraper/scrapy2/spiders/csv/input.csv", newline="") as csvfile:
        skureader = csv.reader(csvfile, delimiter=' ', quotechar='|')

        sku_list = []

        for row in skureader:
            sku_list.append(''.join(row))

    def start_requests(self):
        for url in self.sku_list:
            yield scrapy.Request(url=spider1.domain+url, callback = self.parse)

    custom_settings = {
        'DEPTH_LIMIT': 1
    }

    def parse(self, response):

        RESULT_SELECTOR = ".sg-col-20-of-24" + \
                          ".s-result-item" + \
                          ".sg-col-0-of-12" + \
                          ".sg-col-28-of-32" + \
                          ".sg-col-16-of-20" + \
                          ".sg-col" + \
                          ".sg-col-32-of-36" + \
                          ".sg-col-12-of-16" + \
                          ".sg-col-24-of-28"


        for dataset in response.css(RESULT_SELECTOR):

            items = Scrapy2Item()

            titlevar = dataset.css('span.a-text-normal ::text').extract_first()
            artistvar = dataset.css('span.a-size-base ::text').extract()

            skuvar = response.xpath('//meta[@name="keywords"]/@content')[0].extract()

            skuvar_split = skuvar.split(',', 1)[0]
            artistvar_split = artistvar[1]

            if any ("Sponsored" in s for s in artistvar):
                items['artist'] = "DELETE THIS"
                items['sku'] = "DELETE THIS"
                items['title'] = "DELETE THIS"
            elif any("by " in s for s in artistvar):
                items['artist'] = artistvar_split
                items['sku'] = skuvar_split
                items['title'] = titlevar
            else:
                items['artist'] = ""
                items['sku'] = skuvar_split
                items['title'] = titlevar

            itempage = response.urljoin(dataset.css('div.a-section > h2.a-size-mini > a ::attr(href)').extract_first())

            items['theurl'] = itempage

            request = scrapy.Request(itempage, callback=self.get_iteminfo)
            request.meta['items'] = items  # By calling .meta, we can pass our item object into the callback.
            yield request  # Hand the item off to get_iteminfo via the request.

    def get_iteminfo(self, response):

        items = response.meta['items']  # Get the item we passed from scrape()

        imgvar = [response.css('img#landingImage ::attr(data-old-hires)').extract_first()]
        items['image_urls'] = imgvar

        yield items
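
One spot where a None can sneak into the items is get_iteminfo: if img#landingImage has no data-old-hires attribute (or the response isn't a normal product page), extract_first() returns None and that None ends up inside image_urls, which the pipeline below turns into Requests. A more defensive sketch of just that callback, assuming everything else stays the same:

    def get_iteminfo(self, response):

        items = response.meta['items']  # Get the item we passed from parse()

        # extract_first() gives None when the selector misses; keeping None out
        # of image_urls means the images pipeline never builds a Request from it
        imgurl = response.css('img#landingImage ::attr(data-old-hires)').extract_first()
        items['image_urls'] = [imgurl] if imgurl else []

        yield items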

and then the items.py

import scrapy

class Scrapy2Item(scrapy.Item):
    theurl = scrapy.Field()
    sku = scrapy.Field()
    title = scrapy.Field()
    artist = scrapy.Field()
    image_urls = scrapy.Field()

and then the pipelines.py:

import scrapy
from scrapy.pipelines.images import ImagesPipeline

import csv

class Scrapy2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [scrapy.Request(x, meta={'image_name': item['sku']})
                for x in item.get('image_urls', [])]

    # save under IMAGES_STORE using the sku-based name attached to the request above
    def file_path(self, request, response=None, info=None):
        return '%s.jpg' % request.meta['image_name']


def write_to_csv(item):
   writer = csv.writer(open('C:/Users/Tyler/Desktop/scraper/scrapy2/spiders/csv/output.csv', 'a'), lineterminator='\n')
   writer.writerow([item[sku] for sku in item.keys()])

class WriteToCsv(object):

    def process_item(self, item, info):
        write_to_csv(item)
        return item
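
If I'm reading the traceback right, get_media_requests is the other place where a None URL would blow up with exactly this message, since it builds a scrapy.Request for every entry in image_urls. A filtered version, as a sketch on top of the pipeline above:

class Scrapy2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # only build image requests for actual URL strings; a None entry
        # (selector missed) would raise the "got NoneType" TypeError right here
        return [scrapy.Request(x, meta={'image_name': item['sku']})
                for x in item.get('image_urls', []) if x]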

and lastly, the settings.py

BOT_NAME = 'scrapy2'

SPIDER_MODULES = ['scrapy2.spiders']
NEWSPIDER_MODULE = 'scrapy2.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy2 (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   # 'scrapy.pipelines.images.ImagesPipeline' : 1,
   'scrapy2.pipelines.Scrapy2Pipeline': 100,
   'scrapy2.pipelines.WriteToCsv': 200,
}

IMAGES_STORE = 'C:/Users/Tyler/Desktop/scraper/scrapy2/spiders/images'
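
For what it's worth, the ~300-request cutoff makes me wonder whether Amazon starts serving pages my selectors can't match once the crawl looks bot-like, so slowing things down is something I plan to try. A sketch of the throttling settings (the values are guesses, not tested):

# slow the crawl down so the site is less likely to swap in robot-check pages
DOWNLOAD_DELAY = 2
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
CONCURRENT_REQUESTS_PER_DOMAIN = 1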


