I wrote a spider to crawl a non-English website. To avoid being banned, I changed the AUTOTHROTTLE settings and used a custom rotating user-agent middleware. But when I ran the spider, I got an error for which I couldn't find a related question. This is the log I got:
2020-08-23 10:12:11 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: digikala)
2020-08-23 10:12:11 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 20.3.0, Python 3.8.5 (default, Aug 5 2020, 09:44:06) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.19041-SP0
2020-08-23 10:12:11 [scrapy.crawler] INFO: Overridden settings: {'AUTOTHROTTLE_MAX_DELAY': 120, 'AUTOTHROTTLE_START_DELAY': 60, 'BOT_NAME': 'digikala', 'FEED_FORMAT': 'csv', 'FEED_URI': 'dataset2.csv', 'HTTPCACHE_ENABLED': True, 'NEWSPIDER_MODULE': 'digikala.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['digikala.spiders']}
2020-08-23 10:12:11 [scrapy.extensions.telnet] INFO: Telnet Password: 3f0f7f745956e306
2020-08-23 10:12:11 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats']
Unhandled error in Deferred:
2020-08-23 10:12:11 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\crawler.py", line 172, in crawl
return self._crawl(crawler, *args, **kwargs)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\crawler.py", line 176, in _crawl
d = crawler.crawl(*args, **kwargs)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\twisted\internet\defer.py", line 1613, in unwindGenerator
return _cancellableInlineCallbacks(gen)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\twisted\internet\defer.py", line 1529, in _cancellableInlineCallbacks
_inlineCallbacks(None, g, status)
--- <exception caught here> ---
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\core\engine.py", line 69, in __init__
self.downloader = downloader_cls(crawler)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\core\downloader\__init__.py", line 88, in __init__
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\middleware.py", line 34, in from_settings
mwcls = load_object(clspath)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\utils\misc.py", line 44, in load_object
mod = import_module(module)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\importlib\__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 961, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 973, in _find_and_load_unlocked
builtins.ModuleNotFoundError: No module named 'demo_crawl'
2020-08-23 10:12:11 [twisted] CRITICAL:
Traceback (most recent call last):
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\twisted\internet\defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\core\engine.py", line 69, in __init__
self.downloader = downloader_cls(crawler)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\core\downloader\__init__.py", line 88, in __init__
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\middleware.py", line 34, in from_settings
mwcls = load_object(clspath)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\utils\misc.py", line 44, in load_object
mod = import_module(module)
File "C:\Users\shima\anaconda3\envs\virtual_workspace\lib\importlib\__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 961, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 973, in _find_and_load_unlocked
ModuleNotFoundError: No module named 'demo_crawl'
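From the traceback, the error seems to be raised while Scrapy imports the dotted paths listed in DOWNLOADER_MIDDLEWARES (load_object calls import_module on the module part of each path). As far as I understand it, the lookup works roughly like this (a simplified sketch, not Scrapy's actual implementation):

# Simplified sketch of how I understand scrapy.utils.misc.load_object
# to resolve each key of DOWNLOADER_MIDDLEWARES (illustration only):
from importlib import import_module

def load_object_sketch(path):
    # e.g. 'demo_crawl.middlewares', '.', 'UserAgentRotatorMiidleware'
    module_path, _, name = path.rpartition('.')
    # raises ModuleNotFoundError if the module part cannot be imported
    module = import_module(module_path)
    return getattr(module, name)

# load_object_sketch('demo_crawl.middlewares.UserAgentRotatorMiidleware')
# -> ModuleNotFoundError: No module named 'demo_crawl'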
This is my spider code:
# -*- coding: utf-8 -*-
import scrapy
import logging
import urllib.parse

# Percent-encode the non-ASCII (Persian) path of a URL so it can be used in a request.
parts = urllib.parse.urlsplit(u'http://fa.wikipedia.org/wiki/صفحهٔ_اصلی')
parts = parts._replace(path=urllib.parse.quote(parts.path.encode('utf8')))
encoded_url = parts.geturl().encode('ascii')
# The path ends up percent-encoded, e.g.:
# 'https://fa.wikipedia.org/wiki/%D8%B5%D9%81%D8%AD%D9%87%D9%94_%D8%A7%D8%B5%D9%84%DB%8C'
class PromotionsSpider(scrapy.Spider):
    name = 'promotions'
    allowed_domains = ['www.digikala.com']

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.digikala.com/search/category-book/?type[0]=4844&promotion_types[0]=incredible_offer&promotion_types[1]=promotion&pageno=1&last_filter=promotion_types&last_value=incredible_offer&sortby=4',
            callback=self.parse)

    def parse(self, response):
        for product in response.xpath("//ul[@class='c-listing__items']/li"):
            title = product.xpath(".//a[@class='js-product-url']/text()").get()
            star = float(str(product.xpath(".//div[@class='c-product-box__engagement-rating']/text()").get()))
            discounted_percent = int(str(product.xpath(".//div[@class='c-price__discount-oval']/span/text()").get().strip()).replace('٪', ''))
            discounted_price = int(str(product.xpath(".//div[@class='c-price__value-wrapper']/text()").get().strip()).replace(',', ''))
            original_price = int(str(product.xpath(".//div[@class='c-price__value c-price__value--plp']/del/text()").get().strip()).replace(',', ''))
            url = response.urljoin(product.xpath(".//a[@class='js-product-url']/@href").get())
            discounted_amount = original_price - discounted_price

            if star >= 3.5 and (discounted_amount >= 5000 or discounted_percent >= 10):
                yield {
                    'title': title,
                    'star': star,
                    'discounted_percent': discounted_percent,
                    'discounted_price': discounted_price,
                    'original_price': original_price,
                    'discounted_amount': discounted_amount,
                    'url': url
                }

        next_page = response.xpath('//*[@class="c-pager__item is-active"]/../following-sibling::*//@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))
This is my middlewares.py file:
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random, logging


class UserAgentRotatorMiddleware(UserAgentMiddleware):
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        try:
            self.user_agent = random.choice(self.user_agent_list)
            request.headers.setdefault('User-Agent', self.user_agent)
        except IndexError:
            logging.error("Couldn't fetch the user agent")
class DigikalaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DigikalaDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
And this is my settings.py file:
# -*- coding: utf-8 -*-
# Scrapy settings for digikala project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'digikala'
SPIDER_MODULES = ['digikala.spiders']
NEWSPIDER_MODULE = 'digikala.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'digikala (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'demo_crawl.middlewares.UserAgentRotatorMiidleware': 400
}
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'digikala.pipelines.DigikalaPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 60
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 120
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Can anyone help me out with this error?
Thank you!!!