I am currently working on a web crawler that is supposed to visit a list of websites in a directory, visit each site's CSS stylesheets, check for an @media tag (a basic way of checking for responsive design; I know there are other corner cases to consider), and print every website that does not use responsive design to a file.
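For context, the check itself is just a regex search for the literal @media token in the stylesheet text. A standalone illustration of the idea (not code from the spider itself):

import re

css_text = """
body { font-size: 16px; }
@media (max-width: 600px) { body { font-size: 14px; } }
"""

# The stylesheet counts as responsive here if it contains any @media rule
print(re.search(r'@media', css_text) is not None)  # prints True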
I am fairly certain that my method of checking the CSS for an @media tag works fine, but the spider is not visiting all of the CSS files before deciding whether it has found one with an @media tag. I have a test file that logs debugging output as the program progresses, and it shows odd patterns, such as finishing checking all of the CSS files and only then printing what it found in them, which shouldn't happen.
I was hoping someone could look at my code and help me figure out why things are not happening in the order I want. For reference, the goal is (sketched in code after this list):
- Visit a website from the list
- Visit every CSS file in the head element of that site's HTML
- If an @media tag is found, we're done and the site uses responsive design
- If not, continue checking more CSS files
- If no CSS file contains an @media tag, the site does not use responsive design and should be added to the list
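To make that intended order concrete, here is a minimal sketch of the flow I am after, assuming the stylesheet URLs can be collected up front and then chained one at a time through request.meta so that each site's verdict waits for its own CSS files. The record_responsive/record_non_responsive helpers are hypothetical stand-ins for the file writes in my real code:

import re
import scrapy

class SketchSpider(scrapy.Spider):
    # Hypothetical stand-in for my CrawlSpider, showing only the CSS-checking flow
    name = "sketch"

    def parse_website(self, response):
        # Collect every .css link from the head up front
        css_urls = [response.urljoin(href)
                    for href in response.css("head > link::attr(href)").extract()
                    if href.endswith(".css")]
        if not css_urls:
            self.record_non_responsive(response.url)
            return
        # Request the first stylesheet; carry the site URL and the remaining
        # stylesheet URLs in meta so the chain stays tied to this one site
        yield scrapy.Request(css_urls[0], callback=self.check_css,
                             meta={'site': response.url, 'pending': css_urls[1:]})

    def check_css(self, response):
        if re.search(r'@media', response.text):
            # Found an @media rule: the site is responsive, stop the chain
            self.record_responsive(response.meta['site'])
        elif response.meta['pending']:
            # No @media in this file; try the next stylesheet for the same site
            pending = response.meta['pending']
            yield scrapy.Request(pending[0], callback=self.check_css,
                                 meta={'site': response.meta['site'],
                                       'pending': pending[1:]})
        else:
            # Every stylesheet was checked and none contained @media
            self.record_non_responsive(response.meta['site'])

    def record_responsive(self, url):
        with open('outputtrue.txt', 'a') as f:
            f.write(url + "\n")

    def record_non_responsive(self, url):
        with open('outputfalse2.txt', 'a') as f:
            f.write(url + "\n")

The point of the chaining is that one site's verdict can never depend on how Scrapy schedules another site's requests, which is the per-site ordering the list above describes.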
Here's my code (not everything works perfectly; for example, the program times out because I haven't worked out using TimeoutError yet, but for the most part I feel like this should do its job of correctly evaluating websites, and it is not doing that):
import scrapy
import re
import os.path
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from twisted.internet.error import TimeoutError
import time
class LCCISpider(CrawlSpider):
    name = "lcci"
    start_urls = ["http://ift.tt/2c9E1lF"]

    # Calls parse_item for every category link on the main page
    rules = (Rule(LinkExtractor(restrict_xpaths=('//div[@id="catListingResults"]/table/tr')),
                  callback='parse_item', follow=True),)

    website_list = []
    found_media = False
    # Called for each category
    def parse_item(self, response):
        # For each site on the page, request the site and call parse_website
        sites = response.xpath('//div[@id="busListingResults"]/table/tr')
        for site in sites:
            urls = site.xpath('.//td/a[4]/@href').extract()
            for url in urls:
                if len(url) == 0:
                    continue
                new_site = response.urljoin(url)
                yield scrapy.Request(new_site, callback=self.parse_website,
                                     errback=self.errback_website)
    def parse_website(self, response):
        f = open('output2.txt', 'a')
        f.write("NOW VISITING ")
        f.write(response.url)
        f.write("\n")
        f.flush()
        f.close()
        # Reset found_media to False for each website
        self.found_media = False
        # For every link in the head, check the potential CSS file for an @media tag
        for href in response.css("head > link::attr(href)"):
            url = response.urljoin(href.extract())
            # If an @media tag has not been found yet, continue checking CSS
            if self.found_media == False:
                # Request the CSS file and check it in check_css
                yield scrapy.Request(url, callback=self.check_css,
                                     errback=self.errback_website)
                f = open('output2.txt', 'a')
                f.write("step\n")
                f.flush()
                f.close()
            else:
                break
        # If no @media tag was found in any link in the head, record the site as non-responsive
        if self.found_media == False:
            #self.website_list.append(response.url)
            f = open('output2.txt', 'a')
            f.write("No @media tag in ")
            f.write(response.url)
            f.write("\n")
            f.flush()
            f.close()
            f = open('outputfalse2.txt', 'a')
            f.write(response.url)
            f.write("\n")
            f.close()
        else:
            f = open('outputtrue.txt', 'a')
            f.write(response.url)
            f.write("\n")
            f.close()
    def check_css(self, response):
        # Convert the URL to a string so its extension can be checked
        url_string = str(response.url)
        f = open('output2.txt', 'a')
        f.write("Checking CSS in ")
        f.write(response.url)
        f.write("\n")
        f.flush()
        f.close()
        # Only perform the regex search if it's a .css file
        if url_string.endswith(".css"):
            media_match = re.search(r'@media', response.text, flags=0)
            if media_match is not None:
                f = open('output2.txt', 'a')
                f.write("found @media tag in " + response.url + "\n")
                f.flush()
                # If an @media tag is found, set found_media to True
                self.found_media = True
                f.close()
        else:
            f = open('output2.txt', 'a')
            f.write("not css\n")
            f.flush()
            f.close()
    def errback_website(self, failure):
        if failure.check(TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)