I'm trying to use Scrapy to extract data from Craigslist posts. I will only be extracting from a few hundred posts.
I've been able to extract the links and titles of all the Craigslist posts. However, I also want to follow each link to extract the text from each Craigslist post.
The code raises `SyntaxError: 'yield' outside function`, which means the `yield` statement is not indented inside a method body — the spider's method indentation has been lost.
from urllib.parse import urljoin

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class CraigslistSampleItem(scrapy.Item):
    """Container for one Craigslist posting scraped by MySpider."""
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    email = scrapy.Field()
    craigID = scrapy.Field()
    text = scrapy.Field()
    postDate = scrapy.Field()
    url = scrapy.Field()
    # Declared because MySpider.parse_item_page assigns them; a scrapy.Item
    # raises KeyError on assignment to any field not declared here.
    contentLen = scrapy.Field()
    updateDate = scrapy.Field()
class MySpider(CrawlSpider):
    """Scrape Craigslist listing pages, then follow each posting link to
    collect the posting's body/date details via parse_item_page."""
    name = "craigslist"
    allowed_domains = ["craigslist.org"]
    base_url = "http://ift.tt/1HuNRIp?"
    start_urls = ["http://ift.tt/1HuNRIp?"]
    # Append paginated listing URLs (s=100, s=200, ...); range(1, 2) keeps
    # this to a single extra page — widen the range to crawl more pages.
    for i in range(1, 2):
        start_urls.append(base_url + "s=" + str(i) + "00")

    def parse(self, response):
        """Extract title/link from each listing row and follow the link.

        The original code had `yield` at class level (the reported
        SyntaxError); the request must be yielded inside this method.
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        # NOTE: the original `for titles in titles` shadowed the sequence
        # being iterated; use a distinct loop variable.
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            # extract() returns a list of strings; join it before building
            # URLs — urljoin() requires a string, not a list.
            link = ''.join(item["link"])
            item["url"] = 'http://ift.tt/HloISj{}'.format(link)
            item["link"] = urljoin(response.url, link)
            # Follow the posting link; carry the partially-filled item
            # through to the posting-page callback via meta.
            yield scrapy.Request(item["link"],
                                 meta={'item': item},
                                 callback=self.parse_item_page)

    def parse_item_page(self, response):
        """Fill in posting-body and date fields on the item from parse."""
        item = response.request.meta["item"]
        item["contentLen"] = len(response.xpath("//section[@id='postingbody']").xpath("text()").extract())
        postinginfo = response.xpath("//p[@class = 'postinginfo']").xpath("time/@datetime")
        # Guard: some postings may lack postinginfo/time elements; indexing
        # an empty selector list would raise IndexError.
        if postinginfo:
            item["postDate"] = postinginfo[0].extract()
            item["updateDate"] = postinginfo[-1].extract()
        yield item
Aucun commentaire:
Enregistrer un commentaire