Wednesday, May 20, 2015

Scrapy Python Craigslist Scraper Extract Text

I'm trying to use Scrapy to extract data from Craigslist posts. I will only be extracting from a few hundred posts.

I've been able to extract the links and titles of all the Craigslist posts. However, I also want to follow each link and extract the text of each post.

The code below is giving an error: SyntaxError: 'yield' outside function.
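
For context, yield is only legal inside a function body, so that error usually means the yield statement has drifted out of the parse method (to class level) through an indentation slip. A minimal sketch of the intended pattern — hypothetical spider name and URL, assuming the Scrapy 1.0 API where spiders may yield plain dicts:

    import scrapy

    class SketchSpider(scrapy.Spider):
        name = "sketch"
        start_urls = ["http://example.com/listing"]

        def parse(self, response):
            for href in response.xpath("//a/@href").extract():
                # yield is valid here, inside the method body; at class
                # level it raises SyntaxError: 'yield' outside function
                yield scrapy.Request(response.urljoin(href),
                                     callback=self.parse_item)

        def parse_item(self, response):
            yield {"url": response.url}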

import scrapy
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CraigslistSampleItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    email = scrapy.Field()
    craigID = scrapy.Field()
    text = scrapy.Field()
    postDate = scrapy.Field()
    updateDate = scrapy.Field()   # set in parse_item_page; must be declared here
    contentLen = scrapy.Field()   # likewise, or the assignment raises KeyError
    url = scrapy.Field()


class MySpider(CrawlSpider):
    name = "craigslist"
    allowed_domains = ["craigslist.org"]
    base_url = "http://ift.tt/1HuNRIp?"
    start_urls = ["http://ift.tt/1HuNRIp?"]

    # append paginated result pages (?s=100, ?s=200, ...);
    # range(1, 2) currently adds only the second page
    for i in range(1, 2):
        start_urls.append(base_url + "s=" + str(i) + "00")

    # rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html",),
    #                                 restrict_xpaths=('//a[@class="button next"]',)),
    #               callback="parse", follow=True),
    #          )
    # NB: CrawlSpider uses parse() internally, so if these rules are
    # re-enabled, the callback below must be renamed (e.g. parse_page).

    def parse(self, response):
        titles = response.xpath("//span[@class='pl']")
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract()
            # the listing-page href is relative; resolve it against the page URL
            link = ''.join(title.xpath("a/@href").extract())
            item["link"] = urljoin(response.url, link)
            item["url"] = item["link"]
            # follow the posting link into the actual post, carrying the
            # partially filled item along in the request meta
            yield Request(item["link"],
                          meta={'item': item},
                          callback=self.parse_item_page)

    def parse_item_page(self, response):
        item = response.meta["item"]
        # number of text nodes in the posting body
        item["contentLen"] = len(response.xpath("//section[@id='postingbody']/text()").extract())
        # the first datetime is the posting date, the last is the latest update
        postinginfo = response.xpath("//p[@class='postinginfo']/time/@datetime")
        item["postDate"] = postinginfo[0].extract()
        item["updateDate"] = postinginfo[-1].extract()
        yield item
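
For reference, one way to run the spider and save the scraped items, assuming Scrapy 1.0's CrawlerProcess API (inside a Scrapy project, scrapy crawl craigslist -o items.json from the command line does the same):

    from scrapy.crawler import CrawlerProcess

    # feed-export settings: write every yielded item to items.json
    process = CrawlerProcess({
        "FEED_FORMAT": "json",
        "FEED_URI": "items.json",
    })
    process.crawl(MySpider)
    process.start()  # blocks until the crawl is finished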



