dimanche 20 mai 2018

Scrapy _ How to append / delete text to listing URL

I'm new to Python and Scrapy. I'm trying to create a spider to scrape: https://www.festicket.com/festivals/

I've managed to get the spider working, the problem is that some URLs are like so: https://www.festicket.com/festivals/electric-daisy-carnival-edc-las-vegas/2018/

and some URLs have /shop/#ticket appended to them, which is stopping the spider from crawling the listing page.

My question is: is there some way that, if the spider finds a URL with /shop/#ticket, it simply deletes the /shop/#ticket but keeps the rest of the URL?

My code so far is below:

import scrapy

class AuthorsSpider(scrapy.Spider):
    """Scrape festival listings from festicket.com and follow each one.

    ``parse`` handles the paginated listing pages and schedules one request
    per festival link; ``parse_details`` turns a festival page into a single
    item dict.
    """

    name = "festicket"

    # Number of additional listing pages requested after the first one.
    npages = 20

    # This mimics getting the pages using the next button: the first listing
    # page has no query string, the following ones are ?page=2 .. ?page=npages+1.
    start_urls = ['https://www.festicket.com/festivals/'] + [
        "https://www.festicket.com/festivals/?page={}".format(page)
        for page in range(2, npages + 2)
    ]

    @staticmethod
    def _strip_shop_suffix(url):
        """Return *url* without a trailing ``/shop/#ticket``-style suffix.

        Some listing links point at the ticket shop of a festival
        (``.../2018/shop/#ticket``) instead of the festival page itself
        (``.../2018/``). The fragment and the final ``shop/`` path segment
        are dropped so both kinds of link resolve to the festival page.
        URLs without that suffix are returned unchanged apart from losing
        any ``#fragment``.
        """
        # Drop the "#ticket" fragment (or any other fragment) first.
        base = url.partition("#")[0]
        # Keep the slash that precedes "shop" so the result still ends in "/".
        if base.endswith("/shop/"):
            base = base[:-len("shop/")]
        elif base.endswith("/shop"):
            base = base[:-len("shop")]
        return base

    # Scrape and follow listings
    def parse(self, response):
        """Yield one request per festival link found on a listing page."""
        urls = response.xpath(
            "//h3[@class='festival-title heading-3ry notranslate']//@href").extract()
        for url in urls:
            # Normalise shop links before resolving them against the page URL.
            url = response.urljoin(self._strip_shop_suffix(url))
            yield scrapy.Request(url=url, callback=self.parse_details)

    def parse_details(self, response):
        """Yield a dict of fields extracted from a festival detail page.

        Every value comes from ``extract_first()``, so a field whose XPath
        matches nothing is ``None`` rather than raising.
        """
        yield {
            'title': response.xpath("//h1[@class='sc-jzJRlG gbLQoU']/text()").extract_first(),
            'festival_url': response.xpath("//meta[@property='og:url']/@content").extract_first(),
            'location': response.xpath("//ul[contains(@class,'styles__StyledList')][1]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
            'address': response.xpath("//div[@class='sc-gzVnrw bpJeJY'][2]/section[@class='sc-gZMcBi gDrvBk']/div/p[@class='sc-chPdSV hifsJb']/descendant::text()").extract_first(),
            'date': response.xpath("//ul[contains(@class,'styles__StyledList')][1]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
            'genre1': response.xpath("//ul[contains(@class,'styles__StyledList')][2]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
            'genre2': response.xpath("//ul[contains(@class,'styles__StyledList')][2]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
            'genre3': response.xpath("//ul[contains(@class,'styles__StyledList')][2]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][3]/descendant::text()").extract_first(),
            # 'subtitle2' was listed twice with the same XPath; one entry is enough.
            'subtitle2': response.xpath("//span[@class='styles__StyledHtmlWrapper-l0qhyk-0 cUaVYv sc-jAaTju jlDUtI']/p/descendant::text()").extract_first(),
            'subtitle1': response.xpath("//h2[@class='sc-cSHVUG gCeeYI']/descendant::text()").extract_first(),
            'para1': response.xpath("//span[@class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[1]/descendant::text()").extract_first(),
            'para2': response.xpath("//span[@class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[2]/descendant::text()").extract_first(),
            'para3': response.xpath("//span[@class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[3]/descendant::text()").extract_first(),
            'flyer': response.xpath("//img[contains(@class,'styles__Artwork')]/@src").extract_first(),
            'banner_image_1': response.xpath("//div[@class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][1]/div[@class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[@class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/@src").extract_first(),
            'banner_image_2': response.xpath("//div[@class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][2]/div[@class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[@class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/@src").extract_first(),
            'banner_image_3': response.xpath("//div[@class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][3]/div[@class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[@class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/@src").extract_first(),
        }




Aucun commentaire:

Enregistrer un commentaire