mercredi 11 mars 2020

Scrapy: parser profiles

I am writing a parser for a wellness site. Please tell me how to parse data from the "Reviews" and "Phone Numbers & Directions" tabs. I have already managed to get the data from the "Profile" tab.

My scraper wellness.py:

import scrapy

class Wellness(scrapy.Spider):
    """Spider that crawls wellness.com from the /find page down to profiles.

    The crawl is a chain of callbacks, each following links selected from the
    current page: ``parse`` -> ``state`` -> ``city`` -> ``profile_url`` ->
    ``profile``.  From a profile page the spider additionally follows the
    links found under ``#reviews_tab`` and ``#directions_tab`` and emits one
    separate item for each of those pages.
    """

    name = "wellness"
    start_urls = ['https://www.wellness.com/find']

    def parse(self, response):
        """Follow the ``li.categories-li`` links at positions 5 and 6."""
        for a in response.css("li.categories-li a")[5:7]:
            yield response.follow(a, callback=self.state)

    def state(self, response):
        """Follow the first three links inside ``div.find-item-container``."""
        for a in response.css("div.find-item-container a")[0:3]:
            yield response.follow(a, callback=self.city)

    def city(self, response):
        """Follow every ``li.categories-li`` link on the page."""
        for a in response.css("li.categories-li a"):
            yield response.follow(a, callback=self.profile_url)

    def profile_url(self, response):
        """Follow every ``h2 a`` link to a profile, then the next result page."""
        for a in response.css("h2 a"):
            yield response.follow(a, callback=self.profile)

        # response.css() returns a SelectorList, which is never None and is
        # not accepted by response.follow(); extract the href string instead
        # so the check is meaningful and the last page ends the pagination.
        next_page = response.css("li.pagination-next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.profile_url)

    def profile(self, response):
        """Yield the profile item and follow the reviews/directions tabs.

        The profile item is yielded first; the pages behind the two tabs are
        requested afterwards and produce their own items in ``reviews`` and
        ``directions``.
        """
        # Label spans; the value is read from the span that follows each one.
        services = response.xpath('.//span[contains(text(),"Services")]')
        education = response.xpath('.//span[contains(text(),"Education")]')
        training = response.xpath('.//span[contains(text(),"Training")]')

        yield {
            'First and Last name': response.css('h1::text').get(),
            'About': response.css('.listing-about::text').get(),
            'Services': services.xpath('following-sibling::span[1]/text()').extract(),
            'Primary Specialty': response.css('.normal::text').get(),
            'Address': ' '.join([i.strip() for i in response.css('.office-address span::text').getall()]),
            'Practice': response.css('.years-in-service::text').get(),
            'Education': education.xpath('following-sibling::span[1]/text()').extract(),
            'Training': training.xpath('following-sibling::span[1]/text()').extract(),
            'Consumer Feedback': response.css('.item-rating-container a::text').get(),
        }

        # Each tab is followed only when its link actually has an href.
        # (The original tested an undefined ``next_page`` here, which raised
        # NameError before either tab could be requested.)
        reviews_tab = response.css("#reviews_tab a::attr(href)").get()
        if reviews_tab is not None:
            yield response.follow(reviews_tab, callback=self.reviews)

        directions_tab = response.css("#directions_tab a::attr(href)").get()
        if directions_tab is not None:
            yield response.follow(directions_tab, callback=self.directions)

    def reviews(self, response):
        """Yield the first ``.listing-review-text`` text node of the page."""
        yield {
            'Rewiew': response.css('.listing-review-text::text').get(),
        }

    def directions(self, response):
        """Yield the first ``.directions-number`` text node of the page."""
        yield {
            'Number': response.css('.directions-number::text').get(),
        }

Thank you in advance.




Aucun commentaire:

Enregistrer un commentaire