samedi 26 janvier 2019

How to scrape all the children tags of a specific tag from the Google Scholar website

I'm trying to scrape data from a Google Scholar profile, for example "https://scholar.google.com.au/citations?user=zD0vtfwAAAAJ&hl=en#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3DzD0vtfwAAAAJ%26citation_for_view%3DzD0vtfwAAAAJ%3Ad1gkVwhDpl0C%26tzom%3D-330". I want to scrape all the co-authors' names, which are in the div tag with class 'gsc_vcd_value', but I'm not able to do that directly, so I tried descending through the enclosing tags in sequence. My exact problem is this: down to the div tag with id "gs_md_cita-d-bdy", I'm able to scrape everything (i.e. all the children tags contained in that tag). But when I then try to do the same for the div tag with id "gs_md_cita-l", I get only the tag itself in return, without any of its children. Please suggest what I am missing.

import bs4
import urllib
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup 

import urllib.request

class Scraper(): def init(self, url, maxP): self.url= url self.maxP = maxP def f(self): for i in range(0,1000,100): if (self.maxP

    # Walk the profile's publication list in pages of 100 entries: `cstart`
    # advances by 100 each iteration and `pagesize` is fixed at 100.
    # NOTE(review): `pageSize` is not defined anywhere in the visible code;
    # presumably it was computed on the truncated line above -- confirm.
    for j in range(0, pageSize, 100):

        S_url=self.url + "&cstart=" + str(j) +"&pagesize=100"

        # Download the raw HTML for this page (uReq is urllib's urlopen).
        my_url = uReq(S_url)
        page_html = my_url.read()
        my_url.close()

        page_soup = soup(page_html, "lxml")

        # Collect table cells by CSS class. These four results are assigned
        # but never read again in the visible code.
        aTag = page_soup.findAll('td', {'class': 'gsc_rsb_std'})

        Titles = page_soup.findAll('td', {'class': 'gsc_a_t'})

        Citations = page_soup.findAll('td', {'class': 'gsc_a_c'})

        Years = page_soup.findAll('td', {'class': 'gsc_a_y'})

        # Anchors with class 'gsc_a_at'; each is expected to carry a
        # "data-href" attribute (a KeyError is raised if it is missing).
        info_page = page_soup.findAll('a', {'class' : 'gsc_a_at'})

        for author in info_page:
            Author_names_link = author["data-href"]
            # Fixed-offset slices of the link: characters 53..64 and the last 12.
            # NOTE(review): presumably these are the user id and the citation
            # id; this breaks if the link layout or id length changes -- confirm.
            user=Author_names_link[53:65]
            n_input=Author_names_link[-12:]

            # NOTE(review): everything after '#' in this URL is a fragment.
            # Fragments are not sent to the server, so this request most likely
            # returns the plain profile page rather than the citation dialog;
            # the dialog content is presumably filled in by JavaScript in a
            # browser. That would explain why 'gs_md_cita-l' comes back with no
            # children -- confirm by inspecting n_page.
            n_author_url="https://scholar.google.com.au/citations?user="+user+"&hl=en#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3D"+user+"%26citation_for_view%3D"+user+"%3A"+n_input+"%26tzom%3D-330"

            author_url=uReq(n_author_url)

            n_page=author_url.read()

            author_url.close()

            # Unlike the listing page above, this page is parsed with the
            # built-in "html.parser" instead of "lxml".
            n_page_soup=soup(n_page, "html.parser")

            n_tag=n_page_soup

            # Descend one nesting level at a time:
            # gs_top -> [data-h="800"] -> gs_md_cita-d -> gs_md_cita-d-bdy
            # -> gs_md_cita-l.
            m_tag=n_tag.findAll('div', {'id': 'gs_top'})

            for i in m_tag:
                p_tag=i.findAll('div', {'data-h': '800'})

                # This `j` rebinds the outer pagination variable `j`; the outer
                # range iterator is unaffected, but `j` no longer holds the
                # page offset after this point in the iteration.
                for j in p_tag:
                    q_tag=j.findAll('div', {'id': 'gs_md_cita-d'})

                    for k in q_tag:
                        r_tag=k.findAll('div', {'id': 'gs_md_cita-d-bdy'})

                        for l in r_tag:
                            # Result is assigned but not used in the visible code.
                            s_tag=l.findAll('div', {'id': 'gs_md_cita-l'})




Aucun commentaire:

Enregistrer un commentaire