I'm trying to scrape data from a Google Scholar profile, for example "https://scholar.google.com.au/citations?user=zD0vtfwAAAAJ&hl=en#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3DzD0vtfwAAAAJ%26citation_for_view%3DzD0vtfwAAAAJ%3Ad1gkVwhDpl0C%26tzom%3D-330". I want to scrape all the co-authors' names, which are in the div tags with class 'gsc_vcd_value', but I'm not able to reach them directly, so I tried descending through the enclosing tags one level at a time. My exact problem is this: down to the div tag with id "gs_md_cita-d-bdy", I'm able to scrape everything, i.e. all the child tags contained in each tag. But when I try to do the same for the div tag with id "gs_md_cita-l", I get only the tag itself in return, without any of its child tags. Please suggest what I am missing.
import bs4
import urllib
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import urllib.request
# NOTE(review): the next line is a collapsed, truncated paste. The class
# header, the constructor (shown as `init`, presumably `__init__` -- confirm)
# and the start of method `f` are run together on one line, and the
# `if (self.maxP` condition is cut off. The original indentation of every
# line below is lost as well, so the loop nesting cannot be read from here.
class Scraper(): def init(self, url, maxP): self.url= url self.maxP = maxP def f(self): for i in range(0,1000,100): if (self.maxP
# Step through the listing with offsets 0, 100, 200, ... below `pageSize`.
# `pageSize` is not assigned anywhere in the visible code; presumably it is
# set by the truncated condition above -- confirm.
for j in range(0, pageSize, 100):
# Append the offset (`cstart`) and a fixed `pagesize=100` to the base URL.
S_url=self.url + "&cstart=" + str(j) +"&pagesize=100"
# Fetch the page, read the whole response body, then close the connection.
my_url = uReq(S_url)
page_html = my_url.read()
my_url.close()
# Parse the listing page with the lxml parser.
page_soup = soup(page_html, "lxml")
# Collect the <td> cells by CSS class. None of these four results is used
# again in the visible code.
aTag = page_soup.findAll('td', {'class': 'gsc_rsb_std'})
Titles = page_soup.findAll('td', {'class': 'gsc_a_t'})
Citations = page_soup.findAll('td', {'class': 'gsc_a_c'})
Years = page_soup.findAll('td', {'class': 'gsc_a_y'})
# Collect the <a class="gsc_a_at"> anchors; each one is visited below.
info_page = page_soup.findAll('a', {'class' : 'gsc_a_at'})
for author in info_page:
# Read the anchor's `data-href` attribute.
Author_names_link = author["data-href"]
# Cut the two identifiers out of the link by fixed character positions:
# characters 53..64 and the last 12 characters.
# NOTE(review): assumes the link always has the same layout -- TODO confirm.
user=Author_names_link[53:65]
n_input=Author_names_link[-12:]
# Build the URL for the citation view from the two identifiers.
# NOTE(review): everything after `#` is a URL fragment, which is not sent to
# the server with the request. The dialog contents are presumably filled in
# by JavaScript in a browser, which would explain why the innermost div
# comes back without children -- confirm.
n_author_url="https://scholar.google.com.au/citations?user="+user+"&hl=en#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3D"+user+"%26citation_for_view%3D"+user+"%3A"+n_input+"%26tzom%3D-330"
# Fetch that URL, read the whole response body, then close the connection.
author_url=uReq(n_author_url)
n_page=author_url.read()
author_url.close()
# Parse with the built-in "html.parser" (the listing page above used "lxml").
n_page_soup=soup(n_page, "html.parser")
n_tag=n_page_soup
# Descend one level at a time: each findAll searches only inside the element
# matched by the previous step.
m_tag=n_tag.findAll('div', {'id': 'gs_top'})
# The loop names `i` and `j` below are also used by the loops at the top.
for i in m_tag:
p_tag=i.findAll('div', {'data-h': '800'})
for j in p_tag:
q_tag=j.findAll('div', {'id': 'gs_md_cita-d'})
for k in q_tag:
r_tag=k.findAll('div', {'id': 'gs_md_cita-d-bdy'})
for l in r_tag:
# Innermost lookup; the result is not used further in the visible code.
s_tag=l.findAll('div', {'id': 'gs_md_cita-l'})
Aucun commentaire:
Enregistrer un commentaire