i got a DB of files from theguardian.com. I need to reduce these files to only text and remove all Ads and other text. I am able to get main text but when i try to remove bottom element ("div", attrs={"class": "submeta"}) it delete whole text, but text is not part of this element.
# Decomposing
for remove1 in soup.select("figure", attrs={"class": "element-atom"}):
remove1.decompose()
for remove2 in soup.select("aside", attrs={"data-component": "rich-link"}):
remove2.decompose()
for remove3 in soup.select("div", attrs={"class": "submeta"}):
remove3.decompose()
# Extraction of text
textHeadline = soup.find("h1", attrs={"class": "content__headline"})
textUnderline = soup.find("div", attrs={"class": "tonal__standfirst"})
textBody = soup.find("div", attrs={"class": "content__article-body from-content-api js-article__body"})
# Final text
reductionResult = str(textHeadline) + str(textUnderline) + str(textBody)
Thank you for any help
Aucun commentaire:
Enregistrer un commentaire