Hello I'm a python newbie, sorry to be asking such a specific question when I don't know what is going wrong..
I'm trying to crawl news articles from a korean new site. When i run this code
import sys
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import quote
target_url_b4_pn="http://ift.tt/2so3rEd"
target_url_b4_keyword='&query='
target_url_rest="&check_news1&more=1&sorting1&search_date1&v1=&v2=&range=1"
def get_text(URL, output_file):
source_code_from_URL=urllib.request.urlopen(URL)
soup=BeautifulSoup(source_code_from_URL, 'lxml', from_encoding='UTF-8')
content_of_article=soup.select('div.article')
for item in content_of_article:
string_item=str(item.find_all(text=True))
output_file.write(string_item)
def get_link_from_news_title(page_num, URL, output_file):
for i in range(page_num):
current_page_num=1+i*15
position=URL.index('=')
URL_with_page_num=URL[:position+1]+str(current_page_num)+URL[position+1:]
source_code_from_URL=urllib.request.urlopen(URL_with_page_num)
soup=BeautifulSoup(source_code_from_URL, 'lxml',from_encoding='UTF-8')
for title in soup.find_all('p','tit'):
title_link=title.select('a')
article_URL=title_link[0]['href']
get_text(article_URL, output_file)
def main():
keyword="노무현"
page_num=1
output_file_name="output.txt"
target_url=target_url_b4_pn+target_url_b4_keyword+quote(keyword)+target_url_rest
output_file=open(output_file_name, "w", -1, "utf-8")
get_link_from_news_title(page_num, target_url, output_file)
output_file.close()
if __name__=='__main__':
main()
print(target_url)
print(11111)
The jupyter notebook doesn't respond to the input, doesnt even spit out any simple command at the bottom(doesn't print anything)
Think the code is freezing it somehow, please tell me where it might be going wrong?
the picture where it's not responding
Aucun commentaire:
Enregistrer un commentaire