I modified some web-scraping code from http://ift.tt/1tOihwR.
from PyQt4.QtCore import QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from sys import argv
from bs4 import BeautifulSoup
# Use result of rendering.
class Render(QWebPage):
    """Load a URL in a QtWebKit page and keep the resulting main frame.

    Constructing an instance blocks inside the Qt event loop until the page
    emits ``loadFinished``. Afterwards ``frame`` holds the page's main frame
    and ``load_ok`` holds the success flag that Qt passed to the signal.
    """

    def __init__(self, url):
        """Start loading *url* and block until the load has finished.

        :param url: address to load; it is passed unchanged to ``QUrl``.
        """
        # Qt permits only one QApplication per process, so reuse the existing
        # one when Render is instantiated more than once.
        self.app = QApplication.instance() or QApplication(argv)
        QWebPage.__init__(self)
        self.frame = None
        self.load_ok = False
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Record the outcome of the load and leave the event loop.

        :param result: success flag delivered by the ``loadFinished`` signal.
        """
        # Keep the flag so callers can tell a failed load from an empty page.
        self.load_ok = bool(result)
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page, then decode the HTML reported by the frame into unicode.
r = Render(url)
html_bytes = r.frame.toHtml().toUtf8()
result = unicode(html_bytes, encoding="UTF-8")

# Parse the markup and remove <script>/<style> elements so that their
# contents do not end up in the extracted text.
soup = BeautifulSoup(result, 'html.parser')
for script in soup.find_all(["script", "style"]):
    script.extract()

# Remaining text of the document, encoded as UTF-8 bytes.
text = soup.get_text().encode("utf-8")
With this code, I found that "nate.com" doesn't return any text, but "www.nate.com" does. So I am trying to add "www." to all domains.
- Are there websites whose domains I shouldn't add "www." to?
(like this)
# Prepend "www." to the host part of `url` unless it is already present,
# keeping any "scheme://" prefix intact.
# NOTE(review): not every site answers on a "www." host name, so this can
# break URLs that only work on the bare domain or on another subdomain;
# confirm per site before applying it to every URL.
scheme, sep, rest = url.partition("//")
if not sep:
    # No "//" in the URL: treat the whole string as host + path.
    scheme, rest = "", url
if not rest.startswith("www."):
    rest = "www." + rest
url = scheme + sep + rest
- (optional) Why doesn't "nate.com" return any text when "www.nate.com" does? In Chrome, I found that it redirects to "www.nate.com".
Any comments are welcome.
Aucun commentaire:
Enregistrer un commentaire