lundi 15 avril 2019

Obtaining Twitter Likes

I'm trying to extract the number of likes for each tweet, but my script either returns the wrong number of likes or none at all. I'm pretty sure my code is correct. I believe the cause might be that Twitter tries to prevent people from scraping information from its site. Is there a way to fix this? Also, is there a way to see everyone who has liked a specific tweet?

import re
import requests
import urllib
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from bs4 import BeautifulSoup
import sys
import unittest, time
import openpyxl
# Profile timelines to scrape; each URL gets its own browser session.
url = ["https://twitter.com/CocaCola?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor"]

# Seconds to wait after each scroll so newly requested content can render.
SCROLL_PAUSE_TIME = 0.5
# Stop once this many non-retweet posts have been printed (the script
# previously stopped as soon as its counter exceeded 1).
MAX_POSTS = 2

for x in url:
    d = webdriver.Chrome()
    try:
        d.get(x)
        numb = 0       # non-retweet posts printed so far
        processed = 0  # post containers already examined on earlier passes
        last_height = d.execute_script("return document.body.scrollHeight")

        while numb < MAX_POSTS:
            # Scroll to the bottom and give the page time to load more posts.
            d.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = d.execute_script("return document.body.scrollHeight")

            # Parse the DOM the browser actually rendered. A separate static
            # fetch of the URL never reflects scrolling, and a response object
            # can only be read once, so later passes would parse nothing.
            soup = BeautifulSoup(d.page_source, "html.parser")
            containers = soup.findAll('div', {"class": "content"})

            # Skip containers handled on a previous pass so posts are not
            # printed twice.
            # NOTE(review): assumes new posts are appended after existing
            # ones and earlier ones stay in the DOM -- confirm.
            for posts in containers[processed:]:
                processed += 1

                body = posts.find('p')
                if body is None:
                    # Container without a text paragraph; nothing to report.
                    continue
                # Compare and print as text (not encoded bytes) so the check
                # works on both Python 2 and Python 3.
                text = body.text
                if "Retweeted" in text:
                    continue

                stamp = posts.find('span', {"class": "_timestamp js-short-timestamp"})
                print(stamp.text if stamp is not None else None)
                print(text)

                retweet = posts.find('button', {"class": "ProfileTweet-actionButton js-actionButton js-actionFavorite"})
                print(retweet)

                likes = posts.find('div', {"class": "ProfileTweet-action ProfileTweet-action--favorite js-toggleState"})
                # Guard the nested lookup: find() returns None when the
                # favorite container is absent from this post.
                if likes is not None:
                    print(likes.find('span', {"class": "ProfileTweet-actionCountForPresentation"}))
                else:
                    print(None)

                numb = numb + 1
                if numb >= MAX_POSTS:
                    break

            # The page stopped growing: no more content to load, so stop even
            # if fewer than MAX_POSTS posts were found (avoids looping forever).
            if new_height == last_height:
                break
            last_height = new_height
    finally:
        # Always end the WebDriver session, even if scraping raised.
        d.quit()

Aucun commentaire:

Enregistrer un commentaire