samedi 23 novembre 2019

Getting a TypeError traceback from a simple Python script that scrapes Goodreads.com URLs

This script reads a list of book titles from a text file (BookTitles.txt), searches Goodreads for each title, takes the first search result, and writes the resulting list of URLs to a CSV file (GoodReadsBooksNew.csv).

I am getting the error below:

iii@iii:~$ python /home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py

Traceback (most recent call last):

  File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 72, in <module>
    create_csv_file()

  File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 29, in create_csv_file
    with open('/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'w+', encoding='utf-8') as csv_file:

TypeError: 'encoding' is an invalid keyword argument for this function


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options

from pyvirtualdisplay import Display
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common import keys
import csv
import time
import json

class Book:
    """One output record: a book title together with the URL found for it.

    Iterating an instance yields ``title`` then ``url``, so ``list(book)``
    is a row in the same column order as the CSV header (Title, URL).
    """

    def __init__(self, title, url):
        self.title, self.url = title, url

    def __iter__(self):
        # Snapshot both fields into a fixed-size record and iterate over it.
        record = (self.title, self.url)
        return iter(record)

# Goodreads landing page; init_selenium() loads it once before searching.
url = "https://www.goodreads.com/"

def create_csv_file(path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Create (or truncate) the output CSV and write its header row.

    Args:
        path: Destination CSV file. Defaults to the scraper's output file,
            so existing zero-argument calls behave as before.

    Note:
        ``open(..., encoding=...)`` is a Python 3 feature. Python 2's
        built-in ``open`` rejects it with ``TypeError: 'encoding' is an
        invalid keyword argument for this function``, so this script must
        be run with a Python 3 interpreter.
    """
    header = ['Title', 'URL']
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise rows can gain an extra '\r' on
    # platforms that translate '\n' on write.
    # Plain 'w' suffices: the file is only written here, never read back.
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        wr = csv.writer(csv_file, delimiter=',')
        wr.writerow(header)

def read_from_txt_file(path='/home/iii/AudioBookReviews/WebScraping/BookTitles.txt'):
    """Return the book titles listed in a UTF-8 text file, one per line.

    Args:
        path: Text file to read. Defaults to the scraper's title list, so
            existing zero-argument calls behave as before.

    Returns:
        A list with one string per line of the file, in file order, with
        the trailing newline removed. Blank lines are kept as empty
        strings.
    """
    # The context manager closes the handle deterministically; opening the
    # file inline in the comprehension left it open until garbage collection.
    with open(path, encoding='utf-8') as txt_file:
        return [line.rstrip('\n') for line in txt_file]

def init_selenium():
    """Start Chrome through chromedriver and publish it as the global ``driver``.

    Builds the Chrome options, launches the browser with the chromedriver
    binary at its hard-coded path, loads the Goodreads landing page
    (``url``), pauses five seconds, then opens the search page.

    Side effects:
        Binds the module-level name ``driver``, which the other scraping
        functions read.
    """
    global driver

    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # '--headless' used to be set on a second Options object that was never
    # handed to Chrome, so it silently had no effect. Set it on the options
    # that are actually used.
    chrome_options.add_argument('--headless')

    # 'options=' replaces the deprecated 'chrome_options=' keyword.
    driver = webdriver.Chrome(
        executable_path="/home/iii/AudioBookReviews/WebScraping/chromedriver",
        options=chrome_options,
    )
    driver.get(url)
    # NOTE(review): fixed pause after the landing page; presumably lets the
    # page settle before the first search. Confirm whether it is still needed.
    time.sleep(5)
    driver.get('https://www.goodreads.com/search?q=')

def search_for_title(title):
    """Search Goodreads for ``title`` and print the first result's link.

    Opens the search page in the global ``driver``, types ``title`` into
    the ``q`` field and submits it. The browser is left on the results
    page. The ``href`` of the first result row is printed, or ``N/A`` when
    the results table has no such row.

    Args:
        title: Text to type into the Goodreads search box.
    """
    driver.get('https://www.goodreads.com/search?q=')
    search_field = driver.find_element_by_name('q')
    search_field.clear()
    search_field.send_keys(title)
    # Pressing RETURN submits the form; without it the query is typed but
    # the search is never run.
    search_field.send_keys(keys.Keys.RETURN)

    # find_elements_* returns an empty list rather than raising when nothing
    # matches, so a title with no results no longer aborts the whole run.
    first_result_links = driver.find_elements_by_xpath(
        '/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/a')
    if first_result_links:
        print(first_result_links[0].get_attribute('href'))
    else:
        print("N/A")

def scrape_url():
    """Return the link of the first ``a.bookTitle`` element on the current page.

    Reads from the global ``driver``.

    Returns:
        The element's ``href`` attribute, or the string ``"N/A"`` when
        Selenium cannot find or read the element.
    """
    # Imported here so the block is self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    try:
        url = driver.find_element_by_css_selector('a.bookTitle').get_attribute('href')
    except WebDriverException:
        # Base class of Selenium's errors (no such element, stale element,
        # ...). Unlike a bare ``except:``, this lets KeyboardInterrupt,
        # SystemExit and programming errors such as NameError propagate.
        url = "N/A"

    return url

def write_into_csv_file(vendor, path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Append one row to the output CSV.

    Args:
        vendor: Any iterable of field values; it is converted with
            ``list(vendor)`` and written as a single row.
        path: CSV file to append to. Defaults to the scraper's output
            file, so existing one-argument calls behave as before.
    """
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires for files passed to csv.writer.
    with open(path, 'a', encoding='utf-8', newline='') as csv_file:
        wr = csv.writer(csv_file, delimiter=',')
        wr.writerow(list(vendor))

# Script entry: write the CSV header, load the titles, start the browser,
# then look up each title and append a (title, URL) row to the CSV.
create_csv_file()
titles = read_from_txt_file()
init_selenium()

try:
    for title in titles:
        search_for_title(title)
        # Named book_url so the loop does not rebind the module-level `url`
        # constant that init_selenium() reads.
        book_url = scrape_url()
        book = Book(title, book_url)
        write_into_csv_file(book)
finally:
    # Always shut the browser down, even if a lookup fails part-way through;
    # otherwise Chrome and chromedriver processes are left running.
    driver.quit()



Aucun commentaire:

Enregistrer un commentaire