This script takes a list of book titles (BookTitles.txt), searches Goodreads for each title, and writes the URL of the first result for each one to a CSV file (GoodReadsBooksNew.csv).
I am getting the error below:
iii@iii:~$ python /home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py
Traceback (most recent call last):
File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 72, in create_csv_file()
File "/home/iii/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 29, in create_csv_file with open('/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'w+', encoding='utf-8') as csv_file:
TypeError: 'encoding' is an invalid keyword argument for this function
import csv
import json
import time

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
# Order matters below: the Chrome Options import must stay after the Firefox
# one so that `Options` is bound to the Chrome class.
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
class Book:
    """A book title paired with the URL found for it.

    Iterating over an instance yields ``title`` then ``url``, so
    ``list(book)`` produces one CSV row.
    """

    def __init__(self, title, url):
        self.title, self.url = title, url

    def __iter__(self):
        # Field order matches the CSV header: Title, URL.
        return iter((self.title, self.url))
# Goodreads home page; init_selenium() loads it right after starting the browser.
url = 'https://www.goodreads.com/'
def create_csv_file(path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Create (or truncate) the output CSV and write its header row.

    Requires Python 3: the ``encoding`` keyword is not accepted by the
    Python 2 built-in ``open()``, which fails with
    ``TypeError: 'encoding' is an invalid keyword argument``.

    Args:
        path: Location of the CSV file. Defaults to the script's
            GoodReadsBooksNew.csv.
    """
    header = ['Title', 'URL']
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; plain 'w' suffices because nothing is read back.
    with open(path, 'w', encoding='utf-8', newline='') as csv_file:
        wr = csv.writer(csv_file, delimiter=',')
        wr.writerow(header)
def read_from_txt_file(path='/home/iii/AudioBookReviews/WebScraping/BookTitles.txt'):
    """Return the lines of the titles file, one list entry per line.

    Only the trailing newline is stripped from each line; blank lines are
    kept as empty strings. Requires Python 3 (``open(..., encoding=...)``).

    Args:
        path: Location of the UTF-8 text file. Defaults to the script's
            BookTitles.txt.

    Returns:
        A list of strings in file order.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path, encoding='utf-8') as titles_file:
        return [line.rstrip('\n') for line in titles_file]
def init_selenium():
    """Start Chrome and store it in the module-level ``driver``.

    After launching the browser this loads the Goodreads home page
    (the module-level ``url``), pauses five seconds, and then opens the
    search page.
    """
    global driver
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # '--headless' used to be set on a second Options object that was never
    # handed to the driver, so it had no effect; apply it to the options
    # that are actually used.
    chrome_options.add_argument('--headless')
    # NOTE(review): newer Selenium releases prefer `options=` over
    # `chrome_options=`; kept as-is to match the Selenium version this
    # script was written for -- confirm before upgrading.
    driver = webdriver.Chrome("/home/iii/AudioBookReviews/WebScraping/chromedriver", chrome_options=chrome_options)
    driver.get(url)
    time.sleep(5)
    driver.get('https://www.goodreads.com/search?q=')
def search_for_title(title):
    """Submit *title* to the Goodreads search page in the shared ``driver``.

    Opens the search page, types the title into the ``q`` field and
    presses RETURN. If the first result link is found at the expected
    position, its ``href`` is printed; otherwise nothing is printed.

    Args:
        title: Text to type into the search box.
    """
    driver.get('https://www.goodreads.com/search?q=')
    search_field = driver.find_element_by_name('q')
    search_field.clear()
    search_field.send_keys(title)
    search_field.send_keys(keys.Keys.RETURN)  # submit the search form
    # find_elements_* returns an empty list instead of raising when nothing
    # matches, so a title with no results no longer aborts the whole run.
    first_hits = driver.find_elements_by_xpath(
        '/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/a')
    if first_hits:
        print(first_hits[0].get_attribute('href'))
def scrape_url():
    """Return the ``href`` of the first ``a.bookTitle`` link on the current page.

    Returns:
        The link's ``href`` attribute, or the string ``"N/A"`` when
        Selenium cannot find or read the element.
    """
    try:
        return driver.find_element_by_css_selector('a.bookTitle').get_attribute('href')
    except WebDriverException:
        # Only Selenium failures are treated as "no result"; KeyboardInterrupt
        # and programming errors are no longer swallowed by a bare except.
        return "N/A"
def write_into_csv_file(vendor, path='/home/iii/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv'):
    """Append one row to the output CSV.

    Requires Python 3 (``open(..., encoding=...)``).

    Args:
        vendor: Any iterable of field values (for example a ``Book``);
            it is converted with ``list()`` to form the row.
        path: Location of the CSV file. Defaults to the script's
            GoodReadsBooksNew.csv.
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires.
    with open(path, 'a', encoding='utf-8', newline='') as csv_file:
        wr = csv.writer(csv_file, delimiter=',')
        wr.writerow(list(vendor))
# Script body: write the CSV header, load the titles, then look up each
# title in the browser and append a (title, URL) row per title.
create_csv_file()
titles = read_from_txt_file()
init_selenium()
try:
    for title in titles:
        search_for_title(title)
        # A separate name is used so the module-level `url` constant is not rebound.
        book_url = scrape_url()
        book = Book(title, book_url)
        write_into_csv_file(book)
finally:
    # Always shut the browser down, even if a lookup fails part-way, so no
    # chromedriver/Chrome processes are left running.
    driver.quit()
Aucun commentaire:
Enregistrer un commentaire