Thank you for reading this post. I need some help: I have tried to scrape this site, "https://ift.tt/3jFnxpS", but I only receive this message in the console:
<403 https://www.arcountydata.com/index.asp>: HTTP status code is not handled or not allowed
This is my code; if anyone could advise me I would be grateful:
import scrapy
import pandas as pd
import datetime
import re
from scrapy import FormRequest
import time
import json
from scrapy import Request
from lxml.html import fromstring
import requests
from itertools import cycle
import traceback
from scrapy.utils.response import open_in_browser
from BentoncountyAR.items import BentoncountyarItem
class LakeCounty(scrapy.Spider):
    """Scrapy spider that looks up parcel records on arcountydata.com.

    Reads parcel IDs from ``input/Parcels.csv`` and issues one GET request
    per row (currently capped at 3 rows via ``nrows=3`` — presumably for
    debugging; remove the cap for a full run).
    """

    name = "OlmstedCounty"
    custom_settings = {'LOG_LEVEL': 'INFO'}
    # BUG FIX: in the original source this literal was split across two
    # physical lines with no continuation, which is a SyntaxError — the
    # module could not even be imported.  Adjacent string literals are
    # concatenated at compile time, so this is one string.
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    )

    def clean(self, cadena):
        """Normalize a scraped cell value to a plain, punctuation-free string.

        Coerces *cadena* to ``str``, strips whitespace and a fixed set of
        noise characters, and maps the sentinel values ``nan`` / ``none`` /
        ``unknown`` (case-insensitive, whole-string match only) to ''.

        Returns the cleaned string (possibly empty).
        """
        cadena = str(cadena).strip()
        # Characters that are simply dropped from the value.
        for junk in (',', '*', '(', ')', '\n', '\r', '\t', ';', '$',
                     'N/A', '"', '\xa0', 'PARID: '):
            cadena = cadena.replace(junk, '')
        cadena = cadena.replace('%', 'percent')
        # NOTE(review): the original had cadena.replace('&', '&'), a no-op —
        # it was most likely meant to be '&amp;' -> '&' before the blog
        # mangled it; confirm against the site's HTML before re-adding.
        # Whole-string sentinel values become the empty string.
        if cadena.lower() in ('nan', 'none', 'unknown'):
            cadena = ''
        return cadena.strip()

    def start_requests(self):
        """Yield one request to the search page per parcel row.

        Drops rows with no PARCELID, rewrites the cleaned CSV, then (for
        now) reads back only the first 3 rows.  Each request carries its
        source row and a per-row cookie jar in ``meta``.
        """
        csv_path = 'input/Parcels.csv'
        df = pd.read_csv(csv_path, dtype=str)
        df = df.dropna(axis=0, subset=['PARCELID'])
        df.to_csv('input/parcels.csv', index=False)
        df = pd.read_csv('input/parcels.csv', dtype=str, nrows=3)

        url = 'https://www.arcountydata.com/index.asp'
        # Hoisted out of the loop: the headers are identical for every row.
        # Sending a browser-like User-Agent/Accept is what avoids the 403
        # the bare Scrapy UA gets from this server.
        header = {
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        }
        for i, row in df.iterrows():
            try:
                yield scrapy.Request(
                    url=url,
                    method='GET',
                    headers=header,
                    callback=self.lala,
                    meta={'row': row, 'cookiejar': i},
                    dont_filter=True,
                )
            except Exception as e:
                # Best-effort: log the failing parcel and keep going.
                print("[0]###########################", row['PARCELID'], e)
Aucun commentaire:
Enregistrer un commentaire