Thank you for reading this post. I need some help: I have tried to scrape this site, "https://ift.tt/3jFnxpS", but I only receive this message in the console:
<403 https://www.arcountydata.com/index.asp>: HTTP status code is not handled or not allowed
This is my code; if anyone could advise me I would be grateful:
import scrapy
import pandas as pd
import datetime
import re
from scrapy import FormRequest
import time
import json
from scrapy import Request
from lxml.html import fromstring
import requests
from itertools import cycle
import traceback
from scrapy.utils.response import open_in_browser
from BentoncountyAR.items import BentoncountyarItem
class LakeCounty(scrapy.Spider):
    """Scrapy spider that looks up parcel records on arcountydata.com.

    Reads parcel IDs from ``input/Parcels.csv`` and issues one GET request
    per row (currently capped at 3 rows via ``nrows=3`` — presumably for
    debugging; remove the cap for a full run).
    """

    name = "OlmstedCounty"
    custom_settings = {'LOG_LEVEL': 'INFO'}
    # BUG FIX: in the original source this literal was split across two
    # physical lines with no continuation, which is a SyntaxError — the
    # module could not even be imported.  Adjacent string literals are
    # concatenated at compile time, so this is one string.
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    )

    def clean(self, cadena):
        """Normalize a scraped cell value to a plain, punctuation-free string.

        Coerces *cadena* to ``str``, strips whitespace and a fixed set of
        noise characters, and maps the sentinel values ``nan`` / ``none`` /
        ``unknown`` (case-insensitive, whole-string match only) to ''.

        Returns the cleaned string (possibly empty).
        """
        cadena = str(cadena).strip()
        # Characters that are simply dropped from the value.
        for junk in (',', '*', '(', ')', '\n', '\r', '\t', ';', '$',
                     'N/A', '"', '\xa0', 'PARID: '):
            cadena = cadena.replace(junk, '')
        cadena = cadena.replace('%', 'percent')
        # NOTE(review): the original had cadena.replace('&', '&'), a no-op —
        # it was most likely meant to be '&amp;' -> '&' before the blog
        # mangled it; confirm against the site's HTML before re-adding.
        # Whole-string sentinel values become the empty string.
        if cadena.lower() in ('nan', 'none', 'unknown'):
            cadena = ''
        return cadena.strip()

    def start_requests(self):
        """Yield one request to the search page per parcel row.

        Drops rows with no PARCELID, rewrites the cleaned CSV, then (for
        now) reads back only the first 3 rows.  Each request carries its
        source row and a per-row cookie jar in ``meta``.
        """
        csv_path = 'input/Parcels.csv'
        df = pd.read_csv(csv_path, dtype=str)
        df = df.dropna(axis=0, subset=['PARCELID'])
        df.to_csv('input/parcels.csv', index=False)
        df = pd.read_csv('input/parcels.csv', dtype=str, nrows=3)

        url = 'https://www.arcountydata.com/index.asp'
        # Hoisted out of the loop: the headers are identical for every row.
        # Sending a browser-like User-Agent/Accept is what avoids the 403
        # the bare Scrapy UA gets from this server.
        header = {
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        }
        for i, row in df.iterrows():
            try:
                yield scrapy.Request(
                    url=url,
                    method='GET',
                    headers=header,
                    callback=self.lala,
                    meta={'row': row, 'cookiejar': i},
                    dont_filter=True,
                )
            except Exception as e:
                # Best-effort: log the failing parcel and keep going.
                print("[0]###########################", row['PARCELID'], e)
Aucun commentaire:
Enregistrer un commentaire