I need to do some real estate market research and for that I need the prices and other values of new homes.
My idea was to go to the website where I get the information, open the main search page, and scrape all the RealEstateIDs that take me directly to the individual pages for each house, where I can extract the information I need. I have two problems: first, the search page has around 60 IDs, but I only get about 20 of them; second, in the resulting Excel spreadsheet the column headers are repeated above every entry, and each entry ends up in a single cell instead of being split across separate columns.
If someone could explain how I can solve these problems and tell me what I did wrong, I would be very happy. Here is my code:
import requests
import json
from bs4 import BeautifulSoup as bs
import datetime as dt
import os
import pandas as pd
import pandas_datareader.data as web
import re
import time
import urllib.request
from urllib.request import urlopen
import csv
res = requests.get('https://www.immobilienscout24.de/Suche/S-T/Wohnung-Kauf/Nordrhein-Westfalen/Duesseldorf/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/true?enteredFrom=result_list') #This is the main search site where I get the IDs
soup = bs(res.content, 'lxml')
r = re.compile(r'resultListModel:(.*)')
data = soup.find('script', text=r).text
script = r.findall(data)[0].rstrip(',')
results = json.loads(script)
ids = [item['@id'] for item in results['searchResponseModel']['resultlist.resultlist']['resultlistEntries'][0]['resultlistEntry']] # This should get me the IDs and put them into a list so I can use them later
print(ids) #My test that I get all the IDs and not just a few
data = json.dumps(ids)
houseinfo = {}
csvData = [['id','purchasePrice','Spacesize','District','Flattyp','Rooms']] # Column headers for the Excel spreadsheet later
def get_house_info(ids): # this is the function that inserts the IDs from the list into the URLs and then scrapes the values I need
    for id in ids:
        try:
            sourceCode = urllib.request.urlopen('https://www.immobilienscout24.de/expose/' + str(id)).read()
            purchasePrice = str(sourceCode).split('"purchasePrice":')[1].split(',"geoCode"')[0]
            Spacesize = str(sourceCode).split('"area":')[1].split('},"details"')[0]
            District = str(sourceCode).split('"quarter":')[1].split('},')[0]
            Flattyp = str(sourceCode).split('"is24qa-typ grid-item three-fifths">')[1].split('</dd> </dl> <dl class')[0]
            Rooms = str(sourceCode).split('is24qa-zimmer grid-item three-fifths"> ')[1].split(' </dd> </dl> <dl class=')[0]
            with open('foo.csv', 'a') as csvfile: # appends the values to the file
                cols = ['id', 'price', 'size', 'district', 'flattyp', 'rooms']
                dict_result = {'id': id, 'price': purchasePrice, 'size': Spacesize, 'district': District, 'flattyp': Flattyp, 'rooms': Rooms}
                writer = csv.DictWriter(csvfile, fieldnames=cols)
                writer.writeheader()
                writer.writerow(dict_result)
                csvfile.close()
        except Exception as e:
            print("failed in the main loop", str(e))
get_house_info(ids)
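For the first problem, my guess is that the result list is paginated and my single request only returns the first page of roughly 20 entries. This is a minimal sketch of how I would collect the IDs from several result pages, assuming I can work out the URL for each page (I have not verified how the paging URLs are built); the parsing is the same as in my code above:

def collect_ids(search_urls):
    """Collect the RealEstateIDs from every search result page passed in."""
    all_ids = []
    pattern = re.compile(r'resultListModel:(.*)')
    for url in search_urls:
        page_soup = bs(requests.get(url).content, 'lxml')
        script_tag = page_soup.find('script', text=pattern) # same script tag as above
        model = json.loads(pattern.findall(script_tag.text)[0].rstrip(','))
        entries = model['searchResponseModel']['resultlist.resultlist']['resultlistEntries'][0]['resultlistEntry']
        all_ids.extend(entry['@id'] for entry in entries) # one '@id' per listing
    return all_ids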
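For the second problem, my guess is that writer.writeheader() runs on every pass through the loop, which is why the header appears above every entry, and that my (German) Excel expects ';' as the separator, which would explain why each row lands in a single cell. This is a minimal sketch of how I think the writing part should look; scrape_one_expose is just a stand-in for the string-splitting code above, and the ';' delimiter is only an assumption about my Excel locale:

def scrape_one_expose(id):
    # stand-in for the string splitting above; returns one row per listing
    page = str(urllib.request.urlopen('https://www.immobilienscout24.de/expose/' + str(id)).read())
    return {
        'id': id,
        'price': page.split('"purchasePrice":')[1].split(',"geoCode"')[0],
        'size': page.split('"area":')[1].split('},"details"')[0],
        'district': page.split('"quarter":')[1].split('},')[0],
        'flattyp': page.split('"is24qa-typ grid-item three-fifths">')[1].split('</dd> </dl> <dl class')[0],
        'rooms': page.split('is24qa-zimmer grid-item three-fifths"> ')[1].split(' </dd> </dl> <dl class=')[0],
    }

cols = ['id', 'price', 'size', 'district', 'flattyp', 'rooms']
with open('foo.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=cols, delimiter=';')
    writer.writeheader() # header written exactly once
    for id in ids:
        try:
            writer.writerow(scrape_one_expose(id))
        except Exception as e:
            print('failed for', id, str(e))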