I have been developing a Python web crawler for this website (http://ift.tt/2i6fd3b). So far, I have made two functions, which work well separately. One collects the list of stock, and the other collects the content data of each list item.
I would like my code to output pairs of "list#1/content#1", "list#2/content#2", "list#3/content#3", and so on. What code should I add at the end of my code?
Please help me to work this out.
Thanks.
from bs4 import BeautifulSoup
import urllib.request
# Listing-index URL prefix; a page number is appended to it when crawling.
# NOTE(review): these are ift.tt shortened links — confirm what they resolve to.
CAR_PAGE_TEMPLATE = "http://ift.tt/2iD2gej"
# Site base URL; each post's relative link is appended to it to fetch content.
BASE_PAGE = 'http://ift.tt/1NnGOTt'
def fetch_post_list():
    """Collect the detail-page links of the stock listings.

    Crawls each listing-index page (currently only page 20), parses the
    'cyber' table, and gathers the link of every row that has a price.

    Returns:
        list[str]: the relative detail-page link of each listing, in page
        order, so the caller can pair each link with its fetched content.
    """
    links = []
    for page in range(20, 21):
        url = CAR_PAGE_TEMPLATE + str(page)
        res = urllib.request.urlopen(url)
        soup = BeautifulSoup(res.read(), 'html.parser')
        table = soup.find('table', class_='cyber')
        # 50 listings per page; each qualifying <tr> is one listing.
        rows = table.find_all('tr', itemtype="http://ift.tt/pngIkZ")
        for row in rows:
            cells = row.find_all('td')  # hoisted: was re-queried per field
            # Skip rows without a price text (placeholder/ad rows).
            if not cells[3].find('em').text:
                continue
            links.append(cells[1].find('a')['href'])
    # BUG FIX: the original `return lst_link` returned only the link of the
    # LAST row processed; return every collected link instead so each one
    # can be paired with its content by the caller.
    return links
def fetch_post_content(lst_link):
    """Fetch one listing's detail page and return its basic-info <dd> tags.

    Args:
        lst_link (str): relative link to the post, appended to BASE_PAGE.

    Returns:
        list: the <dd> tags of the first info block (number, year, mileage,
        gas type, color, accident).
    """
    url = BASE_PAGE + lst_link
    res = urllib.request.urlopen(url)
    soup = BeautifulSoup(res.read(), 'html.parser')
    table = soup.find('div', class_='rightarea')
    # Basic information: number, year, mileage, gas type, color, accident.
    dds = table.find_all('div')[0].find_all('dd')
    for dd in dds:
        span_t = dd.find_all('span', {'class': 't'})[0]
        span_s = dd.find_all('span', {'class': 's'})[0]
        # print(span_t.text, ':', span_s.text)
    # Seller information.
    dds2 = table.find_all('div')[1].find_all('dd')
    for dd2 in dds2:
        # BUG FIX: the original read from `dd` (the stale loop variable of
        # the first loop) instead of `dd2`, so seller spans were never used.
        span_t = dd2.find_all('span', {'class': 't'})[0]
        span_s = dd2.find_all('span', {'class': 's'})[0]
        # print(span_t.text, ':', span_s.text)
    return dds
Aucun commentaire:
Enregistrer un commentaire