mardi 27 décembre 2016

How can I output list/content pairs in my Python code?

I have been developing a Python web crawler for this website (http://ift.tt/2i6fd3b). So far I have written two functions, each of which works well on its own: one collects the list of stock entries, and the other collects the content data for each entry in that list.

I would like my code to output pairs such as "list#1/content#1", "list#2/content#2", "list#3/content#3", ... What code should I add at the end of my script?

Please help me to work this out.

Thanks.

from bs4 import BeautifulSoup
import urllib.request

# URL prefix of the paginated listing pages; fetch_post_list appends the
# page number (as a string) to it.
CAR_PAGE_TEMPLATE = "http://ift.tt/2iD2gej"
# URL prefix for a single post; fetch_post_content appends a listing row's
# href to it.
BASE_PAGE = 'http://ift.tt/1NnGOTt'

def fetch_post_list(pages=range(20, 21)):
    """Scan listing pages and return the link of the last priced row.

    For every page number in ``pages`` the listing page
    ``CAR_PAGE_TEMPLATE + str(page)`` is downloaded, the ``<table
    class="cyber">`` element is located, and each matching ``<tr>`` row is
    inspected. A row counts only if the ``<em>`` inside its fourth ``<td>``
    has non-empty text; rows with empty text are skipped.

    Args:
        pages: Iterable of page numbers to crawl. Defaults to page 20 only,
            which is what the function crawled before this parameter existed.

    Returns:
        The ``href`` of the ``<a>`` in the second ``<td>`` of the last
        qualifying row seen, or ``None`` when no row qualified.

    Note:
        Only the *last* link is returned. To pair every listing with its
        content, collect each ``lst_link`` into a list instead and call
        ``fetch_post_content`` on every element.
    """
    # Pre-bind so that "no qualifying row" yields None instead of raising
    # UnboundLocalError at the return statement.
    lst_link = None

    for i in pages:
        URL = CAR_PAGE_TEMPLATE + str(i)
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(URL) as res:
            html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')

        # One <tr> per listing on the page.
        lists = table.find_all('tr', itemtype="http://ift.tt/pngIkZ")

        for lst in lists:
            # Look the cells up once per row instead of once per field.
            cells = lst.find_all('td')
            if not cells[3].find('em').text:
                continue  # row has no price text; skip it
            lst_link = cells[1].find('a')['href']

    return lst_link

def fetch_post_content(lst_link):
    """Download one post page and return its basic-information ``<dd>`` tags.

    The page at ``BASE_PAGE + lst_link`` is fetched and parsed. Inside the
    ``<div class="rightarea">`` element, the first nested ``<div>`` is read
    as the basic-information section and the second as the seller section.
    Every ``<dd>`` in both sections is required to contain a ``<span
    class="t">`` and a ``<span class="s">``.

    Args:
        lst_link: The href of a listing row, appended to ``BASE_PAGE``.

    Returns:
        The list of ``<dd>`` tags found in the first section. The seller
        section is walked but is not part of the return value.

    Raises:
        IndexError: If ``rightarea`` has fewer than two nested ``<div>``
            elements, or a ``<dd>`` lacks one of the two spans.
    """
    URL = BASE_PAGE + lst_link
    # Context manager guarantees the HTTP response is closed.
    with urllib.request.urlopen(URL) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Basic Information
    table = soup.find('div', class_='rightarea')
    sections = table.find_all('div')

    # Number, Year, Mileage, Gas Type, Color, Accident
    content_table1 = sections[0]
    dds = content_table1.find_all('dd')
    for dd in dds:
        # Presumably 't' is the label and 's' the value (the original debug
        # output printed them as "t : s") -- confirm against the page.
        span_t = dd.find_all('span', {'class': 't'})[0]
        span_s = dd.find_all('span', {'class': 's'})[0]

    # Seller Information
    content_table2 = sections[1]
    dds2 = content_table2.find_all('dd')
    for dd2 in dds2:
        # Read from dd2 (the seller item), not from the leftover `dd` of the
        # previous loop.
        span_t = dd2.find_all('span', {'class': 't'})[0]
        span_s = dd2.find_all('span', {'class': 's'})[0]

    return dds




Aucun commentaire:

Enregistrer un commentaire