Thursday, October 1, 2020

How do you scrape a table on a .ASPX website that has multiple tables?

I'm working on scraping soil data from https://websoilsurvey.sc.egov.usda.gov/App/WebSoilSurvey.aspx. I've written the script below to select an area of interest and run a report, which leaves me on a page containing multiple tables, one of which holds the soil data I need. I'm not familiar with scraping tables, and I've never worked with an .ASPX site before.

All the online documentation I can find about parsing tables with Beautiful Soup starts from a URL that points directly at the table. That doesn't work here: the URL on this .ASPX site never changes as I enter parameters (it stays the URL above the entire time). On top of that, the page has multiple tables, so I also need some way to identify which one to scrape.

Any ideas on how to parse this table?
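
From what I can tell, Selenium already holds the rendered page, so I may not need a URL at all: once the report has loaded I could hand driver.page_source to Beautiful Soup and work out which of the page's tables is the soil-data one. This rough sketch is as far as my thinking has gotten (it just enumerates the tables and their headers so I can identify the right one by eye):

soup = bs(driver.page_source, 'html.parser')
# Every <table> currently present in the rendered DOM
tables = soup.find_all('table')
print(len(tables))
# Show each table's header cells to spot the soil-data table
for i, t in enumerate(tables):
    print(i, [th.get_text(strip=True) for th in t.find_all('th')])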

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
from bs4 import BeautifulSoup as bs



# Downloading File #

# Using Chrome to access web
driver = webdriver.Chrome()
delay = 5  # seconds for the explicit waits below
# Open the website
driver.get('https://websoilsurvey.sc.egov.usda.gov/App/WebSoilSurvey.aspx')

# Soil Survey Area: wait until the panel header is clickable, then open it
myElem = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="Quick_Navigation_Soil_Survey_Area_header"]/span/span')))
myElem.click()

states = ['ND', 'SD', 'MN']

for state in states:
    # Select State
    select = Select(driver.find_element_by_id('ssa_select_state'))
    select.select_by_value(state)
    # Wait until the survey-area grid reflects the chosen state
    WebDriverWait(driver, 30).until(
        EC.text_to_be_present_in_element(
            (By.XPATH, '//*[@id="navigateBySSAGrid"]/div[2]/div[1]/div[3]'), state))
    # Grab the county dropdown and its <option> entries
    county_elem = driver.find_element_by_id('ssa_county_select')
    selectall = Select(county_elem)
    counties = county_elem.find_elements_by_tag_name('option')
    # Print for checks
    print('Total Counties for ' + state + ': ' + str(len(counties[1:])))
    for county in counties[1:]:
        # Select County
        selectall.select_by_value(county.get_attribute('value'))
        myElem = WebDriverWait(driver, delay).until(
            EC.text_to_be_present_in_element(
                (By.XPATH, '//*[@id="navigateBySSAGrid"]/div[2]/div[1]/div[3]'),
                county.get_attribute('value')))
        # Click first county bubble
        time.sleep(3)
        # NOTE: `bubble` must be located before it can be clicked; this XPath
        # is only a placeholder assumption, not the site's real locator
        bubble = driver.find_element_by_xpath('//*[@id="countyBubble"]')
        driver.execute_script("arguments[0].click();", bubble)
        # Set AOI
        time.sleep(1)
        driver.find_element_by_xpath('//*[@id="navigatebyssaformid"]/div[3]/button[1]').click()
        # Click Soil Data Explorer tab
        time.sleep(4)
        driver.find_element_by_xpath('//*[@id="Soil_Data_Explorer"]').click()
        # Expand Vegetative Productivity
        myElem = WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="Vegetative_Productivity_unfold"]')))
        driver.find_element_by_xpath('//*[@id="Vegetative_Productivity_unfold"]').click()
        # Expand Crop Productivity Index
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="Vegetative_Productivity_Crop_Productivity_Index_header"]/span/span').click()
        # View Rating
        myElem = WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ParameterForm190ViewRating_bottom"]')))
        driver.find_element_by_xpath('//*[@id="ParameterForm190ViewRating_bottom"]').click()
        # Parse table
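        # Sketch of the parsing step (untested): the URL never changes, but
        # driver.page_source holds the fully rendered report DOM, so pandas
        # can read the tables straight out of it instead of from a URL.
        # pd.read_html returns one DataFrame per <table> on the page; which
        # index holds the soil data is an assumption to verify by inspection.
        report_tables = pd.read_html(driver.page_source)
        soil_df = report_tables[-1]  # guess: the report table renders last
        print(soil_df.head())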
    break


