I'm working on scraping soil data from https://websoilsurvey.sc.egov.usda.gov/App/WebSoilSurvey.aspx. I've written the Selenium script below to select an area of interest and run a report, which leaves me with a table of soil data on a page that contains several tables. I'm not familiar with scraping tables, and I've never worked with an .ASPX site before.
All the documentation I can find on parsing with Beautiful Soup starts by fetching the URL of the page that holds the table. On this .ASPX site the URL never changes as I enter parameters (it stays the address above the entire time), so I can't simply request it. On top of that, the page contains multiple tables, so I'd also need to work out which one to scrape.
Any ideas on how to parse this table?
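My rough plan is to skip the URL entirely: once Selenium has driven the report onto the page, hand driver.page_source (the rendered DOM) to Beautiful Soup and pick the table out of that. A minimal sketch, assuming the report renders as an ordinary HTML table -- the id in the last line is a placeholder I'd replace after inspecting the page in DevTools:

from bs4 import BeautifulSoup

# Parse whatever page Selenium currently has loaded; no URL fetch needed.
soup = BeautifulSoup(driver.page_source, 'html.parser')
tables = soup.find_all('table')                   # every <table> on the page
report = soup.find('table', {'id': 'REPORT_ID'})  # placeholder id

Here is the script so far: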
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import xlrd
import os
from openpyxl import load_workbook
import win32com.client as win32
import openpyxl as xl
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
# Downloading File #
# Using Chrome to access web
driver = webdriver.Chrome()
delay = 5
# Open the website
driver.get('https://websoilsurvey.sc.egov.usda.gov/App/WebSoilSurvey.aspx')
# Soil Survey Area
while True:
    try:
        driver.find_element_by_xpath('//*[@id="Quick_Navigation_Soil_Survey_Area_header"]/span/span').click()
        break
    except:
        continue
states = ['ND', 'SD', 'MN']
for state in states:
    # Select State
    select = Select(driver.find_element_by_id('ssa_select_state'))
    select.select_by_value(state)
    while True:
        try:
            myElem = WebDriverWait(driver, delay).until(EC.text_to_be_present_in_element((By.XPATH, '//*[@id="navigateBySSAGrid"]/div[2]/div[1]/div[3]'), state))
            break
        except:
            continue
    select = driver.find_element_by_id('ssa_county_select')
    selectall = Select(driver.find_element_by_id('ssa_county_select'))
    counties = [x for x in select.find_elements_by_tag_name('option')]
    for county in counties[1:]:
        # Print for checks
        print('Total Counties for ' + state + ': ' + str(len(counties[1:])))
        # Select County
        selectall.select_by_value(county.get_attribute('value'))
        myElem = WebDriverWait(driver, delay).until(EC.text_to_be_present_in_element((By.XPATH, '//*[@id="navigateBySSAGrid"]/div[2]/div[1]/div[3]'), county.get_attribute('value')))
        # Click first county bubble ('bubble' was never defined -- the XPath
        # below is a placeholder; point it at the real AOI bubble element)
        time.sleep(3)
        bubble = driver.find_element_by_xpath('//*[@id="navigateBySSAGrid"]/div[2]/div[1]/div[1]')
        driver.execute_script("arguments[0].click();", bubble)
        # Set AOI
        time.sleep(1)
        driver.find_element_by_xpath('//*[@id="navigatebyssaformid"]/div[3]/button[1]').click()
        # Click Soil Data Explorer tab
        time.sleep(4)
        driver.find_element_by_xpath('//*[@id="Soil_Data_Explorer"]').click()
        # Expand Vegetative Productivity
        myElem = WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="Vegetative_Productivity_unfold"]')))
        driver.find_element_by_xpath('//*[@id="Vegetative_Productivity_unfold"]').click()
        # Expand Crop Productivity Index
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="Vegetative_Productivity_Crop_Productivity_Index_header"]/span/span').click()
        # View Rating
        myElem = WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ParameterForm190ViewRating_bottom"]')))
        driver.find_element_by_xpath('//*[@id="ParameterForm190ViewRating_bottom"]').click()
        # Parse table
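        # Sketch of the parsing step I have in mind -- pandas can lift every
        # rendered <table> from the live DOM via driver.page_source, so no
        # separate URL is needed. The match string is a guess at a header
        # unique to the rating table; check the real header in DevTools.
        from io import StringIO  # pandas >= 2.1 expects file-like HTML input
        time.sleep(3)            # crude wait for the rating table to render
        dfs = pd.read_html(StringIO(driver.page_source), match='Map unit symbol')
        rating = dfs[0]          # first (ideally only) matching table
        print(rating.head())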
        break