mardi 2 février 2021

How to extract data with accent mark from a table on the web in R

I'm scraping a pharmacy page, where I need to extract all the data they have. What I did in the first for is to extract the data that comes out in the catalog, which it extracts well. Then for the program to get into the link of each product, I made it extract the product link, which I then occupy in the second for, so that it can extract the information from a table with the details of the drug. It extracts only some data well, since precisely those with an accent mark it puts as NA. So I think it has something to do with it not reading the accent mark words. I need to extract that data, so any suggestions are welcome.

Here I leave the code for you to see and in advance thank you very much!

library(rvest)
library(tidyverse)

#Ultima pagina de ahumada es la 192
lista_url <- paste0("https://www.farmazon.cl/categorias/medicamentos/page/", 1:50,".html?order=name&dir=asc")
#class(lista_url)
lista_url
ultima_pagina= length(lista_url)

#Luego en el for habra que reemplazar el 1:2 por la variable ultima_pagina
lista_final = list()
for(i in 1:2){
  leahtml = read_html(lista_url[i])
  lista_pagina =leahtml %>% 
    html_nodes('.page-main')  %>% html_nodes(".product-item-info") %>%
    map_df( ~ {
      marca = html_nodes(.x, ".product-item-link") %>%  html_text()
      marca <- ifelse(length(marca)>0, marca, NA_character_)
      
      precio = html_nodes(.x, ".price") %>%  html_text()
      precio <- ifelse(length(precio)>0, precio, NA_character_)
      
      
      farmacia = html_nodes(.x, ".vendor-label") %>%  html_text()
      farmacia <- ifelse(length(farmacia)>0, farmacia, NA_character_)
      
      link = html_nodes(.x, "[class='product name product-item-name'] > a") %>% html_attr("href")
      link <- ifelse(length(link)>0, link, NA_character_)
      
      
      list(Marca=marca, Precio=precio, Farmacia=farmacia, Link=link)
    }) 
  lista_final = rbind(lista_final, lista_pagina)
  lista_pagina = list()
  Sys.sleep(2)
  
}


enlaces = as.vector(unlist(lista_final['Link']))

ultimo_enlace = length(enlaces)


resumen_final = list()
for(i in 1:ultimo_enlace){
  leahtml = read_html(enlaces[i])
  
  resumen_pagina =leahtml %>% 
    html_nodes('.data.table.additional-attributes')  %>% 
    map_df( ~ {
      
      recetamed = html_nodes(.x, "[data-th='Receta Médica']") %>% html_text() 
      recetamed <- ifelse(length(recetamed)>0, recetamed, NA_character_)
      
      codisp = html_nodes(.x, "[data-th='Código ISP']") %>% html_text() 
      codisp <- ifelse(length(codisp)>0, codisp, NA_character_)
      
      forma = html_nodes(.x, "[data-th='Forma']") %>% html_text()
      forma <- ifelse(length(forma)>0, forma, NA_character_)
      
      lab = html_nodes(.x, "[data-th='Laboratorio / Fabricante']") %>% html_text() 
      lab <- ifelse(length(lab)>0, lab, NA_character_)
      
      pactivo = html_nodes(.x, "[data-th='Principio Activo']") %>% html_text() 
      pactivo <- ifelse(length(pactivo)>0, pactivo, NA_character_)
      
      concentracion = html_nodes(.x, "[data-th='PA Concentración']") %>% html_text()
      concentracion <- ifelse(length(concentracion)>0, concentracion, NA_character_)
      
      list(Recetamed=recetamed, CodigoISP=codisp, Forma=forma, Lab=lab, PActivo=pactivo, Concentracion=concentracion)
    }) 
  resumen_final = rbind(resumen_final, resumen_pagina)
  resumen_pagina = list()
  Sys.sleep(2)
}

bbdd_final = cbind(lista_final,resumen_final) #une la lista con toda la info con el resumen de cada producto



Aucun commentaire:

Enregistrer un commentaire