I' am trying to scrape a specific webpage and its sub-pages for later use within an web application.
Unfortunally I 'am failing to copy my history object created by the getHistory()-Method into my umfragen object.
$history = $this->getHistory($j, $i);
//$history = (object) [];
$td = $this->getData($tr, $inst, $date, $party, $j, $history);
$this->umfragen->{$party}->{$inst} = $td;
My object keys are generated on the fly. If i call the $this->getHistory() on its own and echo its result everything works fine.
Can somebody pls explain me what I'am doing wrong?
Many thanks!
<?php
require('vendor/autoload.php');
header('Content-type: application/xml');
header("Access-Control-Allow-Origin: *");
use PHPHtmlParser\Dom;
ini_set('max_execution_time', 300); //300 seconds = 5 minutes
class Scrpr {
private $institutes = array("allensbach", "emnid", "forsa", "politbarometer", "gms", "dimap", "insa");
private $parties = array("cdu/csu", "spd", "grüne", "fdp", "linke", "afd", "sonstige", "nw_un", "fw");
private $partyids = array("cdu", "spd", "gru", "fdp", "lin", "afd", "son");
private $theaders = array("CDU/CSU", "SPD", "GRÜNE", "FDP", "LINKE", "AfD", "Sonstige", "Nichtwähler/Unentschl.", "FW", "Befragte", "Zeitraum");
private $dom;
private $dates;
private $umfragen;
public function __construct() {
$this->var2 = date("Y/m/d");
$this->dom = new Dom;
$this->dom->load('http://ift.tt/1d7VYiB');
$this->dates = $this->getDomDates();
$this->umfragen = (object) [];
}
public function toJson() {
return json_encode($this->umfragen);
}
public function writeToJson() {
$fp = fopen('results.json', 'w');
fwrite($fp, json_encode($this->umfragen));
fclose($fp);
}
public function run_history() {
for ($i = 0; $i < sizeof($this->partyids); $i++) {
$party = $this->partyids[$i];
for ($j = 0; $j < sizeof($this->institutes); $j++) {
$history = $this->getHistory($j, $i);
$this->umfragen[$i][$j]->history = $history;
//$this->umfragen->{$party}->{$inst}->history = new stdClass();
//$this->umfragen->{$party}->{$inst}->history = $history;
}
}
//return json_encode($history);
}
function getHistory($instId, $partyId) {
$instName = $this->institutes[$instId];
$partyName = $this->partyids[$partyId];
$subDom = $this->dom;
$html = $subDom->load('http://ift.tt/1d7VYiB'.$instName.".htm");
$tbody = $html->find('tbody');
$td_arr = [];
if ($tbody) {
$parent_splitted = $this->splitHtml($tbody);
if ($parent_splitted != false) {
for ($i = 0; $i < sizeof($parent_splitted); $i++) {
$val = $parent_splitted[$i];
if ($this->isEmpty($val)) {
$date = "";
if(preg_match("/\d{2}\.\d{2}.\d{4}/", $val, $matches)) {
$date = $matches[0];
}
else if (preg_match('#\d+(?:\,+\d{1})?#', $val, $matches) && strpos($val, '%')) {
$td = (object) [
"datum" => $date,
"y" => $val,
];
array_push($td_arr, $td);
}
}
}
}
}
$td_filtered = [];
for ($j = 0; $j < sizeof($td_arr); $j++) {
if (fmod($j, 7) == $partyId) {
array_push($td_filtered, $td_arr[$j]);
}
}
return $td_filtered;
}
public function run() {
for ($i = 0; $i < sizeof($this->partyids); $i++) {
$party = $this->partyids[$i];
for ($j = 0; $j < sizeof($this->institutes); $j++) {
$inst = $this->institutes[$j];
$date = $this->dates[$j];
$tr = $this->dom->find('tbody/tr[id='.$party.']');
$history = $this->getHistory($j, $i);
//$history = (object) [];
$td = $this->getData($tr, $inst, $date, $party, $j, $history);
$this->umfragen->{$party}->{$inst} = $td;
}
}
}
private function getData ($parent, $inst, $date, $party, $index , $history) {
$td = (object) [];
$result = [];
$parent_splitted = $this->splitHtml($parent);
if ($parent_splitted != false) {
$td_arr = [];
for ($i = 0; $i < sizeof($parent_splitted); $i++) {
$val = $parent_splitted[$i];
if ($this->isEmpty($val)) {
if ($this->isNumValueEmpty($val)) {
array_push($td_arr, $val);
}
}
}
}
for ($j = 0; $j< sizeof($td_arr); $j++) {
$val = $td_arr[$j];
if ($j === $index) {
$fval = str_replace(",", ".", $val);
$td = (object) [
"history" => $this->getHistory(1, 1),
"datum" => $date,
"y" => $fval,
];
}
}
return $td;
}
private function getDomDates() {
$html = $this->dom->find('tbody/tr[id="datum"]')->innerHtml;
$result = [];
$dates_splitted = $this->splitHtml($html);
foreach ($dates_splitted as $key => $value) {
if ($this->isDate($value)) {
$d = str_replace(".", "-", $value);
$date=date_create($d);
array_push($result, $date);
}
}
return $result;
}
private function splitHtml($html) {
//return preg_split('/<[^>]*[^\/]>/i', $html, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
return preg_split('/<[^>]*[^\/]>/i', $html, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
}
private function isEmpty($val) {
return preg_match('/\S/', $val);
}
private function isNumValueEmpty($val) {
return preg_match('#\d+(?:\,\d{1,2})?#', $val);
}
private function isDate($val) {
return preg_match("/\d{2}\.\d{2}.\d{4}/", $val);
}
private function getArrPosByVal($arr, $value) {
$res = 0;
for ($i = 0; $i < sizeof($arr); $i++) {
if ($arr[$i] === $value) {
$res = $i;
}
}
return $res;
}
}
$scrpr = new Scrpr();
$scrpr->run();
//$scrpr->run_history();
//$scrpr->writeToJson();
echo ($scrpr->toJson());
?>
Aucun commentaire:
Enregistrer un commentaire