dimanche 31 janvier 2016

How improve this torrent file scraping php script to reduce disk r/w operations

Here is the php script that I created for downloading torrent file using torrent hash.

It doesn't download torrent file sometimes maybe because of poor header array I created but the failed download info automatically written to a text file for later viewing and downloading mannually .

Please let me post this now. How much details do you want stackexchange.

Also tell if OOP will improve this code. And how to download torrent file reliably in this.

<?php
set_time_limit(0);
include('simple_html_dom.php');

//------------------------------------------------
function getUserAgent()
{
$agents   = array();
$agents[] = "Mozilla/5.0 (Windows NT 10.0; 
        WOW64; rv:41.0) 
        Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36";
$agents[] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.56 (KHTML, like Gecko) Version/9.0 Safari/601.1.56";
$agents[] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36";
$agents[] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36";
$agents[] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";
$agents[] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0";
$agents[] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36";
$agents[] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";
return $agents[rand(0, count($agents) - 1)];
}
//----------------------------------------------------------
//function for curl options
function curl_setopt_my(&$chandle)
{
$headers   = array();
$headers[] = 'Content-Type: */*';
$headers[] = 'Accept: */*';
curl_setopt($chandle, CURLOPT_SSL_VERIFYPEER, 0);
curl_setopt($chandle, CURLOPT_SSL_VERIFYHOST, 0);
curl_setopt($chandle, CURLOPT_CONNECTTIMEOUT, 20);
curl_setopt($chandle, CURLOPT_TIMEOUT, 50);
curl_setopt($chandle, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($chandle, CURLOPT_USERAGENT, getUserAgent());
//curl_setopt($chandle, CURLOPT_COOKIESESSION, 1);
curl_setopt($chandle, CURLOPT_COOKIEJAR, 'katcook.txt');
curl_setopt($chandle, CURLOPT_COOKIEFILE, 'katcook1.txt');
curl_setopt($chandle, CURLOPT_ENCODING, "gzip");
curl_setopt($chandle, CURLOPT_HTTPHEADER, $headers);
}
   //------------------------------------------------------------------------------------
$data = "";
do {
$ch = curl_init('http://ift.tt/1PJmuMc');
curl_setopt_my($ch);
$data = curl_exec($ch);
curl_close($ch);
} while (!$data);
//file_put_contents("newtkat.html", $data);
//echo "------------------------------------------------------------------";

$html = new simple_html_dom();
$html->load($data);
//$magnet = $html->find('a[title="Torrent magnet link"]');
$torrentFile     = $html->find('a[title="Download torrent file"]');
//$magElements = array();
$torFileElements = array();
foreach ($torrentFile as $e)
$torFileElements[] = $e->href;
$countFailed = 1;
foreach ($torFileElements as $e) {
$fileName = substr($e, stripos($e, "?title=") + 7);
$hash     = substr($e, stripos($e, "/torrent/") + 9, 40);
$fp       = fopen('tors\\' . $fileName . '.torrent', 'w+');
$ch1      = curl_init('http://ift.tt/1PJmxaP' . $hash . '.torrent');
curl_setopt_my($ch1);
curl_setopt($ch1, CURLOPT_FILE, $fp);
curl_exec($ch1);
curl_close($ch1);
fclose($fp);
sleep(1);
//echo $fileName. "----completed<br />". PHP_EOL ;

if (filesize('tors\\' . $fileName . '.torrent') > 3000) {
    continue;
}
$ch2 = curl_init('http://ift.tt/1Kjx3t4' . $hash . '.torrent');
$fp2 = fopen('tors\\' . $fileName . '.torrent', 'w+');
curl_setopt_my($ch2);
curl_setopt($ch2, CURLOPT_FILE, $fp2);
curl_exec($ch2);
curl_close($ch2);
fclose($fp2);
sleep(1);

if (filesize('tors\\' . $fileName . '.torrent') > 3000) {
    continue;
}

$ch3 = curl_init('http://ift.tt/1Kjx3t6' . $hash . '.torrent');
$fp3 = fopen('tors\\' . $fileName . '.torrent', 'w+');
curl_setopt_my($ch3);
curl_setopt($ch3, CURLOPT_FILE, $fp3);
curl_exec($ch3);
curl_close($ch3);
fclose($fp3);
sleep(1);

if (filesize('tors\\' . $fileName . '.torrent') > 3000) {
    continue;
}

$ch4 = curl_init('http://ift.tt/1PJmuMe' . $hash . '.torrent');
$fp4 = fopen('tors\\' . $fileName . '.torrent', 'w+');
curl_setopt_my($ch4);
curl_setopt($ch4, CURLOPT_FILE, $fp4);
curl_exec($ch4);
curl_close($ch4);
fclose($fp4);
sleep(1);

if (filesize('tors\\' . $fileName . '.torrent') < 3000) {
    unlink('tors\\' . $fileName . '.torrent');
    file_put_contents("tors\\magnets.txt", $countFailed . ")->" . $hash . ">->" . $fileName . PHP_EOL, FILE_APPEND | LOCK_EX);
    $countFailed++;
    }

}

?> 

Reply soon. Thanks




Aucun commentaire:

Enregistrer un commentaire