Parsování stránky v PHP

Musíme mít nainstalovanou knihovnu php5-curl.

# Install the php5-curl package (the library the text above lists as a prerequisite).
sudo apt-get install php5-curl
# Restart the apache2 service -- presumably so the web server's PHP loads the new extension.
sudo service apache2 restart

Vytvoříme si například soubor cURLapi.php:

<?php

/* Pozn.:
 * apt-get install php5-curl 
 */

/**
 * Downloads a page over HTTP(S) with cURL, sending browser-like request headers.
 *
 * Requires the PHP cURL extension (the php5-curl package mentioned above).
 *
 * @param string $url       Absolute URL of the page to download.
 * @param bool   $verifySsl Whether to verify the server's TLS certificate chain.
 *                          Defaults to true; pass false only for hosts whose
 *                          (e.g. self-signed) certificate you deliberately trust.
 *
 * @return string|false The response body without headers, or false when $url is
 *                      not a non-empty string, the cURL handle cannot be created,
 *                      or the transfer itself fails.
 */
function cURL_getPage($url, $verifySsl = true) {
    // Reject unusable input up front instead of letting cURL fail on it later.
    if (!is_string($url) || trim($url) === '') {
        return false;
    }

    $curl = curl_init();
    if ($curl === false) {
        return false;
    }

    /* BEGIN: userAgent */
    // NOTE(review): the leading "Firefox (WindowsXP) - " looks like a label copied
    // together with the real UA string; kept unchanged so the request stays identical.
    $userAgent = "Firefox (WindowsXP) - Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6";
    /* END: userAgent */

    /* BEGIN: header */
    $httpHeaderArray = array(
        "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: "  // sent with an empty value on purpose
    );
    /* END: header */

    /* BEGIN: referer */
    // The target URL is embedded as a query-string value, so it must be URL-encoded;
    // otherwise characters such as "&", "#" or spaces would corrupt the referer.
    $refererLink = "https://www.google.com/search?q=" . urlencode($url);
    /* END: referer */

    curl_setopt($curl, CURLOPT_URL, $url);  // address to download
    // No CURLOPT_SSLVERSION: pinning it to 3 (SSLv3) breaks the handshake with
    // current servers, so the TLS version is left for libcurl to negotiate.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, (bool) $verifySsl);  // verify the certificate chain
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);  // certificate must match the host name
    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);  // User-Agent header
    curl_setopt($curl, CURLOPT_HTTPHEADER, $httpHeaderArray);  // additional request headers
    curl_setopt($curl, CURLOPT_REFERER, $refererLink);  // initial Referer header
    curl_setopt($curl, CURLOPT_HEADER, false);  // do not include response headers in the result
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);  // return the body instead of printing it
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);  // follow HTTP redirects
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);  // set Referer automatically when redirected
    curl_setopt($curl, CURLOPT_ENCODING, "");  // empty string: accept every encoding cURL supports
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);  // give up after 10 redirects
    // The connect timeout used to be 120 s, which could never take effect because
    // the whole transfer is capped at 10 s below; both limits now agree.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);  // seconds allowed for connecting
    curl_setopt($curl, CURLOPT_TIMEOUT, 10);  // seconds allowed for the whole transfer
    /* Proxy (optional, e.g. a local SOCKS5 proxy) */
    // curl_setopt($curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5);
    // curl_setopt($curl, CURLOPT_PROXY, "127.0.0.1");
    // curl_setopt($curl, CURLOPT_PROXYPORT, 9050);

    $html = curl_exec($curl);  // run the transfer; false on failure
    curl_close($curl);  // release the handle
    return $html;  // page body, or false
}

A do souboru index.php vložíme:

<?php

// Example front page: downloads one URL and prints its source code as escaped text.
require('cURLapi.php');
header('Content-Type: text/html; charset=utf-8'); // serve the output as UTF-8

$url = "http://nejakaUrl.tld";
$page = cURL_getPage($url);

if ($page === false) {
    // The download failed; show a message instead of a silently empty <pre>.
    // The text (including the URL) is escaped below together with normal output.
    $page = 'Stránku se nepodařilo stáhnout: ' . $url;
}

// Escape the markup so the browser shows the page's source code. Commenting out
// this line makes the downloaded page render as a normal web page instead.
// The charset is passed explicitly and ENT_SUBSTITUTE replaces invalid UTF-8
// sequences, so a page in another encoding does not turn into an empty string.
$page = htmlspecialchars($page, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
echo '<pre>' . $page . '</pre>';

A krásně nám to stránku tahá 😉, toť vše.