Scraping Links With PHP
By Justin Laing
2008-01-06
Your Completed Link Scraper
|
|
/**
 * Record a scraped link in the `links` table.
 *
 * Uses the legacy mysql extension without an explicit link identifier, so a
 * connection must already have been opened elsewhere before this is called.
 * Terminates the script via die() when the INSERT fails.
 *
 * @param string $url           The href value that was found on the page.
 * @param string $gathered_from The URL of the page the link was gathered from.
 */
function storeLink($url, $gathered_from)
{
    // Both values originate from remote pages, so they must be escaped before
    // being placed inside the quoted SQL literals (prevents SQL injection and
    // broken queries on URLs that contain a single quote).
    $safe_url = mysql_real_escape_string($url);
    $safe_gathered_from = mysql_real_escape_string($gathered_from);

    $query = "INSERT INTO links (url, gathered_from) VALUES ('$safe_url', '$safe_gathered_from')";
    mysql_query($query) or die('Error, insert query failed');
}
// Page to scrape and the user agent string sent with the request.
$target_url = "http://www.merchantos.com/";
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

// make the cURL request to $target_url
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, $target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html = curl_exec($ch);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns false only on failure;
// an empty string is a successful request with an empty body, so compare
// strictly instead of relying on truthiness.
if ($html === false) {
    echo " cURL error number:" . curl_errno($ch);
    echo " cURL error:" . curl_error($ch);
    curl_close($ch);
    exit;
}
curl_close($ch);

if ($html === '') {
    // Nothing to parse; report it as what it is rather than as a cURL error.
    echo " No content returned from: $target_url";
    exit;
}

// parse the html into a DOMDocument
// Real-world markup is rarely valid, so collect libxml's parse warnings
// internally instead of silencing every error with the @ operator.
$previous_libxml_setting = libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);

// grab all the anchors on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");

// store every link that was found, remembering which page it came from
foreach ($hrefs as $href) {
    $url = $href->getAttribute('href');
    if ($url === '') {
        // getAttribute() returns '' for anchors without an href (e.g. named
        // anchors); there is no link to store for those.
        continue;
    }
    storeLink($url, $target_url);
    echo " Link stored: $url";
}
Tutorial Pages:
»
Scraping Links With PHP
»
Get The Page Content
»
Tip: Fake Your User Agent
»
Using PHP’s DOM Functions To Parse The HTML
»
XPath Makes Getting The Links You Want Easy
»
Iterate And Store Your Links
» Your Completed Link Scraper
»
What Else Could I Do With This Thing?
»
Is Scraping Content Legal?
Originally posted on Makebeta
|

|