Trying to scrape a page with php/curl, trouble with cookies, post vars, and hidden fields.
Posted
by Patrick
on Stack Overflow
See other posts from Stack Overflow
or by Patrick
Published on 2010-03-29T03:34:31Z
Indexed on
2010/03/29
3:43 UTC
Read the original article
Hit count: 514
Im trying to use CURL to scrape the results from the page http://adlab.msn.com/Online-Commercial-Intention/Default.aspx
My understanding, is that i visit this page, it places cookie info and sets a few variables. I enter my query, select the query radio option, and click go.
The problem, is that its not working the way that it does through the website as im trying to get it to using the code below.
Ive tweaked several things, but im posting here in hopes someone can find what im missing.
This is my code as it stands now:
include("simple_html_dom.php");
$ckfile = tempnam ("/tmp", "CURLCOOKIE");
$query = $_GET['query'];
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "http://adlab.msn.com/Online-Commercial-Intention/default.aspx");
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_COOKIEJAR, $ckfile);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");
$pagetext1 = curl_exec($ch);
curl_exec($ch);
$html2 = str_get_html($pagetext1);
$viewstate = $html2->find('input[id=__VIEWSTATE]', 1)->plaintext;
echo $query."<br>".$viewstate."<br>";
$params = array(
'__EVENTTARGET' => "",
'__EVENTARGUMENT' => "",
'__LASTFOCUS' => "",
'__VIEWSTATE' => "$viewstate",
'MyMaster%3ADemoPageContent%3AtxtQuery' => "$query",
'MyMaster%3ADemoPageContent%3Alan' => "QueryRadio",
'MyMaster%3ADemoPageContent%3AgoButton.x' => "17",
'MyMaster%3ADemoPageContent%3AgoButton.y' => "12",
'MyMaster%3ADemoPageContent%3AidQuery' => "$query",
'MyMaster%3AHiddenKeywordTextBox' => "",
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://adlab.msn.com/Online-Commercial-Intention/default.aspx');
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_REFERER, 'http://adlab.msn.com/Online-Commercial-Intention/default.aspx');
curl_setopt($ch, CURLOPT_POSTFIELDS, '$params');
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt ($ch, CURLOPT_COOKIEFILE, $ckfile);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");
$pagetext = curl_exec($ch);
curl_exec($ch);
// echo $ckfile;
$html = str_get_html($pagetext);
$ret = $html->find('.intentionLabel', 1)->plaintext;
echo $ret."<br><br><br>";
echo $pagetext;
© Stack Overflow or respective owner