webscraper grabbing images, but not entering info into database
Posted
by
Jason
on Stack Overflow
See other posts from Stack Overflow
or by Jason
Published on 2011-01-08T05:47:24Z
Indexed on
2011/01/08
5:53 UTC
Read the original article
Hit count: 251
Hello, again. I'm having more issues with my script entering info into my database. The script below grabs a page, strips down the necessary info, then downloads the related image file. After that, it is supposed to enter the information gleaned from the URL into the database. For some reason, the script seems to iterate through the URLs, as I get downloaded images for each URL, but each URL's product is not entered into the database. The script will insert the first product's categories and product info, and then it just stops, and continues to download images.
Any suggestions?
<?php
define('IN_PHPBB', true);
$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/simple_html_dom.' . $phpEx);
// Start session management
$user->session_begin();
$auth->acl($user->data);
$user->setup();
set_time_limit(259200);
function save($in, $out)
{
$ch = curl_init ($in);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_BINARYTRANSFER,1);
$rawdata=curl_exec($ch);
curl_close ($ch);
if(file_exists($out))
{
unlink($out);
}
$fp = fopen($out,'x');
fwrite($fp, $rawdata);
fclose($fp);
}
function scrape($i)
{
$url = 'http:/xxxxxxxx/index.php?main_page=product_info&products_id='.$i.'&zenid=e4b7dde8de02e1df005d4549e2e3e529';
echo "$url -- ";
$exists = file_get_contents($url);
if ($exists != false)
{
$html = file_get_html($url);
foreach($html->find('body') as $html)
{
$test = $html->find('#productName', 0);
if ($test)
{
$item['title'] = trim($html->find('#productName', 0)->plaintext);
$item['price'] = trim($html->find('#productPrices', 0)->plaintext);
$item['cat'] = $html->find('#navBreadCrumb', 0)->plaintext;
list($home, $item['cat'], $item['subcat'], $title) = explode("::", $item['cat']);
$item['cat'] = str_replace(" ", "", $item['cat']);
$item['subcat'] = str_replace("\n", "", str_replace(" ", "", $item['subcat']));
$item['desc'] = trim($html->find('#productDescription', 0)->plaintext);
$item['model'] = $html->find('ul#productDetailsList', 0)->find('li', 0)->plaintext;
$item['model'] = explode(":", $item['model']);
$item['model'] = trim($item['model'][1]);
$item['manufacturer'] = $html->find('ul#productDetailsList', 0)->find('li', 1)->plaintext;
$item['manufacturer'] = explode(":", $item['manufacturer']);
$item['manufacturer'] = trim($item['manufacturer'][1]);
foreach($html->find('img') as $img)
{
if($img->alt == $item['title'])
{
$item['img_sm'] = $img->src;
}
}
$ret[] = $item;
}
}
$html->clear();
unset($html);
unset($item);
return $ret;
}
else
{
echo "Could not find page<br />";
}
unset($exists);
}
$i = 1;
$end = 9999999;
while($i < $end)
{
$ret = scrape($i);
if(isset($ret))
{
foreach($ret as $v)
{
$item['title'] = $v['title'];
$item['price'] = $v['price'];
$item['desc'] = $v['desc'];
$item['model'] = $v['model'];
$item['manufacturer'] = $v['manufacturer'];
$item['image'] = $v['image'];
$item['cat'] = $v['cat'];
$item['subcat'] = $v['subcat'];
$item['img_sm'] = $v['img_sm'];
}
unset($ret);
unset($v);
$sm_img_src = "http://xxxxxx/".$item['img_sm'];
$ext = strrchr($item['img_sm'], '.');
$filename = $item['model'] . $ext;
$lg_img_src = "http://xxxxx/images/STC/".$filename;
$new_sm = "./rip_images/small/{$filename}";
$new_lg = "./rip_images/large/{$filename}";
$item['image'] = $filename;
save($lg_img_src,$new_lg);
save($sm_img_src,$new_sm);
//see if parent cat exists
$sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['cat']).'"';
$result = $db->sql_query($sql);
$parent = $db->sql_fetchrow($result);
$db->sql_freeresult($result);
// if not exists
if($parent['cat_id'] == '')
{
//add the parent cat to the db
$sql_ary = array(
'cat_name' => $item['cat'],
'cat_parent' => 0
);
$sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
$cat_id = $db->sql_nextid();
//see if subcat exists
$sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
$result = $db->sql_query($sql);
$row = $db->sql_fetchrow($result);
$db->sql_freeresult($result);
// if not exists
if($row['cat_id'] == '')
{
//add subcat to db
$sql_ary = array(
'cat_name' => $db->sql_escape($item['subcat']),
'cat_parent' => $cat_id
);
$sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
$item_cat = $db->sql_nextid();
}
else //if exists
{
$item_cat = $row['cat_id'];
}
}
else //if parent cat exists
{
//see if subcat exists
$sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
$result = $db->sql_query($sql);
$row = $db->sql_fetchrow($result);
$db->sql_freeresult($result);
// if not exists
if($row['cat_id'] == '')
{
//add the subcat to the db
$sql_ary = array(
'cat_name' => $db->sql_escape($item['subcat']),
'cat_parent' => $parent['cat_id']
);
$sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
$item_cat = $db->sql_nextid();
}
else //if exists
{
$item_cat = $row['cat_id'];
}
}
$sql_ary = array(
'item_title' => $db->sql_escape($item['title']),
'item_price' => $db->sql_escape($item['price']),
'item_desc' => $db->sql_escape($item['desc']),
'item_model' => $db->sql_escape($item['model']),
'item_manufacturer' => $db->sql_escape($item['manufacturer']),
'item_image' => $db->sql_escape($item['image']),
'item_cat' => $db->sql_escape($item_cat)
);
$sql = 'INSERT INTO ' . SHOP_ITEM_TABLE . ' ' . $db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
garbage_collection();
echo 'Done<br />';
}
$i++;
unset($item);
}
?>
© Stack Overflow or respective owner