webscraper grabbing images, but not entering info into database

Posted by Jason on Stack Overflow See other posts from Stack Overflow or by Jason
Published on 2011-01-08T05:47:24Z Indexed on 2011/01/08 5:53 UTC
Read the original article Hit count: 248

Filed under:
|
|
|

Hello, again. I'm having more issues with my script entering info into my database. The script below grabs a page, strips down the necessary info, then downloads the related image file. After that, it is supposed to enter the information gleaned from the URL into the database. For some reason, the script seems to iterate through the URLs, as I get downloaded images for each URL, but each URL's product is not entered into the database. The script will insert the first product's categories and product info, and then it just stops, and continues to download images.

Any suggestions?

<?php

define('IN_PHPBB', true);
$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/simple_html_dom.' . $phpEx);

// Start session management
$user->session_begin();
$auth->acl($user->data);
$user->setup();

set_time_limit(259200);

function save($in, $out)
{
    $ch = curl_init ($in);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_BINARYTRANSFER,1);
    $rawdata=curl_exec($ch);
    curl_close ($ch);
    if(file_exists($out))
    {
        unlink($out);
    }
    $fp = fopen($out,'x');
    fwrite($fp, $rawdata);
    fclose($fp);
}

function scrape($i)
{
    $url = 'http:/xxxxxxxx/index.php?main_page=product_info&products_id='.$i.'&zenid=e4b7dde8de02e1df005d4549e2e3e529';
    echo "$url -- ";
    $exists = file_get_contents($url);
    if ($exists != false)
    {
        $html = file_get_html($url); 

        foreach($html->find('body') as $html)
        {
            $test = $html->find('#productName', 0);
            if ($test)
            {
                $item['title'] = trim($html->find('#productName', 0)->plaintext);
                $item['price'] = trim($html->find('#productPrices', 0)->plaintext);
                $item['cat'] = $html->find('#navBreadCrumb', 0)->plaintext;
                list($home, $item['cat'], $item['subcat'], $title) = explode("::", $item['cat']);
                $item['cat'] = str_replace("&nbsp;", "", $item['cat']);
                $item['subcat'] = str_replace("\n", "", str_replace("&nbsp;", "", $item['subcat']));
                $item['desc'] = trim($html->find('#productDescription', 0)->plaintext);
                $item['model'] = $html->find('ul#productDetailsList', 0)->find('li', 0)->plaintext;
                $item['model'] = explode(":", $item['model']);
                $item['model'] = trim($item['model'][1]);
                $item['manufacturer'] = $html->find('ul#productDetailsList', 0)->find('li', 1)->plaintext;
                $item['manufacturer'] = explode(":", $item['manufacturer']);
                $item['manufacturer'] = trim($item['manufacturer'][1]);
                foreach($html->find('img') as $img)
                {
                    if($img->alt == $item['title'])
                    {
                        $item['img_sm'] = $img->src;
                    }
                }

                $ret[] = $item;

            }
        }
        $html->clear();
        unset($html);
        unset($item);
        return $ret;
    }
    else
    {
        echo "Could not find page<br />";
    }
    unset($exists);
}

$i = 1;
$end = 9999999;

while($i < $end)
{
    $ret = scrape($i);

    if(isset($ret))
    {
        foreach($ret as $v)
        {
            $item['title'] = $v['title'];
            $item['price'] = $v['price'];
            $item['desc'] = $v['desc'];
            $item['model'] = $v['model'];
            $item['manufacturer'] = $v['manufacturer'];
            $item['image'] = $v['image'];
            $item['cat'] = $v['cat'];
            $item['subcat'] = $v['subcat'];
            $item['img_sm'] = $v['img_sm'];
        }
        unset($ret);
        unset($v);

        $sm_img_src = "http://xxxxxx/".$item['img_sm'];
        $ext = strrchr($item['img_sm'], '.');

        $filename = $item['model'] . $ext;

        $lg_img_src = "http://xxxxx/images/STC/".$filename;
        $new_sm = "./rip_images/small/{$filename}";
        $new_lg = "./rip_images/large/{$filename}";

        $item['image'] = $filename;

        save($lg_img_src,$new_lg);
        save($sm_img_src,$new_sm);

        //see if parent cat exists
        $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['cat']).'"';
        $result = $db->sql_query($sql);
        $parent = $db->sql_fetchrow($result);
        $db->sql_freeresult($result);
        // if not exists
        if($parent['cat_id'] == '')
        {
            //add the parent cat to the db
            $sql_ary = array(
                'cat_name' => $item['cat'],
                'cat_parent' => 0
            );
            $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
            $db->sql_query($sql);
            $cat_id = $db->sql_nextid();

            //see if subcat exists
            $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
            $result = $db->sql_query($sql);
            $row = $db->sql_fetchrow($result);
            $db->sql_freeresult($result);
            // if not exists
            if($row['cat_id'] == '')
            {
                //add subcat to db
                $sql_ary = array(
                    'cat_name' => $db->sql_escape($item['subcat']),
                    'cat_parent' => $cat_id
                );
                $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
                $db->sql_query($sql);
                $item_cat = $db->sql_nextid();
            }
            else //if exists
            {
                $item_cat = $row['cat_id'];
            }
        }
        else //if parent cat exists
        {
            //see if subcat exists
            $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
            $result = $db->sql_query($sql);
            $row = $db->sql_fetchrow($result);
            $db->sql_freeresult($result);
            // if not exists
            if($row['cat_id'] == '')
            {
                //add the subcat to the db
                $sql_ary = array(
                    'cat_name' => $db->sql_escape($item['subcat']),
                    'cat_parent' => $parent['cat_id']
                );
                $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
                $db->sql_query($sql);
                $item_cat = $db->sql_nextid();
            }
            else //if exists
            {
                $item_cat = $row['cat_id'];
            }
        }

        $sql_ary = array(
            'item_title'      => $db->sql_escape($item['title']),
            'item_price'     => $db->sql_escape($item['price']),
            'item_desc'      => $db->sql_escape($item['desc']),
            'item_model'    => $db->sql_escape($item['model']),
            'item_manufacturer' => $db->sql_escape($item['manufacturer']),
            'item_image'    => $db->sql_escape($item['image']),
            'item_cat'      => $db->sql_escape($item_cat)
        );

        $sql = 'INSERT INTO ' . SHOP_ITEM_TABLE . ' ' . $db->sql_build_array('INSERT', $sql_ary);
        $db->sql_query($sql);
        garbage_collection();
        echo 'Done<br />';
    }
    $i++;
    unset($item);
}

?>

© Stack Overflow or respective owner

Related posts about php

Related posts about sql