urllib2.Request() with data returns empty url

Posted by Mr. Polywhirl on Stack Overflow See other posts from Stack Overflow or by Mr. Polywhirl
Published on 2013-10-31T20:37:25Z Indexed on 2013/10/31 21:55 UTC
Read the original article Hit count: 341

Filed under:

urllib2

My main concern is the function: getUrlAndHtml()

If I manually build and append the query to the end of the uri, I can get the response.url(), but if I pass a dictionary as the request data, the url does not come back. Is there anyway to guarantee the redirected url?

In my example below, if thisWorks = True I get back a url, but the returned url is the request url as opposed to a redirect link.

On a sidenote, the encoding for .E2.80.93 does not translate to - for some reason?

#!/usr/bin/python
import pprint
import urllib
import urllib2
from bs4 import BeautifulSoup
from sys import argv

URL = 'http://en.wikipedia.org/w/index.php?'

def yesOrNo(boolVal):
    return 'yes' if boolVal else 'no'

def getTitleFromRaw(page):
    return page.strip().replace(' ', '_')

def getUrlAndHtml(title, printable=False):    
    thisWorks = False

    if thisWorks:
        query = 'title={:s}&printable={:s}'.format(title, yesOrNo(printable))
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(URL + query)

    else:
        params = {'title':title,'printable':yesOrNo(printable)}
        data = urllib.urlencode(params)
        headers = {'User-agent':'Mozilla/5.0'};
        request = urllib2.Request(URL, data, headers)
        response = urllib2.urlopen(request)

    return response.geturl(), response.read()

def getSoup(html, name=None, attrs=None):
    soup = BeautifulSoup(html)
    if name is None:
        return None
    return soup.find(name, attrs)

def setTitle(soup, newTitle):
    title = soup.find('div', {'id':'toctitle'})
    h2 = title.find('h2')
    h2.contents[0].replaceWith('{:s} for {:s}'.format(h2.getText(), newTitle))

def updateLinks(soup, url):
    fragment = '#'
    for a in soup.findAll('a', href=True):
        a['href'] = a['href'].replace(fragment, url + fragment)


def writeToFile(soup, filename='out.html', indentLevel=2):
    with open(filename, 'wt') as out:
        pp = pprint.PrettyPrinter(indent=indentLevel, stream=out)
        pp.pprint(soup)
    print('Wrote {:s} successfully.'.format(filename))


if __name__ == '__main__':
    def exitPgrm():
        print('usage: {:s} "<PAGE>" <FILE>'.format(argv[0]))
        exit(0)

    if len(argv) == 2:
        help = argv[1]
        if help == '-h' or help == '--help':
            exitPgrm()

    if False:'''
    if not len(argv) == 3:
        exitPgrm()
    '''

    page = 'Led Zeppelin' # argv[1]
    filename = 'test.html' # argv[2]

    title = getTitleFromRaw(page)
    url, html = getUrlAndHtml(title)
    soup = getSoup(html, 'div', {'id':'toc'})
    setTitle(soup, page)
    updateLinks(soup, url)
    writeToFile(soup, filename)

Developer IT

urllib2.Request() with data returns empty url - Developer IT

urllib2.Request() with data returns empty url

python

url

python-2.7

beautifulsoup

urllib2

Related posts about python

unmet dependencies in Ubuntu 12.04

How can I get sikuli-ide to work?

Getting PATH right for python after MacPorts install

call python with system() in R to run a python script emulating the python console

Python - Calling a non python program from python?

Related posts about url

mod_rewrite for clean URL doesn't convert the URL to clean URL (but it's accessible) [on hold]

Tip/Trick: Fix Common SEO Problems Using the URL Rewrite Extension

mod_rewrite one url to another url without changing source url

ASP.NET MVC without Url Rewriting/Pretty Url

Ant get task throws "get doesn't support nested resources element" error

Categories cloud