My main concern is the function: getUrlAndHtml()
If I manually build and append the query to the end of the uri, I can get the response.url(), but if I pass a dictionary as the request data, the url does not come back. Is there anyway to guarantee the redirected url?
In my example below, if thisWorks = True I get back a url, but the returned url is the request url as opposed to a redirect link.
On a sidenote, the encoding for .E2.80.93 does not translate to - for some reason?
#!/usr/bin/python
import pprint
import urllib
import urllib2
from bs4 import BeautifulSoup
from sys import argv
URL = 'http://en.wikipedia.org/w/index.php?'
def yesOrNo(boolVal):
return 'yes' if boolVal else 'no'
def getTitleFromRaw(page):
return page.strip().replace(' ', '_')
def getUrlAndHtml(title, printable=False):
thisWorks = False
if thisWorks:
query = 'title={:s}&printable={:s}'.format(title, yesOrNo(printable))
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(URL + query)
else:
params = {'title':title,'printable':yesOrNo(printable)}
data = urllib.urlencode(params)
headers = {'User-agent':'Mozilla/5.0'};
request = urllib2.Request(URL, data, headers)
response = urllib2.urlopen(request)
return response.geturl(), response.read()
def getSoup(html, name=None, attrs=None):
soup = BeautifulSoup(html)
if name is None:
return None
return soup.find(name, attrs)
def setTitle(soup, newTitle):
title = soup.find('div', {'id':'toctitle'})
h2 = title.find('h2')
h2.contents[0].replaceWith('{:s} for {:s}'.format(h2.getText(), newTitle))
def updateLinks(soup, url):
fragment = '#'
for a in soup.findAll('a', href=True):
a['href'] = a['href'].replace(fragment, url + fragment)
def writeToFile(soup, filename='out.html', indentLevel=2):
with open(filename, 'wt') as out:
pp = pprint.PrettyPrinter(indent=indentLevel, stream=out)
pp.pprint(soup)
print('Wrote {:s} successfully.'.format(filename))
if __name__ == '__main__':
def exitPgrm():
print('usage: {:s} "<PAGE>" <FILE>'.format(argv[0]))
exit(0)
if len(argv) == 2:
help = argv[1]
if help == '-h' or help == '--help':
exitPgrm()
if False:'''
if not len(argv) == 3:
exitPgrm()
'''
page = 'Led Zeppelin' # argv[1]
filename = 'test.html' # argv[2]
title = getTitleFromRaw(page)
url, html = getUrlAndHtml(title)
soup = getSoup(html, 'div', {'id':'toc'})
setTitle(soup, page)
updateLinks(soup, url)
writeToFile(soup, filename)