In the book Programming Collective Intelligence I found the following function to compute the PageRank:
def calculatepagerank(self,iterations=20):
# clear out the current PageRank tables
self.con.execute("drop table if exists pagerank")
self.con.execute("create table pagerank(urlid primary key,score)")
self.con.execute("create index prankidx on pagerank(urlid)")
# initialize every url with a PageRank of 1.0
self.con.execute("insert into pagerank select rowid,1.0 from urllist")
self.dbcommit()
for i in range(iterations):
print "Iteration %d" % i
for (urlid,) in self.con.execute("select rowid from urllist"):
pr=0.15
# Loop through all the pages that link to this one
for (linker,) in self.con.execute("select distinct fromid from link where toid=%d" % urlid):
# Get the PageRank of the linker
linkingpr=self.con.execute("select score from pagerank where urlid=%d" % linker).fetchone()[0]
# Get the total number of links from the linker
linkingcount=self.con.execute("select count(*) from link where fromid=%d" % linker).fetchone()[0]
pr+=0.85*(linkingpr/linkingcount)
self.con.execute("update pagerank set score=%f where urlid=%d" % (pr,urlid))
self.dbcommit()
However, this function is very slow, because of all the SQL queries in every iteration
>>> import cProfile
>>> cProfile.run("crawler.calculatepagerank()")
2262510 function calls in 136.006 CPU seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 136.006 136.006 <string>:1(<module>)
1 20.826 20.826 136.006 136.006 searchengine.py:179(calculatepagerank)
21 0.000 0.000 0.528 0.025 searchengine.py:27(dbcommit)
21 0.528 0.025 0.528 0.025 {method 'commit' of 'sqlite3.Connecti
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler
1339864 112.602 0.000 112.602 0.000 {method 'execute' of 'sqlite3.Connec
922600 2.050 0.000 2.050 0.000 {method 'fetchone' of 'sqlite3.Cursor'
1 0.000 0.000 0.000 0.000 {range}
So I optimized the function and came up with this:
def calculatepagerank2(self,iterations=20):
# clear out the current PageRank tables
self.con.execute("drop table if exists pagerank")
self.con.execute("create table pagerank(urlid primary key,score)")
self.con.execute("create index prankidx on pagerank(urlid)")
# initialize every url with a PageRank of 1.0
self.con.execute("insert into pagerank select rowid,1.0 from urllist")
self.dbcommit()
inlinks={}
numoutlinks={}
pagerank={}
for (urlid,) in self.con.execute("select rowid from urllist"):
inlinks[urlid]=[]
numoutlinks[urlid]=0
# Initialize pagerank vector with 1.0
pagerank[urlid]=1.0
# Loop through all the pages that link to this one
for (inlink,) in self.con.execute("select distinct fromid from link where toid=%d" % urlid):
inlinks[urlid].append(inlink)
# get number of outgoing links from a page
numoutlinks[urlid]=self.con.execute("select count(*) from link where fromid=%d" % urlid).fetchone()[0]
for i in range(iterations):
print "Iteration %d" % i
for urlid in pagerank:
pr=0.15
for link in inlinks[urlid]:
linkpr=pagerank[link]
linkcount=numoutlinks[link]
pr+=0.85*(linkpr/linkcount)
pagerank[urlid]=pr
for urlid in pagerank:
self.con.execute("update pagerank set score=%f where urlid=%d" % (pagerank[urlid],urlid))
self.dbcommit()
This function is 20 times faster (but uses a lot more memory for all the temporary dictionaries) because it avoids the unnecessary SQL queries in every iteration:
>>> cProfile.run("crawler.calculatepagerank2()")
64802 function calls in 6.950 CPU seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.004 0.004 6.950 6.950 <string>:1(<module>)
1 1.004 1.004 6.946 6.946 searchengine.py:207(calculatepagerank2
2 0.000 0.000 0.104 0.052 searchengine.py:27(dbcommit)
23065 0.012 0.000 0.012 0.000 {meth 'append' of 'list' objects}
2 0.104 0.052 0.104 0.052 {meth 'commit' of 'sqlite3.Connection
1 0.000 0.000 0.000 0.000 {meth 'disable' of '_lsprof.Profiler'
31298 5.809 0.000 5.809 0.000 {meth 'execute' of 'sqlite3.Connectio
10431 0.018 0.000 0.018 0.000 {method 'fetchone' of 'sqlite3.Cursor'
1 0.000 0.000 0.000 0.000 {range}
But is it possible to further reduce the number of SQL queries to speed up the function even more?