Currently it takes about 3 minutes to run through a single 53-page Word document. Hopefully someone has advice on speeding up the process.
Code:
import io
import os
import pprint
import re
from collections import defaultdict
from collections import namedtuple
from glob import glob

import win32com.client as win32
# Dump the numbered headings (and the bold paragraphs that follow them) of every
# .docx file in the current directory into rawsort.txt.
#
# `doccat` collects, in document order:
#   * (style name, section number, remaining text) for every paragraph outside
#     a table whose style is "Heading <digit>" and whose text starts with a
#     section number;
#   * (current heading style, paragraph text) for every other paragraph outside
#     a table that is entirely bold.
raw_files = glob('*.docx')
word = win32.gencache.EnsureDispatch('Word.Application')
word.Visible = False
doccat = []

# Compile once instead of on every paragraph.
# NOTE(review): the '.' in the number pattern is unescaped and therefore
# matches ANY character, not just a literal dot; it probably should be r'\.'.
# Left unchanged so the set of matched headings stays the same -- confirm.
number_re = re.compile('(?P<number>(([1-3]*[A-D]*[0-9]*)(.[1-3]*[0-9])+))')
heading_re = re.compile(r'Heading \d')

# Style name of the most recent numbered heading. Initialised so that a bold
# paragraph appearing before the first heading does not raise NameError.
style = None

with io.open("rawsort.txt", "w+", encoding="utf-8") as oFile:  # text dump
    try:
        for f in raw_files:
            # Word resolves relative paths against its own working directory,
            # not the script's, so hand it an absolute path. Use the returned
            # document rather than ActiveDocument, which may be something else.
            doc = word.Documents.Open(os.path.abspath(f))
            try:
                doc.ConvertNumbersToText()
                print(doc.Paragraphs.Count)

                # Iterate the collection instead of calling doc.Paragraphs(x)
                # for each index: every indexed lookup is a separate COM call
                # that Word has to resolve from scratch, which is typically
                # what makes this loop slow on long documents.
                for para in doc.Paragraphs:
                    # Fetch the Range once; every attribute access on a COM
                    # object is a cross-process round trip.
                    rng = para.Range
                    if rng.Tables.Count > 0:
                        continue  # paragraphs inside tables are ignored

                    style_obj = para.Style
                    style_name = (style_obj.NameLocal
                                  if style_obj is not None else u'')

                    if heading_re.match(style_name):
                        text = rng.Text
                        results = number_re.match(text)
                        if results is not None:
                            number = results.group('number')
                            doccat.append(
                                (style_name, number, text[len(number):]))
                            style = style_name
                            continue

                    # Font.Bold is a Long: Word reports "true" as -1 and a
                    # mixed range as wdUndefined (9999999), so `== True`
                    # never matched. Accept -1 (and 1, in case a binding
                    # converts it to a Python bool).
                    if rng.Font.Bold in (-1, 1):
                        # list.append takes a single item; store one tuple.
                        # The paragraph text is stored rather than the COM
                        # object, whose repr is useless in a text dump.
                        doccat.append((style, rng.Text))
            finally:
                # ConvertNumbersToText modified the document in memory;
                # close it without saving so the file on disk is untouched.
                doc.Close(False)
    finally:
        # Only shut Word down when nothing else is open in this instance, so
        # a session the user already had running is not killed.
        if word.Documents.Count == 0:
            word.Quit()

    # Same text as unicode(doccat) on Python 2, and also valid on Python 3.
    oFile.write(u"%r" % (doccat,))
The for loop over the paragraphs obviously takes most of the time. Is there some way of identifying and appending the relevant paragraphs without going through every paragraph?