So if you have been following the last few posts you know I want to get data out of a PDF file (in this case a Bank of England Inflation Report). The python library PyPDF2 is good and will split a pdf into separate pages. No doubt, PyPDF2 will do a ton of other stuff to manipulate pdf files. However, I find the pdf file format difficult to understand, which limits how much I want to use a pdf python library. Instead, I have found that Inkscape converts PDFs to SVGs and that we can automate this on the command line.
This means I am in a position to give the latest version of a program which breaks up a PDF file into separate SVG files for each page. (You'll need Inkscape installed). Here it is...
# with thanks to user26294 at Stack Overflow
# https://stackoverflow.com/questions/490195/split-a-multi-page-pdf-file-into-multiple-pdf-files-with-python#answer-490203
from PyPDF2 import PdfFileWriter, PdfFileReader
def DecryptPdf(pdfFileReader,password):
    """Try to decrypt an encrypted PDF reader in place and report the outcome.

    Args:
        pdfFileReader: reader object exposing ``isEncrypted`` and
            ``decrypt(password)`` (a PyPDF2 ``PdfFileReader`` in this script).
        password: password to try.

    Nothing is returned and nothing is raised; the outcome is only printed,
    so a failed decryption does not stop the caller.
    """
    if not pdfFileReader.isEncrypted:
        print ('File not encrypted')
        return
    try:
        result = pdfFileReader.decrypt(password)
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print ('File decryption failed:' + str(e))
        return
    # decrypt() reports a wrong password through a falsy return value (0)
    # rather than an exception, so the result has to be checked explicitly.
    if result:
        print ('File decrypted')
    else:
        print ('File decryption failed: password did not match')
def SuffixFilename(fileName, suffix):
    """Return ``fileName`` with ``suffix`` inserted just before its extension.

    Args:
        fileName: path to a file, with or without a directory part.
        suffix: text to append to the file's base name.

    Returns:
        The new path: same directory and extension, base name + ``suffix``.
        Only the last extension is treated as the extension, so a name with
        several dots keeps all of them; a name with no extension simply
        gets the suffix appended.
    """
    import os.path
    directory, baseName = os.path.split(fileName)
    stem, extension = os.path.splitext(baseName)
    # os.path.join supplies the platform's separator and copes with an empty
    # directory part, instead of a hard-coded backslash.
    return os.path.join(directory, stem + suffix + extension)
def OutputPage(pdfFileNameSrc,pdfFileNamePage, pageNum):
    """Copy one page of a PDF into a new single-page PDF file.

    Args:
        pdfFileNameSrc: path of the source PDF, opened for binary reading.
        pdfFileNamePage: path of the PDF to create; opened with "wb", so an
            existing file is overwritten.
        pageNum: page index handed to ``getPage``.

    A decryption attempt with an empty password is made first via
    ``DecryptPdf``. Progress is printed once the page has been written.
    """
    # Both files are managed by ``with`` so they are closed even if PyPDF2
    # raises. The source stays open until the page has been written because
    # the writer pulls the page from the reader at write time.
    with open(pdfFileNameSrc, "rb") as pdfFileSrc:
        pdfFileReaderSrc = PdfFileReader(pdfFileSrc)
        DecryptPdf(pdfFileReaderSrc,'')
        pageOutput = PdfFileWriter()
        pageOutput.addPage(pdfFileReaderSrc.getPage(pageNum))
        with open(pdfFileNamePage, "wb") as outputStream:
            pageOutput.write(outputStream)
            print('written page%s' % pageNum)
def InkscapePdfToSvg(pdfFileName):
    """Convert a PDF file to SVG by running Inkscape on the command line.

    Args:
        pdfFileName: path of the PDF file to convert.

    Returns:
        The path of the SVG file Inkscape was asked to write: the input path
        with its final extension replaced by ``.svg``.

    Inkscape's exit status is not checked, so the returned path is not a
    guarantee that the file was produced.
    """
    import os.path
    import subprocess
    # Swap only the trailing extension. A plain str.replace would also touch
    # ".pdf" inside directory names, and would leave names such as "X.PDF"
    # unchanged, making the export target the input file itself.
    svgFileName = os.path.splitext(pdfFileName)[0] + ".svg"
    # NOTE(review): -z / -f / -l look like the Inkscape 0.9x options for
    # "no GUI", "input file" and "export plain SVG"; newer Inkscape releases
    # use a different command line -- confirm against the installed version.
    subprocess.run(['c:/Progra~1/Inkscape/Inkscape.exe',
                    '-z',
                    '-f', pdfFileName,
                    '-l', svgFileName])
    return svgFileName
if __name__ == "__main__":
    # Split the report into one PDF per page, then convert each page to SVG.
    # Raw string: in a normal literal the "\U" of "C:\Users" is read as the
    # start of a unicode escape, which is a SyntaxError in Python 3.
    pdfFileNameInflation = r"C:\Users\Simon\Downloads\pdf_skunkworks\inflation-report-may-2018.pdf"
    # The source is opened here only to read the page count; OutputPage
    # re-opens it for every page, so the handle can be released right away.
    with open(pdfFileNameInflation, "rb") as pdfFileInflation:
        pdfFileReaderInflation = PdfFileReader(pdfFileInflation)
        DecryptPdf(pdfFileReaderInflation,'')
        pageCount = pdfFileReaderInflation.numPages
    for i in range(pageCount):
        # Page files sit next to the source, named "<name>-page<i>.pdf".
        pdfFileNamePage=SuffixFilename(pdfFileNameInflation,"-page%s" % i)
        OutputPage(pdfFileNameInflation,pdfFileNamePage,i)
        print (InkscapePdfToSvg(pdfFileNamePage))
No comments:
Post a Comment