python pdf

2020-01-08 17:01:54 浏览数 (1)

代码语言:javascript复制
# 从pdf中读取文本
# 写pdf
# 加密解密pdf
# 合并pdf,加水印
代码语言:javascript复制
# pip install PyPDF2
%cd D:\python全站\office
import PyPDF2
代码语言:javascript复制
D:\python全站\office
代码语言:javascript复制
# Open coop.pdf in binary mode and hand the file object to a PyPDF2 reader.
pdf_obj = open('coop.pdf', 'rb')
pdf = PyPDF2.PdfFileReader(pdf_obj)
# Page count of the document (displayed as the cell's result).
pdf.numPages
代码语言:javascript复制
3
代码语言:javascript复制
# Fetch the page at index 0, i.e. the first page of the reader opened above.
page = pdf.getPage(0)
代码语言:javascript复制
page.extractText()  # extract the text of this page
代码语言:javascript复制
'nn n n1\n1nN¥n nde8ug wordn nde8ug wordn nde8ug wordn nde8ug wordn nnn n nn nde8ug wordn nde8ug wordn nde8ug wordn nde8ug wordn n nnn n nn nde8ug wordn nde8ug wordn nde8ug wordn nde8ug wordn n n'
代码语言:javascript复制
# 提取中文 pip install pdfminer3k  #支持中文
from pdfminer.pdfinterp import PDFResourceManager, process_pdf # 资源管理
from pdfminer.converter import TextConverter  # 文本转换
from pdfminer.layout import LAParams #布局
from io import StringIO  # 生成临时文件

def convert_pdf(path):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        path: Path of the PDF file to read (opened in binary mode).

    Returns:
        Everything the ``TextConverter`` wrote into the in-memory buffer
        while ``process_pdf`` processed the file.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer, the converter and the file are all released even when
    # opening or parsing the PDF raises.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the text before the buffer is closed by the ``with`` block.
        return retstr.getvalue()
代码语言:javascript复制
# Extract the whole document as a single string.
s = convert_pdf('coop.pdf')
# Split the text into per-page chunks: the separator is a newline followed by
# a form feed ('\x0c'). The backslashes are required; 'nx0c' would look for
# the literal characters n, x, 0, c and never match.
s.split('\n\x0c')
代码语言:javascript复制
['测试语句 nn第 1 页 nnde8ug word nnde8ug word nnde8ug word nnde8ug word nn测试语句 nn第一页 nnde8ug word nnde8ug word nnde8ug word nnde8ug word nn测试语句 nn第一页 nnde8ug word nnde8ug word nnde8ug word nnde8ug word nn n n n n ',
 '测试语句 nn第 2 页 nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nnde8ug word nn n n n n ',
 'de8ug word nn测试语句 nn第 3 页 nnde8ug word nnde8ug word nnde8ug word nnde8ug word nn n n ',
 '']
代码语言:javascript复制
# Write a PDF: take the second page of the PDF opened earlier and put it in a new PDF
pdf_writer = PyPDF2.PdfFileWriter()
page = pdf.getPage(1)
pdf_writer.addPage(page)
代码语言:javascript复制
# Save the pages collected in pdf_writer to coop-1.pdf (binary write mode).
with open('coop-1.pdf', 'wb') as f:
    pdf_writer.write(f)
代码语言:javascript复制
pdf_obj.close()  # release the file handle that was opened on coop.pdf
代码语言:javascript复制
# Encrypt a PDF: copy every page of coop.pdf into a writer, set a password,
# and save the result as coop-s.pdf.
with open('coop.pdf', 'rb') as f_in:
    pdf = PyPDF2.PdfFileReader(f_in)
    pdf_writer = PyPDF2.PdfFileWriter()
    # Copy the pages one by one, in order.
    for page_num in range(pdf.numPages):
        pdf_writer.addPage(pdf.getPage(page_num))
    # 'hicoop' is the password; the decryption example uses the same value.
    pdf_writer.encrypt('hicoop')
    # The output is written while the input file is still open.
    with open('coop-s.pdf', 'wb') as f_out:
        pdf_writer.write(f_out)
代码语言:javascript复制
# Decrypt: open the encrypted coop-s.pdf and unlock it with the password.
with open('coop-s.pdf', 'rb') as f_in:
    pdf = PyPDF2.PdfFileReader(f_in)
    # Prints whether the file is encrypted (the recorded output is True).
    print(pdf.isEncrypted)
    pdf.decrypt('hicoop')
    pdf.getPage(0)  # pages can only be accessed normally after decryption
代码语言:javascript复制
True
代码语言:javascript复制
# Merge PDFs / add a watermark: stamp one watermark page onto every page of
# coop.pdf and save the result as coop-watermarked.pdf.
#
# The watermark must come from a file other than the output: opening the same
# path with 'wb' truncates it, which destroys the watermark source.
# NOTE(review): 'watermark.pdf' is an assumed name for the one-page watermark
# PDF -- point it at the real watermark file.
with open('coop.pdf', 'rb') as f_in, open('watermark.pdf', 'rb') as f_w:
    pdf = PyPDF2.PdfFileReader(f_in)
    pdf_w = PyPDF2.PdfFileReader(f_w)
    # The same watermark page is merged into every page, so fetch it once.
    watermark_page = pdf_w.getPage(0)

    pdf_write = PyPDF2.PdfFileWriter()
    for page_num in range(pdf.numPages):
        page = pdf.getPage(page_num)
        page.mergePage(watermark_page)
        pdf_write.addPage(page)
    # Write while both input files are still open.
    with open('coop-watermarked.pdf', 'wb') as f_out:
        pdf_write.write(f_out)
代码语言:javascript复制
---------------------------------------------------------------------------

OSError                                   Traceback (most recent call last)

<ipython-input-39-b87325251ec9> in <module>()
      3     with open('coop-watermarked.pdf', 'rb') as f_w:
      4         pdf = PyPDF2.PdfFileReader(f_in)
----> 5         pdf_w = PyPDF2.PdfFileReader(f_w)
      6 
      7         pdf_write = PyPDF2.PdfFileWriter()

c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in __init__(self, stream, strict, warndest, overwriteWarnings)
   1082             stream = BytesIO(b_(fileobj.read()))
   1083             fileobj.close()
-> 1084         self.read(stream)
   1085         self.stream = stream
   1086 

c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in read(self, stream)
   1687         if debug: print(">>read", stream)
   1688         # start at the end:
-> 1689         stream.seek(-1, 2)
   1690         if not stream.tell():
   1691             raise utils.PdfReadError('Cannot read an empty file')

OSError: [Errno 22] Invalid argument

0 人点赞