使用python删除word文档中的指定段落,顺便实现一下文档中的图片导出

2022-04-29 11:07:09 浏览数 (1)

代码语言:python 复制
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


from pathlib import Path
from docx import Document
import os

# Export the images embedded in a Word document
def extract_img_word(filename='', doc_path=''):
    """Extract the embedded image files of a .docx into *doc_path*.

    A .docx file is an ordinary zip archive, so it is opened with
    ``zipfile`` and every member whose name starts with
    ``word/media/image`` is extracted. The archive's internal folder
    structure is kept, so files end up under ``<doc_path>/word/media/``.

    Args:
        filename: Path of the .docx (zip) file to read.
        doc_path: Directory handed to ``ZipFile.extract`` as the target.
    """
    from zipfile import ZipFile

    with ZipFile(filename) as archive:
        # Select the image members first, then extract them in archive order.
        image_members = [
            member
            for member in archive.namelist()
            if member.startswith("word/media/image")
        ]
        for member in image_members:
            archive.extract(member, doc_path)
            

'''
pip install python-docx
https://python-docx.readthedocs.io/en/latest/
'''
# Create a sample document
def createWord(path='H:/temp/test.docx'):
    """Write a small sample .docx used to exercise the paragraph deletion.

    The document contains a title, a few headings and paragraphs, and one
    paragraph whose text includes the marker '需要删除' so that
    ``delWordContent`` has something to find.

    Args:
        path: Where to save the document. Defaults to the location the
            script originally hard-coded, so existing calls are unchanged.
    """
    document = Document()
    document.add_heading('Document Title', 0)
    document.add_paragraph('A plain paragraph having some')
    document.add_heading('Heading, level 1', level=1)
    # NOTE(review): the text says "level 1" but the heading is level 2;
    # kept as-is because it is only sample content.
    document.add_heading('Heading, level 1', level=2)
    # Marker paragraph: this one and everything after it is expected to be
    # removed by delWordContent.
    document.add_paragraph('以下段落需要删除')
    document.add_paragraph('A plain paragraph')
    document.add_paragraph('A plain paragraph 新段落')
    document.add_heading('Heading, level 2', level=2)
    document.save(path)
    
# Module-level call: runs whenever this file is executed or imported and
# writes the sample document to disk.
createWord()

# Remove a given paragraph from its document
def delete_paragraph(paragraph):
    """Detach *paragraph* from its parent element and invalidate the wrapper.

    Args:
        paragraph: An object exposing ``_element`` (with ``getparent()``)
            and ``_p``; presumably a python-docx ``Paragraph``.
    """
    element = paragraph._element
    parent = element.getparent()
    parent.remove(element)
    # Clear both references so the stale wrapper can no longer reach the
    # removed element.
    paragraph._p = None
    paragraph._element = None

def delWordContent(docx_file='', dest_file='', marker='需要删除'):
    """Cut a document off at the first paragraph containing *marker*.

    The first paragraph whose text contains *marker*, and every paragraph
    that follows it in ``doc.paragraphs``, is removed with
    ``delete_paragraph``. The result is saved to *dest_file* only when the
    marker was found; otherwise nothing is written.

    NOTE(review): only objects yielded by ``doc.paragraphs`` are removed;
    whether tables or other content after the marker are affected depends
    on python-docx - confirm if that matters.

    Args:
        docx_file: Path of the document to read.
        dest_file: Path the trimmed document is saved to.
        marker: Text that identifies the first paragraph to delete.
            Defaults to the value the script originally hard-coded.

    Returns:
        True if the marker was found and *dest_file* was saved, else False.
    """
    doc = Document(docx_file)
    found = False
    for p in doc.paragraphs:
        # Once the marker is seen, every later paragraph is deleted too.
        if not found and marker in p.text:
            found = True
        if found:
            delete_paragraph(p)
    if found:
        # Save under a new name so the input file is left untouched.
        doc.save(dest_file)
    return found

# Module-level call: runs whenever this file is executed or imported. It
# trims the sample document and writes test-new.docx only if the marker
# paragraph was found.
delWordContent(docx_file='H:/temp/test.docx',dest_file='H:/temp/test-new.docx')

def _mtime_ns_or_none(path):
    """Return *path*'s modification time in ns, or None if it does not exist."""
    try:
        return path.stat().st_mtime_ns
    except FileNotFoundError:
        return None


def testDel():
    """Trim every .docx directly under H:/ into H:/words, then drop the source.

    For each document, ``delWordContent`` is asked to write the trimmed
    copy to ``<parent>/words/<name>``. The source file is deleted only
    after a trimmed copy was actually written during this call:
    ``delWordContent`` saves nothing when the marker is absent, and
    removing the source in that case would lose the document.
    """
    dest_dir = 'words'
    # Materialise the listing first so deleting sources cannot disturb the
    # directory scan.
    for filename in list(Path('H:/').glob('*.docx')):
        print(str(filename))
        dest_path = filename.parent / dest_dir / filename.name
        # The save would fail if the destination folder did not exist yet.
        dest_path.parent.mkdir(parents=True, exist_ok=True)

        stamp_before = _mtime_ns_or_none(dest_path)
        delWordContent(docx_file=str(filename), dest_file=str(dest_path))
        stamp_after = _mtime_ns_or_none(dest_path)

        # A new or changed timestamp means this run wrote the output; a
        # stale file left by an earlier run does not count.
        if stamp_after is not None and stamp_after != stamp_before:
            filename.unlink()

0 人点赞