Java实现pdf转html

2022-05-11 10:18:09 浏览数 (1)

引入 pdf2dom 依赖(Maven):

代码语言:xml
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
            <version>1.8</version>
        </dependency>

测试代码:

代码语言:java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
import org.fit.pdfdom.PDFDomTreeConfig;
import org.junit.Test;

/**
 * Demonstrates converting a PDF file to HTML with Pdf2Dom ({@link PDFDomTree}).
 */
public class TestPDFDomTree {

    /**
     * Renders a range of pages of a PDF as an HTML string.
     *
     * <p>The stream {@code is} is not closed by this method; the caller owns it.
     *
     * @param is        stream supplying the PDF bytes
     * @param startPage first page to convert, passed to {@code PDFDomTree#setStartPage}
     *                  (NOTE(review): PDFBox text strippers document page numbers as
     *                  1-based — confirm the intended value with the caller)
     * @param endPage   last page to convert, passed to {@code PDFDomTree#setEndPage}
     * @param config    Pdf2Dom configuration (image/font handlers etc.)
     * @return the generated HTML markup
     * @throws IOException                  if the PDF cannot be loaded or rendered
     * @throws ParserConfigurationException if the underlying DOM parser cannot be created
     */
    public static String parseWithPdfDomTree(InputStream is, int startPage, int endPage, PDFDomTreeConfig config)
            throws IOException, ParserConfigurationException {
        // try-with-resources closes the document even when parsing throws,
        // which the previous explicit close() on the success path did not.
        try (PDDocument pdf = PDDocument.load(is)) {
            PDFDomTree parser = new PDFDomTree(config);
            parser.setStartPage(startPage);
            parser.setEndPage(endPage);
            Writer output = new StringWriter();
            parser.writeText(pdf, output);
            return output.toString();
        }
    }

    /**
     * Converts a sample PDF to HTML and writes the result to {@code /mnt/test.html}.
     * Images and fonts are handled by a save-to-directory handler targeting {@code /mnt/res/}.
     */
    @Test
    public void test_convert_pdf_to_html() throws Exception {
        PDFDomTreeConfig config = PDFDomTreeConfig.createDefaultConfig();
        config.setImageHandler(PDFDomTreeConfig.saveToDirectory(new File("/mnt/res/")));
        // Reuse the same handler for fonts so they are stored next to the images.
        config.setFontHandler(config.getImageHandler());

        String html;
        // Close the input file once conversion is done (it was previously leaked).
        try (InputStream in = new FileInputStream("/mnt/电子版.pdf")) {
            html = parseWithPdfDomTree(in, 0, 10, config);
        }
        // Fixed: the original referenced the undefined name `htmlOutput` here.
        FileUtils.write(new File("/mnt/test.html"), html, "utf-8");
    }
}

感谢您的认真阅读。

0 人点赞