引入pdf2dom
<dependency>
<groupId>net.sf.cssbox</groupId>
<artifactId>pdf2dom</artifactId>
<version>1.8</version>
</dependency>
测试代码:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
import org.fit.pdfdom.PDFDomTreeConfig;
import org.junit.Test;
/**
 * Example test that converts a PDF document to HTML using pdf2dom's {@link PDFDomTree}.
 */
public class TestPDFDomTree {

    /**
     * Loads a PDF from the given stream and renders the requested page range through
     * {@link PDFDomTree}, returning whatever the parser wrote as a string.
     *
     * <p>The stream itself is not closed here; the caller owns it.
     *
     * @param is        stream positioned at the start of the PDF data
     * @param startPage value forwarded to {@code PDFDomTree.setStartPage}
     *                  (NOTE(review): PDFBox page numbers are presumably 1-based — confirm)
     * @param endPage   value forwarded to {@code PDFDomTree.setEndPage}
     * @param config    pdf2dom configuration passed to the {@code PDFDomTree} constructor
     * @return the text produced by {@code PDFDomTree.writeText}
     * @throws IOException                  if loading or processing the document fails
     * @throws ParserConfigurationException if the DOM parser cannot be set up
     */
    public static String parseWithPdfDomTree(InputStream is, int startPage, int endPage, PDFDomTreeConfig config)
            throws IOException, ParserConfigurationException {
        // try-with-resources guarantees the document is closed even when parsing throws.
        try (PDDocument pdf = PDDocument.load(is)) {
            PDFDomTree parser = new PDFDomTree(config);
            parser.setStartPage(startPage);
            parser.setEndPage(endPage);
            Writer output = new StringWriter();
            parser.writeText(pdf, output);
            return output.toString();
        }
    }

    /**
     * Converts {@code /mnt/电子版.pdf} with page bounds 0..10 and writes the result to
     * {@code /mnt/test.html}. The image handler is created via
     * {@code PDFDomTreeConfig.saveToDirectory} for {@code /mnt/res/} and is reused as the
     * font handler.
     */
    @Test
    public void test_convert_pdf_to_html() throws Exception {
        PDFDomTreeConfig config = PDFDomTreeConfig.createDefaultConfig();
        config.setImageHandler(PDFDomTreeConfig.saveToDirectory(new File("/mnt/res/")));
        config.setFontHandler(config.getImageHandler());

        String html;
        // Close the input file as soon as the conversion is finished.
        try (InputStream pdfStream = new FileInputStream("/mnt/电子版.pdf")) {
            html = parseWithPdfDomTree(pdfStream, 0, 10, config);
        }
        // The original referenced an undefined name here; the converted markup is in `html`.
        FileUtils.write(new File("/mnt/test.html"), html, "utf-8");
    }
}
感谢您的认真阅读。