C# 用 iTextSharp 将 PDF 转成文本

2021-11-02 16:05:25 浏览数 (1)

代码语言:C#
using System;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public class ParsingPDF {

/** Path of the input PDF file; Main assigns it from the first command-line argument. */
static string PDF;
/** Path of the output text file; Main assigns it from the second argument, or derives it from the PDF file name when only one argument is given. */
static string TEXT2;

/**
 * Parses the PDF using PRTokeniser: walks the content stream of every page
 * and writes each STRING token it finds to the output file, one per line.
 * @param src  the path to the original PDF file
 * @param dest the path to the resulting text file (created, or overwritten if it exists)
 */
public void parsePdf(String src, String dest)
{
    PdfReader reader = new PdfReader(src);
    try
    {
        // The using block flushes and closes the writer (and the underlying
        // file stream) even when tokenizing throws part-way through.
        using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
        {
            int pageCount = reader.NumberOfPages;
            // Pages are requested by number, from 1 up to and including pageCount.
            for (int pg = 1; pg <= pageCount; pg++)
            {
                // we can inspect the syntax of the imported page
                byte[] streamBytes = reader.GetPageContent(pg);
                PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                while (tokenizer.NextToken())
                {
                    // Only string tokens are written; every other token type is skipped.
                    if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                    {
                        output.WriteLine(tokenizer.StringValue);
                    }
                }
            }
        }
    }
    finally
    {
        // Release the reader's hold on the source PDF regardless of outcome.
        reader.Close();
    }
}

/**
 * Main method.
 * Expects one or two command-line arguments: the input PDF path and,
 * optionally, the output text path. With a single argument the output
 * name is the PDF's file name (without directory or extension) plus ".txt".
 * Prints a usage line and returns when the argument count is wrong;
 * any exception raised while parsing is reported as "ERROR: <message>".
 */
static void Main(string[] args)
{
    if (args.Length < 1 || args.Length > 2)
    {
        Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
        return;
    }
    else if (args.Length == 1)
    {
        PDF = args[0];
        // GetFileNameWithoutExtension drops the directory part as well,
        // so the default output file is a relative path.
        TEXT2 = Path.GetFileNameWithoutExtension(PDF) + ".txt";
    }
    else
    {
        PDF = args[0];
        TEXT2 = args[1];
    }

    try
    {
        // Stopwatch is monotonic, unlike subtracting two DateTime.Now readings.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

        ParsingPDF example = new ParsingPDF();
        example.parsePdf(PDF, TEXT2);

        timer.Stop();
        Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
    }
    catch (Exception ex)
    {
        Console.WriteLine("ERROR: " + ex.Message);
    }
} // Main

/**
 * Render listener that echoes text-rendering events to a writer:
 * a text block is wrapped as "<" ... ">" followed by a line break,
 * and every rendered text chunk inside it is wrapped as "<text>".
 * Image events are ignored.
 */
public class MyTextRenderListener : IRenderListener
{
    /** Destination writer for everything this listener emits. */
    protected StreamWriter output;

    /**
     * Creates a listener that writes to the given writer.
     * @param output the writer that receives the extracted text
     */
    public MyTextRenderListener(StreamWriter output)
    {
        this.output = output;
    }

    /** Opens a text block with "<". */
    public void BeginTextBlock() => output.Write("<");

    /** Closes a text block with ">" and ends the current line. */
    public void EndTextBlock() => output.WriteLine(">");

    /** Images carry no text, so nothing is written. */
    public void RenderImage(ImageRenderInfo renderInfo)
    {
    }

    /** Writes the chunk's text enclosed in "<" and ">". */
    public void RenderText(TextRenderInfo renderInfo)
    {
        output.Write("<");
        string chunk = renderInfo.GetText();
        output.Write(chunk);
        output.Write(">");
    }
} // class MyTextRenderListener
 
} // class ParsingPDF

0 人点赞