// Language: C# (iTextSharp PDF text-extraction example)
using System;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
public class ParsingPDF {
/** Path of the input PDF file; Main assigns it from the first command-line argument. */
static string PDF;
/** Path of the output text file; Main assigns it from the second argument, or derives it from the input file name when only one argument is given. */
static string TEXT2;
/**
* Parses the PDF using PRTokeniser and writes every string token found in
* each page's content stream to a text file, one token per line.
* The destination file is created, or overwritten if it already exists.
* @param src the path to the original PDF file
* @param dest the path to the resulting text file
*/
public void parsePdf(String src, String dest)
{
    PdfReader reader = new PdfReader(src);
    try
    {
        // 'using' flushes and closes the output file even when parsing throws.
        using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
        {
            int pageCount = reader.NumberOfPages;
            // Pages are addressed from 1 through NumberOfPages inclusive.
            for (int pg = 1; pg <= pageCount; pg++)
            {
                // we can inspect the syntax of the imported page
                byte[] streamBytes = reader.GetPageContent(pg);
                PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                while (tokenizer.NextToken())
                {
                    // Only string tokens are emitted; every other token type is skipped.
                    if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                    {
                        output.WriteLine(tokenizer.StringValue);
                    }
                }
            }
        }
    }
    finally
    {
        // Release the source PDF regardless of success or failure.
        reader.Close();
    }
}
/**
* Main method.
* Expects one or two command-line arguments: the input PDF path and,
* optionally, the output text path. When the output path is omitted it is
* the input file name with its directory and extension removed and ".txt"
* appended. Any exception raised while parsing is reported on the console
* as "ERROR: <message>" instead of propagating.
* @param args the command-line arguments
*/
static void Main(string[] args)
{
    if (args.Length < 1 || args.Length > 2)
    {
        Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
        return;
    }

    PDF = args[0];
    if (args.Length == 1)
    {
        // GetFileNameWithoutExtension also strips the directory part,
        // so the default output name is a bare file name.
        TEXT2 = Path.GetFileNameWithoutExtension(PDF) + ".txt";
    }
    else
    {
        TEXT2 = args[1];
    }

    try
    {
        // Stopwatch is monotonic, unlike DateTime.Now which can jump with clock adjustments.
        System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
        ParsingPDF example = new ParsingPDF();
        example.parsePdf(PDF, TEXT2);
        timer.Stop();
        Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
    }
    catch (Exception ex)
    {
        Console.WriteLine("ERROR: " + ex.Message);
    }
} // Main
public class MyTextRenderListener : IRenderListener
{
    /** The writer that receives the bracketed text output. */
    protected StreamWriter output;

    /**
    * Creates a RenderListener that writes the text it is given to a writer.
    * @param output the writer to which the information will be written
    */
    public MyTextRenderListener(StreamWriter output)
    {
        this.output = output;
    }

    /**
    * Writes one text chunk wrapped in angle brackets.
    * @param renderInfo the render info whose text is written
    */
    public void RenderText(TextRenderInfo renderInfo)
    {
        this.output.Write("<");
        // Fetched only after the opening bracket has been written, as before.
        string chunk = renderInfo.GetText();
        this.output.Write(chunk);
        this.output.Write(">");
    }

    /** Images are ignored; nothing is written for them. */
    public void RenderImage(ImageRenderInfo renderInfo)
    {
    }

    /** Writes the opening bracket of a text block. */
    public void BeginTextBlock()
    {
        this.output.Write("<");
    }

    /** Writes the closing bracket of a text block and ends the line. */
    public void EndTextBlock()
    {
        this.output.WriteLine(">");
    }
} // class
} // class ParsingPDF