Skip to content

Commit

Permalink
replace pdfsharp with itextsharp to parse pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
tonyqus committed Sep 6, 2015
1 parent 525fb17 commit 56742ce
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 22 deletions.
12 changes: 9 additions & 3 deletions Toxy.Test/PDFParserTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,20 @@ public void TestParseToxyDocumentFromPDF()
Assert.AreEqual("du 18-22 Mars 2003", results[3]);
}
[Test]
public void TestParsePlainTextFromPDF2()
public void TestParsePlainTextFromSample5()
{
string path = TestDataSample.GetPdfPath("Sample5.PDF");
var parser = new PDFTextParser(new ParserContext(path));
string result = parser.Parse();
Assert.IsTrue(result.StartsWith("Philadelphia, Atlanta, Dallas, San Diego, and New Orleans."));


}
/// <summary>
/// Runs <c>PDFTextParser.Parse()</c> against the
/// "Word97-2007BinaryFileFormat(doc)Specification.pdf" sample and checks
/// that the parser completes and actually yields text.
/// </summary>
[Test]
public void TestReadBigPDFFile()
{
    string path = TestDataSample.GetPdfPath("Word97-2007BinaryFileFormat(doc)Specification.pdf");
    var parser = new PDFTextParser(new ParserContext(path));

    string result = parser.Parse();

    // The previous Assert.IsTrue(true) could never fail, so a parser that
    // silently returned null or an empty string would still pass. Assert on
    // the extracted text so the test verifies something about the output.
    Assert.IsFalse(string.IsNullOrEmpty(result), "Expected non-empty text extracted from the PDF");
}
}
}
22 changes: 11 additions & 11 deletions ToxyFramework/Parsers/PDFDocumentParser.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
using PdfSharp.Pdf;
using PdfSharp.Pdf.Content;
using PdfSharp.Pdf.IO;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace Toxy.Parsers
{
public class PDFDocumentParser: IDocumentParser
public class PDFDocumentParser : IDocumentParser
{
public PDFDocumentParser(ParserContext context)
{
Expand All @@ -21,22 +20,23 @@ public ToxyDocument Parse()
throw new FileNotFoundException("File " + Context.Path + " is not found");

ToxyDocument rdoc = new ToxyDocument();
using (Stream stream = File.OpenRead(Context.Path))
ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();

using (PdfDocument doc = PdfReader.Open(stream, PdfDocumentOpenMode.ReadOnly))
using (PdfReader reader = new PdfReader(this.Context.Path))
{
for (int i = 0; i < doc.PageCount; i++)

for (int i = 1; i <= reader.NumberOfPages; i++)
{
var texts = doc.Pages[i].ExtractText();
foreach (var text in texts)
string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);
string[] theLines = thePage.Split('\n');
foreach (var theLine in theLines)
{
ToxyParagraph para = new ToxyParagraph();
para.Text = text;
para.Text = theLine;
rdoc.Paragraphs.Add(para);
}
}
}

return rdoc;
}
public ParserContext Context
Expand Down
17 changes: 9 additions & 8 deletions ToxyFramework/Parsers/PDFTextParser.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
using PdfSharp.Pdf;
using PdfSharp.Pdf.IO;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace Toxy.Parsers
{
public class PDFTextParser:ITextParser
public class PDFTextParser : ITextParser
{
public PDFTextParser(ParserContext context)
{
Expand All @@ -18,19 +18,20 @@ public string Parse()
if (!File.Exists(Context.Path))
throw new FileNotFoundException("File " + Context.Path + " is not found");

using (PdfDocument reader = PdfReader.Open(this.Context.Path, PdfDocumentOpenMode.ReadOnly))
using (PdfReader reader = new PdfReader(this.Context.Path))
{
StringBuilder text = new StringBuilder();

for (int i = 0; i < reader.PageCount; i++)
for (int i = 1; i <= reader.NumberOfPages; i++)
{
var lines = reader.Pages[i].ExtractWholeText();
text.AppendLine(lines);
ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);
text.AppendLine(thePage);
}
return text.ToString();
}
}

public ParserContext Context { get; set; }
}
}
}
1 change: 1 addition & 0 deletions ToxyFramework/packages.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
<packages>
<package id="DocumentFormat.OpenXmlSDK" version="2.0" targetFramework="net40" />
<package id="HtmlAgilityPack" version="1.4.9" targetFramework="net40" />
<package id="iTextSharp" version="5.5.6" targetFramework="net40" />
<package id="LumenWorks.Framework.IO" version="3.8.0" targetFramework="net40" />
<package id="NPOI" version="2.1.3.1" targetFramework="net40" />
<package id="NUnit" version="2.6.4" targetFramework="net40" />
Expand Down

0 comments on commit 56742ce

Please sign in to comment.