package pdfainspector;
import java.io.IOException;
import nu.xom.Attribute;
import nu.xom.Element;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
 * Convert text in a PDF to a XOM XML element.
 * @author schiele1
 */
public class TextExtractor {

    /**
     * Given an iText PDF Reader, extract text from the PDF and store it in a
     * XOM XML element.
     * <p>
     * The returned root element is named {@code Text}. For every page it gets
     * one {@code Plaintext} child, in page order, carrying a 1-based
     * {@code Page} attribute and the page's text with NUL characters removed.
     * <p>
     * Extraction is best-effort: if a page cannot be read, a warning is written
     * to {@code System.err} and that page is represented by an empty
     * {@code Plaintext} element, so one bad page does not abort the document.
     *
     * @param reader A reader for the given PDF.
     * @return A XOM element containing the text.
     */
    public static Element extractToXML(PdfReader reader){
        Element root = new Element("Text");

        // Set up iText's PDF text extraction tools.
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        int pageCount = reader.getNumberOfPages();

        // Make an element for each page of text, labeled with the page number.
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            String result = extractPageText(parser, pageNumber);

            // If there's text on the page, label it and add it to the root.
            if (result == null) {
                continue;
            }

            Element page = new Element("Plaintext");
            page.addAttribute(new Attribute("Page", Integer.toString(pageNumber)));
            // Drop NUL characters before handing the text to XOM; NUL is not a
            // legal XML character, so it is presumably rejected by appendChild.
            page.appendChild(result.replace("\0", ""));
            root.appendChild(page);
        }
        return root;
    }

    /**
     * Extract the text of a single page with a fresh extraction strategy.
     * <p>
     * A new strategy is created for every call so that a failure on one page
     * can never leak text collected for a previously processed page.
     *
     * @param parser     The content parser bound to the PDF being read.
     * @param pageNumber The 1-based number of the page to extract.
     * @return The page's text as reported by the strategy, or an empty string
     *         if the page could not be read.
     */
    private static String extractPageText(PdfReaderContentParser parser, int pageNumber) {
        try {
            // There are several different extraction strategies available;
            // LocationTextExtractionStrategy is an alternative to this one.
            TextExtractionStrategy strategy =
                    parser.processContent(pageNumber, new SimpleTextExtractionStrategy());
            return strategy.getResultantText();
        } catch (IOException e) {
            // Best-effort: report the problem and keep going with an empty page.
            System.err.println("TextExtractor: could not extract text from page "
                    + pageNumber + ": " + e);
            return "";
        }
    }
}