package pdfainspector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import nu.xom.Attribute;
import nu.xom.Element;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfNumber;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfString;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.MarkedContentRenderFilter;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
 * Convert tag data in a PDF to a XOM XML element. This is based loosely on the
 * TaggedPdfReaderTool from iText 5.1.3, and very heavily modified to include
 * desired features, such as page numbers, attributes, and XOM elements.
 * <p>
 * The class keeps no static state: everything an extraction needs (notably the
 * page-number index) is passed down through method parameters.
 * @author schiele1
 */
public class TagExtractor {

    /** Names of the entries copied from a tag's attribute dictionaries, in output order. */
    private static final String[] COPIED_ATTRIBUTES = {
        "Summary", "Scope", "Headers", "RowSpan", "ColSpan"
    };

    /**
     * Given an iText PDF Reader, extract tag data from the PDF and store it in
     * a XOM XML element.
     * @param reader A reader for the given PDF.
     * @return A XOM element named "tags" containing the tag data. It has no
     * children if the catalog has no structure tree root dictionary.
     */
    public static Element extractToXML(PdfReader reader) {
        Element root = new Element("tags");

        // Find the root of the tag structure tree. getAsDict yields null both
        // when the key is missing and when its value is not a dictionary.
        PdfDictionary catalog = reader.getCatalog();
        PdfDictionary structTree = catalog.getAsDict(PdfName.STRUCTTREEROOT);
        if (structTree == null) {
            return root;
        }

        // Walk the page tree once so each tag's page number is a map lookup.
        Map<PdfDictionary, Integer> pageNumbers = indexPages(catalog);

        // Parse the tag tree into XOM elements then add them to the root.
        for (Element tag : parseChild(structTree.getDirectObject(PdfName.K), pageNumbers)) {
            root.appendChild(tag);
        }
        return root;
    }

    /**
     * Wrapper function for parsing a PDF Object, passes it to either the array
     * or the dictionary parser and returns the list those parsers output.
     * @param child The PdfObject to be parsed (may be null).
     * @param pageNumbers Page dictionary to 1-based page number index.
     * @return The list of XOM elements representing that object; empty (never
     * null) if the object is neither an array nor a dictionary.
     */
    private static List<Element> parseChild(PdfObject child,
            Map<PdfDictionary, Integer> pageNumbers) {
        if (child instanceof PdfArray) {
            return parseArray((PdfArray) child, pageNumbers);
        }
        if (child instanceof PdfDictionary) {
            return parseDictionary((PdfDictionary) child, pageNumbers);
        }
        return new ArrayList<Element>();
    }

    /**
     * Parse each object in the given PdfArray into a list of XOM elements,
     * then append them all to a master list of elements representing the array.
     * @param array The PdfArray to be parsed (may be null).
     * @param pageNumbers Page dictionary to 1-based page number index.
     * @return A list of XOM elements representing the combination of every
     * PdfObject in the array.
     */
    private static List<Element> parseArray(PdfArray array,
            Map<PdfDictionary, Integer> pageNumbers) {
        List<Element> tags = new ArrayList<Element>();
        if (array == null) {
            return tags;
        }
        for (int i = 0; i < array.size(); i++) {
            tags.addAll(parseChild(array.getDirectObject(i), pageNumbers));
        }
        return tags;
    }

    /**
     * A dictionary will either directly contain tag data, or it will contain
     * references to other objects which may contain the data. This is where
     * the bulk of the parsing work is done.
     * @param dict The PdfDictionary to be parsed (may be null).
     * @param pageNumbers Page dictionary to 1-based page number index.
     * @return A list of elements corresponding to the tag data contained in
     * the dictionary and/or its children.
     */
    private static List<Element> parseDictionary(PdfDictionary dict,
            Map<PdfDictionary, Integer> pageNumbers) {
        if (dict == null) {
            return new ArrayList<Element>();
        }

        // Resolve K once; both branches below need the resolved object, since
        // an unresolved indirect reference is neither an array nor a dictionary.
        PdfObject kids = dict.getDirectObject(PdfName.K);

        // If the dict is not a tag, we need to dig deeper into it to find
        // the tag data we need.
        PdfName tagString = dict.getAsName(PdfName.S);
        if (tagString == null) {
            return parseChild(kids, pageNumbers);
        }

        // Decode the tag name and make a XOM element with that name.
        String tagDecode = PdfName.decodeName(tagString.toString());
        Element tag = new Element(fixTagName(tagDecode));

        // Fetch the tag attributes (including page numbers and alt
        // text), and add them to the tag element.
        for (Attribute attribute : extractAttributes(dict, pageNumbers)) {
            tag.addAttribute(attribute);
        }

        // Then, read in the actual contents of the tag. Text is only read
        // when the tag itself names a page.
        PdfDictionary page = dict.getAsDict(PdfName.PG);
        if (page != null) {
            String contents = parseTag(tagDecode, kids, page);
            if (contents != null) {
                tag.appendChild(sanitize(contents));
            }
        }

        // If the tag has children, we need to parse them, too.
        for (Element element : parseChild(kids, pageNumbers)) {
            tag.appendChild(element);
        }

        List<Element> tags = new ArrayList<Element>();
        tags.add(tag);
        return tags;
    }

    /**
     * Taken from iText's TaggedPdfReaderTool, this renders tag names into an
     * XML-compatible format. Unlike the iText version, a colon is treated as
     * an illegal character, because the result is used as a XOM element name
     * in no namespace, which must be a non-colonized name. An empty name is
     * rendered as "_" for the same reason.
     * @param tag The tag to format.
     * @return A non-empty string representing the tag name: an illegal first
     * character becomes '_', any other illegal character becomes '-'.
     */
    private static String fixTagName(String tag) {
        if (tag.length() == 0) {
            return "_";
        }
        StringBuilder sb = new StringBuilder(tag.length());
        for (int k = 0; k < tag.length(); ++k) {
            char c = tag.charAt(k);
            boolean nameStart =
                (c >= 'A' && c <= 'Z')
                || c == '_'
                || (c >= 'a' && c <= 'z')
                || (c >= '\u00c0' && c <= '\u00d6')
                || (c >= '\u00d8' && c <= '\u00f6')
                || (c >= '\u00f8' && c <= '\u02ff')
                || (c >= '\u0370' && c <= '\u037d')
                || (c >= '\u037f' && c <= '\u1fff')
                || (c >= '\u200c' && c <= '\u200d')
                || (c >= '\u2070' && c <= '\u218f')
                || (c >= '\u2c00' && c <= '\u2fef')
                || (c >= '\u3001' && c <= '\ud7ff')
                || (c >= '\uf900' && c <= '\ufdcf')
                || (c >= '\ufdf0' && c <= '\ufffd');
            boolean nameMiddle =
                c == '-'
                || c == '.'
                || (c >= '0' && c <= '9')
                || c == '\u00b7'
                || (c >= '\u0300' && c <= '\u036f')
                || (c >= '\u203f' && c <= '\u2040')
                || nameStart;
            if (k == 0) {
                if (!nameStart) {
                    c = '_';
                }
            } else if (!nameMiddle) {
                c = '-';
            }
            sb.append(c);
        }
        return sb.toString();
    }

    /**
     * Use iText's text parsing tools to read the text inside the given tag.
     * The page's content is run through a filter that keeps only the text
     * belonging to the tag's marked-content ID.
     * @param tag The tag type being read (currently unused; kept so the
     * signature mirrors the iText tool this is derived from).
     * @param object The actual tag object we are parsing (the "K" element of
     * the parent PdfDictionary).
     * @param page The dictionary representing the page on which the tag
     * starts (may be null, in which case a numeric MCID yields no text).
     * @return A string containing the text within the given tag; the empty
     * string if nothing could be read.
     */
    private static String parseTag(String tag, PdfObject object, PdfDictionary page) {
        // If object is a number, then it is the Marked Content ID of the tag
        // we're looking for, and we can jump to that tag on the page.
        if (object instanceof PdfNumber) {
            if (page == null) {
                // Without a page there is no content stream to search.
                return "";
            }
            PdfNumber mcid = (PdfNumber) object;
            // The filter will only search for text corresponding to the MCID.
            RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
            TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            FilteredTextRenderListener listener =
                new FilteredTextRenderListener(strategy, filter);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
            try {
                processor.processContent(PdfReader.getPageContent(page),
                        page.getAsDict(PdfName.RESOURCES));
            } catch (IOException e) {
                // Best effort: an unreadable page contributes no text rather
                // than aborting the whole extraction.
                return "";
            }
            return listener.getResultantText();
        }

        // If object is an array, we can search for tags within each element.
        // The pieces are joined with newlines (no trailing newline).
        if (object instanceof PdfArray) {
            PdfArray arr = (PdfArray) object;
            int n = arr.size();
            StringBuilder text = new StringBuilder();
            for (int i = 0; i < n; i++) {
                text.append(parseTag(tag, arr.getPdfObject(i), page));
                if (i < n - 1) {
                    text.append("\n");
                }
            }
            return text.toString();
        }

        // If it's a dictionary, we can simply parse its MCID element. The
        // dictionary's own Pg entry takes precedence; when it has none, the
        // page of the enclosing tag is used instead of passing null down.
        if (object instanceof PdfDictionary) {
            PdfDictionary mcr = (PdfDictionary) object;
            PdfDictionary mcrPage = mcr.getAsDict(PdfName.PG);
            if (mcrPage == null) {
                mcrPage = page;
            }
            return parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcrPage);
        }

        // Anything else (including null) carries no text.
        return "";
    }

    /**
     * Build an index from page dictionary to 1-based page number by walking
     * the page tree under the catalog's Pages entry once.
     * @param catalog The document catalog.
     * @return An identity-keyed map of page dictionaries to page numbers;
     * empty if the catalog has no usable Pages dictionary.
     */
    private static Map<PdfDictionary, Integer> indexPages(PdfDictionary catalog) {
        // Keys are compared by identity: a tag's Pg entry is matched against
        // the very dictionary object found in the page tree.
        Map<PdfDictionary, Integer> pageNumbers =
            new IdentityHashMap<PdfDictionary, Integer>();
        PdfDictionary pagesRoot = catalog.getAsDict(PdfName.PAGES);
        if (pagesRoot != null) {
            collectPages(pagesRoot.getAsArray(PdfName.KIDS), 0, pageNumbers);
        }
        return pageNumbers;
    }

    /**
     * Recursively number the Page dictionaries below one Kids array. We
     * recurse because Pages dictionaries can be nested.
     * @param kids The Kids array to scan (may be null).
     * @param count The number of pages already counted before this array.
     * @param pageNumbers The index being filled in.
     * @return The number of pages counted so far, including this array.
     */
    private static int collectPages(PdfArray kids, int count,
            Map<PdfDictionary, Integer> pageNumbers) {
        if (kids == null) {
            return count;
        }
        for (int i = 0; i < kids.size(); i++) {
            PdfDictionary child = kids.getAsDict(i);
            if (child == null) {
                // Not a dictionary: neither a page nor a subtree.
                continue;
            }
            PdfName type = child.getAsName(PdfName.TYPE);
            if (PdfName.PAGE.equals(type)) {
                count++;
                // Keep the first number seen should a page be listed twice.
                if (!pageNumbers.containsKey(child)) {
                    pageNumbers.put(child, Integer.valueOf(count));
                }
            } else if (PdfName.PAGES.equals(type)) {
                count = collectPages(child.getAsArray(PdfName.KIDS), count, pageNumbers);
            }
        }
        return count;
    }

    /**
     * Look up the page number of a given page dict.
     * @param page The page dictionary whose number we want to know (may be null).
     * @param pageNumbers Page dictionary to 1-based page number index.
     * @return The page number, or zero if it is not known.
     */
    private static int getPage(PdfDictionary page, Map<PdfDictionary, Integer> pageNumbers) {
        if (page == null) {
            return 0;
        }
        Integer number = pageNumbers.get(page);
        return number == null ? 0 : number.intValue();
    }

    /**
     * Search a tag dictionary for attributes and return a list of them. The
     * list always starts with "Page"; "Alt" and "ID" follow when present,
     * then the entries copied from the tag's attribute dictionaries.
     * @param dict The tag dictionary to search.
     * @param pageNumbers Page dictionary to 1-based page number index.
     * @return A list of all the attributes found.
     */
    private static List<Attribute> extractAttributes(PdfDictionary dict,
            Map<PdfDictionary, Integer> pageNumbers) {
        List<Attribute> attributes = new ArrayList<Attribute>();

        // To find the page number, get the page dictionary and look it up.
        int pageNumber = getPage(dict.getAsDict(PdfName.PG), pageNumbers);
        attributes.add(new Attribute("Page", Integer.toString(pageNumber)));

        // If there's an alt-text, get it (decoded as a PDF text string).
        PdfObject alt = dict.getDirectObject(PdfName.ALT);
        if (alt != null) {
            attributes.add(new Attribute("Alt", stripIllegalXmlChars(textValue(alt))));
        }

        // Some tags, such as table elements, may have IDs. The ID is left in
        // its raw toString() form, the same form in which the strings inside
        // a "Headers" array are rendered.
        PdfObject id = dict.getDirectObject(PdfName.ID);
        if (id != null) {
            attributes.add(new Attribute("ID", stripIllegalXmlChars(id.toString())));
        }

        // The rest of the attributes are contained in a dictionary, or in an
        // array holding several such dictionaries. We can pull out the ones
        // we want here.
        PdfObject a = dict.getDirectObject(PdfName.A);
        if (a instanceof PdfDictionary) {
            copyAttributes((PdfDictionary) a, attributes);
        } else if (a instanceof PdfArray) {
            PdfArray list = (PdfArray) a;
            for (int i = 0; i < list.size(); i++) {
                PdfDictionary entry = list.getAsDict(i);
                // Non-dictionary array members are skipped.
                if (entry != null) {
                    copyAttributes(entry, attributes);
                }
            }
        }
        return attributes;
    }

    /**
     * Copy the entries named in COPIED_ATTRIBUTES from one attribute
     * dictionary into the attribute list.
     * @param source The attribute dictionary to read.
     * @param attributes The list to append to.
     */
    private static void copyAttributes(PdfDictionary source, List<Attribute> attributes) {
        for (String name : COPIED_ATTRIBUTES) {
            PdfObject value = source.getDirectObject(new PdfName(name));
            if (value != null) {
                attributes.add(new Attribute(name, stripIllegalXmlChars(textValue(value))));
            }
        }
    }

    /**
     * Render a PDF object as text. String objects are decoded with
     * toUnicodeString() so that UTF-16 text keeps its characters; everything
     * else uses toString().
     * @param value The object to render (not null).
     * @return The textual value of the object.
     */
    private static String textValue(PdfObject value) {
        if (value instanceof PdfString) {
            return ((PdfString) value).toUnicodeString();
        }
        return value.toString();
    }

    /**
     * Make a string safe to put into XML: drop a leading "\u00fe\u00ff" (a
     * UTF-16 byte order mark read as two separate characters), then remove
     * null characters and every other character XML 1.0 does not allow.
     * @param input The string to sanitize.
     * @return The sanitized string.
     */
    private static String sanitize(String input) {
        if (input.startsWith("\u00fe\u00ff")) {
            input = input.substring(2);
        }
        return stripIllegalXmlChars(input);
    }

    /**
     * Remove every code point that is not a legal XML 1.0 character,
     * including null characters and unpaired surrogates.
     * @param input The string to filter.
     * @return The string with all illegal characters removed.
     */
    private static String stripIllegalXmlChars(String input) {
        StringBuilder kept = new StringBuilder(input.length());
        int i = 0;
        while (i < input.length()) {
            // Work on code points so valid surrogate pairs survive intact.
            int cp = input.codePointAt(i);
            i += Character.charCount(cp);
            if (isXmlChar(cp)) {
                kept.appendCodePoint(cp);
            }
        }
        return kept.toString();
    }

    /**
     * Test a code point against the XML 1.0 "Char" production.
     * @param cp The code point to test.
     * @return True if the code point may appear in XML 1.0 content.
     */
    private static boolean isXmlChar(int cp) {
        return cp == 0x9
            || cp == 0xA
            || cp == 0xD
            || (cp >= 0x20 && cp <= 0xD7FF)
            || (cp >= 0xE000 && cp <= 0xFFFD)
            || (cp >= 0x10000 && cp <= 0x10FFFF);
    }
}