package pdfainspector;

import java.util.Set;

import nu.xom.Attribute;
import nu.xom.Element;

import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;

/**
 * Extracts the role map stored under a PDF's structure tree root and
 * renders it as a XOM XML element.
 */
public class RoleMapExtractor {

    /**
     * Given an iText PDF Reader, extract the role map from the PDF and store
     * it in a XOM XML element.
     *
     * <p>The returned element is named {@code RoleMap}. For every role map
     * entry whose value is a PDF name, a {@code RoleMapEntry} child is
     * appended carrying a single attribute: the attribute name is derived
     * from the entry's value and the attribute value from the entry's key
     * (both decoded and passed through {@link #fixTagName(String)}).
     *
     * <p>If the catalog has no {@code /StructTreeRoot} dictionary, or that
     * dictionary has no {@code /RoleMap} dictionary, an empty
     * {@code RoleMap} element is returned. Entries whose value is not a
     * name are skipped.
     *
     * @param reader A reader for the given PDF.
     * @return A XOM element containing tag data.
     */
    public static Element extractToXML(PdfReader reader){
        Element root = new Element("RoleMap");

        PdfDictionary catalog = reader.getCatalog();

        // getAsDict yields null both when the key is absent and when the
        // entry is not a dictionary, so one null check covers both cases.
        PdfDictionary structTree = catalog.getAsDict(PdfName.STRUCTTREEROOT);
        if (structTree == null) {
            return root;
        }
        PdfDictionary roleMap = structTree.getAsDict(PdfName.ROLEMAP);
        if (roleMap == null) {
            return root;
        }

        Set<PdfName> keys = roleMap.getKeys();
        for (PdfName key : keys) {
            PdfName mapped = roleMap.getAsName(key);
            if (mapped == null) {
                // Value is not a name object; there is nothing to map to.
                continue;
            }
            String attributeName = toAttributeName(PdfName.decodeName(mapped.toString()));
            String attributeValue = fixTagName(PdfName.decodeName(key.toString()));

            Element mapElement = new Element("RoleMapEntry");
            mapElement.addAttribute(new Attribute(attributeName, attributeValue));
            root.appendChild(mapElement);
        }
        return root;
    }

    /**
     * Turns a decoded role name into a string usable as a no-namespace
     * attribute name: applies {@link #fixTagName(String)}, then replaces
     * every colon with an underscore and substitutes "_" for an empty
     * result.
     *
     * @param role The decoded role name.
     * @return A non-empty, colon-free name.
     */
    private static String toAttributeName(String role) {
        // fixTagName lets ':' through and returns "" for empty input; a
        // colon would be read as a namespace prefix separator by XOM.
        String name = fixTagName(role).replace(':', '_');
        return name.length() == 0 ? "_" : name;
    }

    /**
     * Taken from iText's TaggedPdfReaderTool, this renders tag names into an
     * XML-compatible format. The first character is replaced by '_' when it
     * is not an accepted name-start character; every later character is
     * replaced by '-' when it is not an accepted name character. The result
     * always has the same length as the input.
     *
     * @param tag The tag to format.
     * @return A string representing the tag name.
     */
    private static String fixTagName(String tag) {
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < tag.length(); ++k) {
            char c = tag.charAt(k);
            // Characters accepted in any position, including the first.
            boolean nameStart =
                c == ':'
                || (c >= 'A' && c <= 'Z')
                || c == '_'
                || (c >= 'a' && c <= 'z')
                || (c >= '\u00c0' && c <= '\u00d6')
                || (c >= '\u00d8' && c <= '\u00f6')
                || (c >= '\u00f8' && c <= '\u02ff')
                || (c >= '\u0370' && c <= '\u037d')
                || (c >= '\u037f' && c <= '\u1fff')
                || (c >= '\u200c' && c <= '\u200d')
                || (c >= '\u2070' && c <= '\u218f')
                || (c >= '\u2c00' && c <= '\u2fef')
                || (c >= '\u3001' && c <= '\ud7ff')
                || (c >= '\uf900' && c <= '\ufdcf')
                || (c >= '\ufdf0' && c <= '\ufffd');
            // Characters additionally accepted after the first position.
            boolean nameMiddle =
                c == '-'
                || c == '.'
                || (c >= '0' && c <= '9')
                || c == '\u00b7'
                || (c >= '\u0300' && c <= '\u036f')
                || (c >= '\u203f' && c <= '\u2040')
                || nameStart;
            if (k == 0) {
                if (!nameStart)
                    c = '_';
            }
            else {
                if (!nameMiddle)
                    c = '-';
            }
            sb.append(c);
        }
        return sb.toString();
    }
}