package pdfainspector;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;

import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Serializer;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONML;
import org.json.JSONObject;

import com.itextpdf.text.pdf.PdfReader;

/**
 * Given the path to a PDF file, PdfExtractor generates an XML tree of its
 * accessibility components. Given such an XML string, it converts that tree to
 * JSON for rules processing.
 * @author schiele1
 */
public class PdfExtractor {

    /** Encoding used both to serialize the XML tree and to decode it back into a String. */
    private static final String XML_ENCODING = "UTF-8";

    /**
     * Scans the PDF for several indicators of accessibility (role map,
     * metadata, bookmarks, tags, and form data), and writes them to an XML
     * string. Text and image extraction are currently disabled.
     * @param pdfName The path to the PDF file to be inspected.
     * @return A string containing the XML tree representing the PDF, or
     *         {@code null} if the PDF could not be opened. If serialization
     *         fails, an error is printed to stderr and whatever output was
     *         produced up to that point is returned.
     */
    public static String extractToXML(String pdfName) {
        // Initialize the reader which will scan the PDF for our data.
        PdfReader reader;
        try {
            reader = new PdfReader(pdfName);
        } catch (IOException e) {
            return null;
        }

        Element root = new Element("PdfInfo");
        try {
            // Use our reader to scan the PDF for each of the necessary components.
            Element rolemap = RoleMapExtractor.extractToXML(reader);
            Element meta = MetaExtractor.extractToXML(reader);
            Element bookmarks = BookmarkExtractor.extractToXML(reader);
            Element tags = TagExtractor.extractToXML(reader);
            Element form = FormExtractor.extractToXML(reader);
            //Element text = TextExtractor.extractToXML(reader);
            //Element images = ImageExtractor.extractToXML(reader);

            // Add each component to the root of our XML tree.
            root.appendChild(rolemap);
            root.appendChild(meta);
            root.appendChild(bookmarks);
            root.appendChild(tags);
            root.appendChild(form);
            //root.appendChild(text);
            //root.appendChild(images);
        } finally {
            // Always release the reader, even if one of the extractors throws.
            reader.close();
        }

        // Format and return the now-complete tree.
        Document doc = new Document(root);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            Serializer serializer = new Serializer(baos, XML_ENCODING);
            serializer.setIndent(4);
            serializer.write(doc);
        } catch (Exception e) {
            System.err.println("Error extracting XML: " + e);
        }

        // Decode with the same charset the serializer wrote, rather than the
        // platform default, so non-ASCII content survives the round trip.
        return new String(baos.toByteArray(), Charset.forName(XML_ENCODING));
    }

    /**
     * Given an XML string, presumably representing a PDF, convert it into a
     * JSON string, specially formatted for rules parsing.
     * @param xml The XML string to be converted. May be {@code null} (for
     *            example when {@link #extractToXML(String)} could not open
     *            the PDF), in which case the conversion is reported as failed.
     * @return The formatted JSON string, or an empty string if the input was
     *         {@code null} or could not be converted.
     */
    public static String convertXMLToJSON(String xml) {
        if (xml == null) {
            System.err.println("Error converting to JSON");
            return "";
        }

        String json = "";
        try {
            JSONObject j = JSONML.toJSONObject(xml);
            j = formatJSON(j);
            json = j.toString(4);
        } catch (JSONException e) {
            System.err.println("Error converting to JSON: " + e);
        }
        return json;
    }

    /**
     * A helper function for our JSON converter, formatJSON rearranges the JSON
     * objects into an order better suited for reading by our rules engine.
     * The object is modified in place: every key other than "tagName" and
     * "childNodes" is moved into an "attributes" array of single-entry
     * objects, and the "childNodes" array (formatted recursively) is moved
     * under the key "content".
     * @param j The JSON object to be formatted.
     * @return The same JSON object, after formatting.
     * @throws JSONException If a key cannot be read from or written to the object.
     */
    private static JSONObject formatJSON(JSONObject j) throws JSONException {
        JSONArray children = new JSONArray();
        JSONArray attributes = new JSONArray();

        // getNames returns null (not an empty array) for an empty object.
        // It is also a snapshot of the keys, so removing entries from j
        // inside the loop is safe.
        String[] names = JSONObject.getNames(j);
        if (names != null) {
            for (String name : names) {
                Object obj = j.get(name);

                if (name.equals("childNodes") && obj instanceof JSONArray) {
                    // Recursively format each child object.
                    children = (JSONArray) j.remove("childNodes");
                    for (int i = 0; i < children.length(); i++) {
                        Object childObj = children.get(i);
                        if (childObj instanceof JSONObject) {
                            formatJSON((JSONObject) childObj);
                        }
                    }
                } else if (!name.equals("tagName")) {
                    // Add any attributes to the attribute array.
                    JSONObject child = new JSONObject();
                    child.put(name, obj);
                    attributes.put(child);
                    j.remove(name);
                }
            }
        }

        j.put("attributes", attributes);
        j.put("content", children);
        return j;
    }
}