package pdfainspector;
import java.util.HashMap;
import nu.xom.Element;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
/**
 * Convert metadata in a PDF to a XOM XML element.
 * @author schiele1
 */
public class MetaExtractor {

    /** Text written to the Language element when the PDF provides no usable value. */
    private static final String NO_LANGUAGE = "None";

    /**
     * Given an iText PDF Reader, extract metadata from the PDF and store it in
     * a XOM XML element.
     * <p>
     * The returned {@code Metadata} element has five children, in this order:
     * {@code Title}, {@code Author}, {@code Creator} (read from the reader's
     * info map), {@code Pages} (the reader's page count) and {@code Language}
     * (read from the catalog's {@code Lang} entry, or "None" when that entry
     * is absent, unreadable as a string, or empty after sanitizing).
     * Info entries that are missing from the map produce empty elements.
     * @param reader A reader for the given PDF.
     * @return A XOM element containing metadata.
     */
    public static Element extractToXML(PdfReader reader){
        HashMap<String,String> metadata = reader.getInfo();

        Element root = new Element("Metadata");
        root.appendChild(textElement("Title", metadata.get("Title")));
        root.appendChild(textElement("Author", metadata.get("Author")));
        root.appendChild(textElement("Creator", metadata.get("Creator")));
        root.appendChild(textElement("Pages", Integer.toString(reader.getNumberOfPages())));
        // Language data is stored in the catalog rather than the info map.
        root.appendChild(textElement("Language", extractLanguage(reader)));
        return root;
    }

    /**
     * Read the document language from the catalog's {@code Lang} entry.
     * @param reader A reader for the given PDF.
     * @return The sanitized language string, or "None" if there is no usable value.
     */
    private static String extractLanguage(PdfReader reader){
        PdfDictionary catalog = reader.getCatalog();
        if(!catalog.contains(PdfName.LANG)){
            return NO_LANGUAGE;
        }
        // Held as Object so no extra import is needed; toString() still
        // dispatches to the concrete iText string object.
        // NOTE(review): per the iText docs getAsString returns null when the
        // entry is not a PDF string - guard against that instead of throwing.
        Object rawLanguage = catalog.getAsString(PdfName.LANG);
        if(rawLanguage == null){
            return NO_LANGUAGE;
        }
        String language = sanitize(rawLanguage.toString());
        // Compare by content; reference comparison against "" is unreliable.
        return language.isEmpty() ? NO_LANGUAGE : language;
    }

    /**
     * Build an element with the given name holding the given text.
     * @param name The element name.
     * @param value The text content; null is treated as the empty string.
     * @return The new element.
     */
    private static Element textElement(String name, String value){
        Element element = new Element(name);
        element.appendChild(value == null ? "" : value);
        return element;
    }

    /**
     * Prepare a string for insertion into XML: drop a leading two-character
     * marker (U+00FE followed by U+00FF) if present, then remove all null
     * characters.
     * NOTE(review): the marker is presumably a UTF-16BE byte-order mark that
     * was read one byte per char - confirm against the strings iText returns.
     * @param input The string to sanitize; must not be null.
     * @return The sanitized string (i.e. with the marker and all null chars removed).
     */
    private static String sanitize(String input){
        if(input.startsWith("\u00fe\u00ff")){
            input = input.substring(2);
        }
        // StringBuilder avoids re-copying the partial result on every append.
        StringBuilder sanitized = new StringBuilder(input.length());
        for(int i = 0; i < input.length(); i++){
            char c = input.charAt(i);
            if(c != '\0'){
                sanitized.append(c);
            }
        }
        return sanitized.toString();
    }
}