package pdfainspector;
import java.util.Set;
import nu.xom.Element;
import com.itextpdf.text.pdf.AcroFields;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
/**
* Convert form data in a PDF to a XOM XML element.
* @author schiele1
*/
public class FormExtractor {
/**
* Given an iText PDF Reader, extract form data from the PDF and store it
* in a XOM XML element.
* @param reader A reader for the given PDF.
* @return A XOM element containing form data.
*/
public static Element extractToXML(PdfReader reader){
Element root = new Element("Form");
AcroFields form = reader.getAcroFields();
Set<String> fields = form.getFields().keySet();
String tag;
// Determine the field type each form object represents.
for (String key : fields){
switch (form.getFieldType(key)) {
case AcroFields.FIELD_TYPE_CHECKBOX:
tag = "Checkbox";
break;
case AcroFields.FIELD_TYPE_COMBO:
tag = "Combobox";
break;
case AcroFields.FIELD_TYPE_LIST:
tag = "List";
break;
case AcroFields.FIELD_TYPE_NONE:
tag = "None";
break;
case AcroFields.FIELD_TYPE_PUSHBUTTON:
tag = "Pushbutton";
break;
case AcroFields.FIELD_TYPE_RADIOBUTTON:
tag = "Radiobutton";
break;
case AcroFields.FIELD_TYPE_SIGNATURE:
tag = "Signature";
break;
case AcroFields.FIELD_TYPE_TEXT:
tag = "Text";
break;
default:
tag = "unknown";
}
// Create the element corresponding to each tag.
Element tagElement = new Element(tag);
Element name = new Element("Name");
name.appendChild(sanitize(key));
tagElement.appendChild(name);
// Give the element its attributes (tooltips) and add it to root.
int numWidgets = form.getFieldItem(key).size();
for (int j = 0; j < numWidgets; j++){
PdfDictionary widget = form.getFieldItem(key).getWidget(j);
if (widget.get(PdfName.TU) != null){
Element tooltip = new Element("Tooltip");
tooltip.appendChild(sanitize(widget.get(PdfName.TU).toString()));
tagElement.appendChild(tooltip);
}
}
root.appendChild(tagElement);
}
return root;
}
/**
* Remove all null characters from a string so we can put it into XML.
* @param dict The string to sanitize.
* @return The sanitized string (i.e. with all null chars removed).
*/
private static String sanitize(String input){
if(input.startsWith("\u00fe\u00ff")){
input = input.substring(2);
}
String sanitized = "";
for(int i = 0; i < input.length(); i++){
char c = input.charAt(i);
if(c != '\0'){
sanitized = sanitized + c;
}
}
return sanitized;
}
}