package input.parser.impl;
import input.model.Section;
import input.model.impl.CompositeSection;
import input.model.impl.Item;
import input.parser.SadParser;
import input.xmltemplatereader.TemplateStructure;
import input.xmltemplatereader.XmlReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * {@link SadParser} implementation that reads a PDF document with PDFBox and
 * turns its bookmark (outline) tree into a tree of {@link Section}s.
 *
 * <p>Page numbers handled by this class are 1-based (the convention of
 * {@link PDFTextStripper}); a page number of {@code 0} means "could not be
 * resolved". Page ranges passed to {@link #extractText} have an exclusive
 * end page.
 *
 * <p>Instances keep the template of the current {@link #getSad} call in a
 * field, so a single instance must not be used by several threads at once.
 *
 * @author Marcos Basso
 */
public class PdfParser implements SadParser {

    /** Template used to validate the outline of the current document; {@code null} when no template was given. */
    private XmlReader structureXml;

    /**
     * Small manual driver.
     *
     * @param args optional: {@code args[0]} is the PDF path and {@code args[1]} the
     *             template path; when omitted the original hard-coded defaults are used
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : "D:\\MSLite architecture.pdf";
        String pathTemplate = (args != null && args.length > 1) ? args[1] : "";
        PdfParser parser = new PdfParser();
        Section section = parser.getSad(pathTemplate, path);
        if (section == null) {
            // getSad returns null on I/O failure or when the template does not match.
            System.err.println("Could not parse " + path);
            return;
        }
        List<Section> list = section.getSections();
        System.out.println("Parsed " + (list == null ? 0 : list.size()) + " top-level section(s)");
    }

    /**
     * Parses the PDF at {@code urlSad} into a section tree.
     *
     * <ul>
     * <li>If the document has no outline (or an outline without entries and no
     *     template was given), the result is a {@link CompositeSection} holding a
     *     single {@link Item} named after the file and containing the text of the
     *     whole document.</li>
     * <li>Otherwise the outline is converted recursively; when a template was
     *     given the outline must match it first.</li>
     * </ul>
     *
     * The document is always closed before this method returns.
     *
     * @param pathTemplate path of the template passed to {@link XmlReader}; {@code null}
     *                     or empty means "do not validate"
     * @param urlSad       file-system path of the PDF to load
     * @return the parsed section tree, or {@code null} if the file cannot be read
     *         or the outline does not match the template
     * @see input.parser.SadParser
     */
    public Section getSad(String pathTemplate, String urlSad) {
        // Reset on every call so a template from a previous call is never reused by accident.
        if (pathTemplate != null && !pathTemplate.isEmpty()) {
            structureXml = new XmlReader(pathTemplate);
        } else {
            structureXml = null;
        }

        PDDocument doc = null;
        try {
            File input = new File(urlSad);
            doc = PDDocument.load(input);
            PDDocumentOutline root = doc.getDocumentCatalog().getDocumentOutline();
            // First node of the outline tree (null when the outline has no entries).
            PDOutlineItem item = (root != null) ? root.getFirstChild() : null;

            if (root == null || (item == null && structureXml == null)) {
                // No usable bookmarks: expose the whole document as a single item.
                CompositeSection whole = new CompositeSection();
                Item s = new Item();
                // Pages are 1-based and the end page is exclusive, hence 1 .. pageCount + 1.
                s.setText(extractText(1, doc, doc.getNumberOfPages() + 1));
                s.setName(input.getName());
                whole.addSection(s);
                return whole;
            }

            if (structureXml != null && !validateTemplate(item)) {
                return null;
            }
            return parserSections(item, doc);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(doc);
        }
    }

    /** Closes {@code doc} if it was opened, reporting (but not propagating) a failure to close. */
    private static void closeQuietly(PDDocument doc) {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Checks the outline starting at {@code item} against the loaded template.
     *
     * @param item first top-level outline item, may be {@code null}
     * @return {@code true} only if both an item and a template are present and
     *         {@link #validateStructure} accepts them
     */
    private boolean validateTemplate(PDOutlineItem item) {
        if (item == null || structureXml == null) {
            return false;
        }
        List<TemplateStructure> structure = structureXml.getStructure();
        return structure != null && validateStructure(item, structure);
    }

    /**
     * Compares a chain of sibling outline items with a list of template entries.
     *
     * <p>The siblings must have exactly the same titles, in the same order and in
     * the same number, as {@code structure}. Children are compared recursively
     * only when both the outline item and the template entry have children; if
     * just one side has children that level is not checked.
     *
     * @param item      first sibling of the level to check, may be {@code null}
     * @param structure template entries expected at this level
     * @return {@code true} if this level (and every checked sub-level) matches
     */
    private boolean validateStructure(PDOutlineItem item, List<TemplateStructure> structure) {
        PDOutlineItem current = item;
        for (TemplateStructure ts : structure) {
            if (current == null) {
                // Fewer bookmarks than template entries.
                return false;
            }
            String title = current.getTitle();
            if (title == null || !title.equals(ts.getTitle())) {
                return false;
            }
            PDOutlineItem firstChild = current.getFirstChild();
            List<TemplateStructure> children = ts.getStrcture();
            if (firstChild != null && children != null && !children.isEmpty()
                    && !validateStructure(firstChild, children)) {
                return false;
            }
            current = current.getNextSibling();
        }
        // More bookmarks than template entries is a mismatch as well.
        return current == null;
    }

    /**
     * Converts the outline level starting at {@code item} into a new root section.
     * The caller owns {@code doc} and is responsible for closing it.
     *
     * @param item first top-level outline item
     * @param doc  open document the outline belongs to
     * @return the root section, or {@code null} if {@code item} is {@code null}
     */
    private CompositeSection parserSections(PDOutlineItem item, PDDocument doc) {
        if (item == null) {
            return null;
        }
        CompositeSection pdfDocument = new CompositeSection();
        parse(item, doc, pdfDocument, item.getNextSibling());
        return pdfDocument;
    }

    /**
     * Adds one section to {@code pdfSection} for {@code item} and for each of its
     * following siblings.
     *
     * <p>An item with children becomes a {@link CompositeSection} whose own text
     * is the pages between the item and its first child (if any); an item
     * without children becomes an {@link Item} whose text runs up to the page
     * returned by {@link #getPaginaFinal}.
     *
     * <p>If processing an item throws, the exception is reported and the
     * remaining siblings of this level are skipped.
     *
     * @param item       first sibling to process, may be {@code null}
     * @param doc        open document
     * @param pdfSection parent section receiving the new sections
     * @param tope       outline item used as the upper page boundary for the last
     *                   leaf of this level, may be {@code null}
     */
    private void parse(PDOutlineItem item, PDDocument doc, CompositeSection pdfSection, PDOutlineItem tope) {
        try {
            for (PDOutlineItem current = item; current != null; current = current.getNextSibling()) {
                PDOutlineItem firstChild = current.getFirstChild();
                if (firstChild != null) {
                    CompositeSection s = new CompositeSection();
                    int inicialPage = getPage(current, doc);
                    int pageChild = getPage(firstChild, doc);
                    if (inicialPage < pageChild) {
                        s.setText(extractText(inicialPage, doc, pageChild));
                    }
                    s.setName(current.getTitle());
                    // The children end where the next sibling of this item starts. If
                    // there is none, they inherit this level's boundary; using the
                    // child's own sibling here would cut the last child to one page.
                    PDOutlineItem next = current.getNextSibling();
                    parse(firstChild, doc, s, next != null ? next : tope);
                    pdfSection.addSection(s);
                } else {
                    Item s = new Item();
                    s.setText(extractText(getPage(current, doc), doc, getPaginaFinal(current, doc, tope)));
                    s.setName(current.getTitle());
                    pdfSection.addSection(s);
                }
            }
        } catch (Exception e) {
            // Best effort: keep whatever was parsed so far.
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text of the pages {@code [inicialPage, finalPage)}.
     * When both arguments are equal, that single page is extracted.
     *
     * @param inicialPage first page to extract (1-based, inclusive)
     * @param doc         open document
     * @param finalPage   page after the last one to extract (1-based, exclusive)
     * @return the extracted text, or {@code null} if PDFBox reports an I/O error
     */
    private String extractText(int inicialPage, PDDocument doc, int finalPage) {
        try {
            int exclusiveEnd = (inicialPage == finalPage) ? finalPage + 1 : finalPage;
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(inicialPage);
            // The stripper's end page is inclusive.
            stripper.setEndPage(exclusiveEnd - 1);
            return stripper.getText(doc);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Determines the (exclusive) end page of a leaf outline item.
     *
     * @param item leaf item whose text is being extracted
     * @param doc  open document
     * @param tope boundary item inherited from the enclosing level, may be {@code null}
     * @return the page of the next sibling if there is one; otherwise the page of
     *         {@code tope} when it does not lie before {@code item}; otherwise
     *         the number of pages plus one (i.e. up to the end of the document)
     */
    protected int getPaginaFinal(PDOutlineItem item, PDDocument doc, PDOutlineItem tope) {
        PDOutlineItem next = item.getNextSibling();
        if (next != null) {
            return getPage(next, doc);
        }
        if (tope != null) {
            int topePage = getPage(tope, doc);
            if (getPage(item, doc) <= topePage) {
                return topePage;
            }
        }
        return doc.getNumberOfPages() + 1;
    }

    /**
     * Resolves the page an outline item points to.
     *
     * <p>If the item has an action, the first element of the action's {@code "D"}
     * array is looked up in the document's page map; otherwise, if it has a
     * destination, the destination page is located among all pages of the
     * document.
     *
     * @param current outline item, may be {@code null}
     * @param doc     open document
     * @return the 1-based page number, or {@code 0} if it cannot be determined
     */
    public int getPage(PDOutlineItem current, PDDocument doc) {
        try {
            if (current != null) {
                PDDestination dest = current.getDestination();
                PDAction pdAction = current.getAction();
                if (pdAction != null) {
                    // Action-based bookmark: the target page is the first entry of "D".
                    Object destination = pdAction.getCOSDictionary().getDictionaryObject("D");
                    if (destination instanceof COSArray && ((COSArray) destination).size() > 0) {
                        Object first = ((COSArray) destination).get(0);
                        if (first instanceof COSObject) {
                            COSObject targetPageRef = (COSObject) first;
                            String objStr = String.valueOf(targetPageRef.getObjectNumber().intValue());
                            String genStr = String.valueOf(targetPageRef.getGenerationNumber().intValue());
                            Integer page = (Integer) doc.getPageMap().get(objStr + "," + genStr);
                            if (page != null) {
                                return page.intValue();
                            }
                        }
                    }
                } else if (dest != null) {
                    // Destination-based bookmark: find the page's position in the page list.
                    PDPage pdp = current.findDestinationPage(doc);
                    List<Object> allpages = new ArrayList<Object>();
                    doc.getDocumentCatalog().getPages().getAllKids(allpages);
                    // indexOf yields -1 for an unknown page, which maps to 0 ("unresolved").
                    return allpages.indexOf(pdp) + 1;
                }
            }
        } catch (Exception e) {
            // Malformed bookmarks must not abort parsing; report and treat as unresolved.
            e.printStackTrace();
        }
        return 0;
    }
}