package org.iswc.iswc2012main.dev;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.TreeMap;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.iswc.iswc2012main.DataPaperInPdf;
import org.iswc.iswc2012main.DataPaperInPdf.STATE;
import org.iswc.util.DataKeyKeyValue;
import sw4j.util.Sw4jException;
import sw4j.util.ToolIO;
import sw4j.util.ToolString;
public class TaskParsePdf {
public static void main(String[] args) throws Sw4jException{
DataKeyKeyValue<String, String,String> mapPv = new DataKeyKeyValue<String, String,String>();
File fRootPdf = new File ("local/iswc2012pdf");
for (File f:fRootPdf.listFiles()){
if (!f.getAbsolutePath().endsWith(".pdf")){
continue;
}
System.out.println();
System.out.println();
System.out.println(f.getAbsolutePath());
try {
extractText(f);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
//print(mapPv.report(false));
}
enum PROP{
lineSeparator,
paragraphStart,
title,
author,
_abstract,
keywords,
}
private static void extractText(File f) throws IOException{
PDDocument pddDocument=PDDocument.load(f);
PDFTextStripper textStripper = new PDFTextStripper();
String content = textStripper.getText(pddDocument);
TreeMap<PROP, String> temp = new TreeMap<PROP, String> ();
temp.put(PROP.lineSeparator, textStripper.getLineSeparator());
temp.put(PROP.paragraphStart, textStripper.getParagraphStart());
System.out.println(temp);
DataPaperInPdf parser = new DataPaperInPdf(f.getName());
for (String line: content.split(temp.get(PROP.lineSeparator))){
parser.processLine(line);
if (DataPaperInPdf.STATE.content.equals(parser.state)){
break;
}
}
parser.printReport();
System.out.println("-----");
//System.out.println(content.substring(0, 500));
/*
PDDocumentInformation info = pddDocument.getDocumentInformation();
System.out.println( "Page Count=" + pddDocument.getNumberOfPages() );
System.out.println( "Title=" + info.getTitle() );
System.out.println( "Author=" + info.getAuthor() );
System.out.println( "Subject=" + info.getSubject() );
System.out.println( "Keywords=" + info.getKeywords() );
System.out.println( "Creator=" + info.getCreator() );
System.out.println( "Producer=" + info.getProducer() );
System.out.println( "Creation Date=" + info.getCreationDate() );
System.out.println( "Modification Date=" + info.getModificationDate());
System.out.println( "Trapped=" + info.getTrapped() );
*/
}
private static void print(List<String> data){
for (String line: data){
System.out.println(line);
}
System.out.println(data.size());
}
}