package edu.isi.bmkeg.lapdf.uima.ae;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.jpedal.exception.PdfException;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.ConfigurationParameterFactory;
import edu.isi.bmkeg.lapdf.controller.LapdfEngine;
import edu.isi.bmkeg.lapdf.controller.LapdfMode;
import edu.isi.bmkeg.lapdf.extraction.exceptions.AccessException;
import edu.isi.bmkeg.lapdf.extraction.exceptions.ClassificationException;
import edu.isi.bmkeg.lapdf.extraction.exceptions.EncryptionException;
import edu.isi.bmkeg.lapdf.model.LapdfDocument;
import edu.isi.bmkeg.utils.ISI_UIMA_PDFUtils;
/**
 * UIMA annotator that hands each incoming PDF to a {@link LapdfEngine} and
 * writes the results to a per-document sub-folder of the configured output
 * folder.
 *
 * <p>The PDF path is read from the CAS via
 * {@link ISI_UIMA_PDFUtils#getDocumentSecondaryID(JCas)}. The sub-folder is
 * named after the PDF file name without its extension. A document whose
 * sub-folder already exists is skipped.
 *
 * <p>The {@code mode} parameter selects which engine entry point is invoked:
 * {@link LapdfMode#BLOCK_ONLY}, {@link LapdfMode#CLASSIFY} or
 * {@link LapdfMode#SECTION_FILTER}.
 */
public class ParserRuleBasedClassfierAE extends JCasAnnotator_ImplBase {

    private static final Logger logger = Logger.getLogger(ParserRuleBasedClassfierAE.class);

    public static final String MODE = ConfigurationParameterFactory
            .createConfigurationParameterName(ParserRuleBasedClassfierAE.class, "mode");
    @ConfigurationParameter(mandatory = true,
            description = "This is the mode of operation.")
    private int mode;

    public static final String RULE_FILE = ConfigurationParameterFactory
            .createConfigurationParameterName(ParserRuleBasedClassfierAE.class, "ruleFile");
    @ConfigurationParameter(mandatory = false,
            description = "This is the rule file used for block classification.")
    protected String ruleFile;

    public static final String REPORT_BLOCKS = ConfigurationParameterFactory
            .createConfigurationParameterName(ParserRuleBasedClassfierAE.class, "reportBlocks");
    @ConfigurationParameter(mandatory = true,
            description = "This is the flag used to trigger debug reporting.")
    protected Boolean reportBlocks;

    public static final String EXTRACT_UNCLASSIFIED = ConfigurationParameterFactory
            .createConfigurationParameterName(ParserRuleBasedClassfierAE.class, "extractUnclassified");
    @ConfigurationParameter(mandatory = true,
            description = "this flag is used to decide whether unclassified flow aware output " +
                    "text is required.")
    protected boolean extractUnclassified;

    public static final String OUTPUT_FOLDER = ConfigurationParameterFactory
            .createConfigurationParameterName(ParserRuleBasedClassfierAE.class, "outputFolder");
    @ConfigurationParameter(mandatory = true,
            description = "This is the location of the output for debug and results.")
    private String outputFolder;

    /** Root output folder; one sub-folder per processed PDF is created beneath it. */
    protected File outputFolderFileDescriptor;

    private LapdfEngine pdfEng;

    protected LapdfDocument doc;

    /**
     * Reads the configuration parameters, builds the {@link LapdfEngine}
     * (with the rule file when one is configured, otherwise with the no-arg
     * constructor) and makes sure the output folder exists.
     *
     * @param uimaContext the UIMA context carrying the configuration values
     * @throws ResourceInitializationException if the superclass initialisation
     *         fails, a parameter cannot be read, the engine cannot be built,
     *         or the output folder cannot be created
     */
    @Override
    public void initialize(UimaContext uimaContext)
            throws ResourceInitializationException {
        try {
            super.initialize(uimaContext);

            mode = (Integer) uimaContext.getConfigParameterValue(MODE);

            ruleFile = (String) uimaContext.getConfigParameterValue(RULE_FILE);
            if (ruleFile != null) {
                logger.info("Using rulefile " + ruleFile);
                this.pdfEng = new LapdfEngine(new File(ruleFile));
            } else {
                this.pdfEng = new LapdfEngine();
            }

            extractUnclassified = (Boolean) uimaContext
                    .getConfigParameterValue(EXTRACT_UNCLASSIFIED);
            reportBlocks = (Boolean) uimaContext
                    .getConfigParameterValue(REPORT_BLOCKS);
            outputFolder = (String) uimaContext
                    .getConfigParameterValue(OUTPUT_FOLDER);

            outputFolderFileDescriptor = new File(outputFolder);
            if (!outputFolderFileDescriptor.exists()) {
                logger.info(outputFolderFileDescriptor.getAbsolutePath()
                        + " does not exist! Creating it!!");
                ensureDirectory(outputFolderFileDescriptor);
            }
        } catch (ResourceInitializationException e) {
            // Already the right type; avoid wrapping it a second time.
            throw e;
        } catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Processes the PDF referenced by the CAS in the configured mode.
     *
     * <p>Nothing is done when the per-document output folder already exists.
     * An unrecognised mode is logged as a warning and otherwise ignored.
     *
     * @param jcas the CAS whose secondary document id is the PDF file path
     * @throws AnalysisEngineProcessException if the output folder cannot be
     *         created or the engine call fails
     */
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {
        String inputPDFFilePath = ISI_UIMA_PDFUtils
                .getDocumentSecondaryID(jcas);
        File inFile = new File(inputPDFFilePath);
        String stem = stripExtension(inFile.getName());

        try {
            File outDir = new File(outputFolderFileDescriptor, stem);
            if (outDir.exists()) {
                // An existing folder is taken to mean the document was handled before.
                return;
            }
            ensureDirectory(outDir);

            if (mode == LapdfMode.BLOCK_ONLY) {
                logger.info("Running block detection on " + inputPDFFilePath);
                this.pdfEng.processBlocks(inFile, outDir, reportBlocks, extractUnclassified);
            } else if (mode == LapdfMode.CLASSIFY) {
                this.pdfEng.processClassify(inFile, outDir, reportBlocks, extractUnclassified);
            } else if (mode == LapdfMode.SECTION_FILTER) {
                this.pdfEng.processSectionFilter(inFile, outDir, reportBlocks, extractUnclassified);
            } else {
                logger.warn("Unrecognised mode " + mode + "; nothing done for "
                        + inputPDFFilePath);
            }
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Returns the file name without its last extension. A name that has no
     * dot, or whose only dot is the leading character, is returned unchanged
     * so that the result is never empty for a non-empty name.
     */
    private static String stripExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /**
     * Creates the directory, including missing parents.
     *
     * @throws IOException if the directory does not exist after the attempt
     */
    private static void ensureDirectory(File dir) throws IOException {
        // mkdirs() returns false if another process created the directory
        // first, so re-check before reporting failure.
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Could not create directory " + dir.getAbsolutePath());
        }
    }

    /**
     * Builds a primitive analysis engine for this annotator.
     *
     * @param typeSystem the type system for the engine description
     * @param mode one of the {@link LapdfMode} constants
     * @param reportBlocks value for the {@link #REPORT_BLOCKS} parameter
     * @param extractUnclassified value for the {@link #EXTRACT_UNCLASSIFIED} parameter
     * @param outputFolderName value for the {@link #OUTPUT_FOLDER} parameter
     * @param ruleFileName value for the {@link #RULE_FILE} parameter; may be
     *        {@code null}, in which case the engine is built without a rule file
     * @return the configured analysis engine
     * @throws ResourceInitializationException if the engine cannot be created
     */
    public static AnalysisEngine createAnalysisEngine(
            TypeSystemDescription typeSystem,
            int mode,
            boolean reportBlocks,
            boolean extractUnclassified,
            String outputFolderName,
            String ruleFileName) throws ResourceInitializationException {

        AnalysisEngineDescription aed = AnalysisEngineFactory.createPrimitiveDescription(
                ParserRuleBasedClassfierAE.class,
                typeSystem,
                // name, value
                ParserRuleBasedClassfierAE.OUTPUT_FOLDER, outputFolderName,
                ParserRuleBasedClassfierAE.RULE_FILE, ruleFileName,
                ParserRuleBasedClassfierAE.MODE, mode,
                ParserRuleBasedClassfierAE.REPORT_BLOCKS, reportBlocks,
                ParserRuleBasedClassfierAE.EXTRACT_UNCLASSIFIED, extractUnclassified);

        return AnalysisEngineFactory.createPrimitive(aed);
    }

    /**
     * Builds a primitive analysis engine for this annotator without a rule
     * file. Equivalent to calling the six-argument overload with a
     * {@code null} rule file name.
     *
     * @param typeSystem the type system for the engine description
     * @param mode one of the {@link LapdfMode} constants
     * @param reportBlocks value for the {@link #REPORT_BLOCKS} parameter
     * @param extractUnclassified value for the {@link #EXTRACT_UNCLASSIFIED} parameter
     * @param outputFolderName value for the {@link #OUTPUT_FOLDER} parameter
     * @return the configured analysis engine
     * @throws ResourceInitializationException if the engine cannot be created
     */
    public static AnalysisEngine createAnalysisEngine(
            TypeSystemDescription typeSystem,
            int mode,
            boolean reportBlocks,
            boolean extractUnclassified,
            String outputFolderName)
            throws ResourceInitializationException {

        return createAnalysisEngine(typeSystem, mode, reportBlocks,
                extractUnclassified, outputFolderName, null);
    }

}