package edu.isi.bmkeg.lapdf.uima.cpe; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.collection.CollectionReader; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.uimafit.factory.CollectionReaderFactory; import org.uimafit.factory.TypeSystemDescriptionFactory; import edu.isi.bmkeg.lapdf.bin.CommandLineTool; import edu.isi.bmkeg.lapdf.uima.ae.ParserRuleBasedClassfierAE; import edu.isi.bmkeg.lapdf.uima.cr.DirectoryCollectionReader; import edu.isi.bmkeg.utils.PipelineLauncher; /** * The primary execution engine of the LAPDFText system. * */ public class CommandLineFitPipeline { private static Logger logger = Logger.getLogger(CommandLineFitPipeline.class); private List<AnalysisEngine> aeList = new ArrayList<AnalysisEngine>(); private CollectionReader documentCollectionReader; private static String PDF_SUFFIX = ".pdf"; private String modeNumber; /** * Start time of the processing - used to compute elapsed time. */ private long mStartTime; public CommandLineFitPipeline( String inputDocumentsLocation, String ruleFileLocation, boolean reportBlocks, boolean extractUnclassified, String outputDocumentsLocation) throws ResourceInitializationException { TypeSystemDescription typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription( "desc.typeSystem.LAPDFTextTypeSystemDescriptor" ); logger.info("Loaded the type system..."); documentCollectionReader = CollectionReaderFactory.createCollectionReader( DirectoryCollectionReader.class, DirectoryCollectionReader.DIRECTORY, inputDocumentsLocation, DirectoryCollectionReader.FILE_SUFFIX, PDF_SUFFIX, DirectoryCollectionReader.DIR_RECURSION, true, DirectoryCollectionReader.ITEMS_TO_SKIP, -1, DirectoryCollectionReader.END_INDEX, -1); AnalysisEngine pdfParserClassifier = null; if (reportBlocks) { pdfParserClassifier = ParserRuleBasedClassfierAE.createAnalysisEngine( typeSystem, 2, reportBlocks, extractUnclassified, outputDocumentsLocation, ruleFileLocation); } else { pdfParserClassifier = ParserRuleBasedClassfierAE.createAnalysisEngine( typeSystem, 3, reportBlocks, extractUnclassified, outputDocumentsLocation, ruleFileLocation); } aeList.add(pdfParserClassifier); } public CommandLineFitPipeline( String inputDocumentsLocation, String ruleFileLocation, boolean reportBlocks, boolean extractUnclassified, String outputDocumentsLocation, int itemsToSkip, int endIndex) throws ResourceInitializationException { TypeSystemDescription typeSystem = TypeSystemDescriptionFactory .createTypeSystemDescription("desc.typeSystem.LAPDFTextTypeSystemDescriptor"); logger.info("Loaded the type system..."); documentCollectionReader = CollectionReaderFactory.createCollectionReader( DirectoryCollectionReader.class, DirectoryCollectionReader.DIRECTORY, inputDocumentsLocation, DirectoryCollectionReader.FILE_SUFFIX, PDF_SUFFIX, DirectoryCollectionReader.DIR_RECURSION, true, DirectoryCollectionReader.ITEMS_TO_SKIP, itemsToSkip, DirectoryCollectionReader.END_INDEX, endIndex); AnalysisEngine pdfParserClassifier = null; if (reportBlocks) { pdfParserClassifier = ParserRuleBasedClassfierAE .createAnalysisEngine(typeSystem, 2, reportBlocks, extractUnclassified, outputDocumentsLocation, ruleFileLocation); } else { pdfParserClassifier = ParserRuleBasedClassfierAE .createAnalysisEngine(typeSystem, 3, reportBlocks, extractUnclassified, outputDocumentsLocation, ruleFileLocation); } aeList.add(pdfParserClassifier); } public CommandLineFitPipeline( String inputDocumentsLocation, boolean reportBlocks, boolean extractUnclassified, String outputDocumentsLocation) throws ResourceInitializationException { TypeSystemDescription typeSystem = TypeSystemDescriptionFactory .createTypeSystemDescription("desc.typeSystem.LAPDFTextTypeSystemDescriptor"); logger.info("Loaded the type system..."); documentCollectionReader = CollectionReaderFactory .createCollectionReader( DirectoryCollectionReader.class, DirectoryCollectionReader.DIRECTORY, inputDocumentsLocation, DirectoryCollectionReader.FILE_SUFFIX, PDF_SUFFIX, DirectoryCollectionReader.DIR_RECURSION, true, DirectoryCollectionReader.ITEMS_TO_SKIP, -1, DirectoryCollectionReader.END_INDEX, -1); AnalysisEngine pdfParserClassifier = ParserRuleBasedClassfierAE .createAnalysisEngine(typeSystem, 1, reportBlocks, extractUnclassified, outputDocumentsLocation); aeList.add(pdfParserClassifier); } public CommandLineFitPipeline( String inputDocumentsLocation, boolean reportBlocks, boolean extractUnclassified, String outputDocumentsLocation, int itemsToSkip, int endIndex) throws ResourceInitializationException { TypeSystemDescription typeSystem = TypeSystemDescriptionFactory .createTypeSystemDescription("desc.typeSystem.LAPDFTextTypeSystemDescriptor"); logger.info("Loaded the type system..."); documentCollectionReader = CollectionReaderFactory.createCollectionReader( DirectoryCollectionReader.class, DirectoryCollectionReader.DIRECTORY, inputDocumentsLocation, DirectoryCollectionReader.FILE_SUFFIX, PDF_SUFFIX, DirectoryCollectionReader.DIR_RECURSION, true, DirectoryCollectionReader.ITEMS_TO_SKIP, itemsToSkip, DirectoryCollectionReader.END_INDEX, endIndex); AnalysisEngine pdfParserClassifier = ParserRuleBasedClassfierAE.createAnalysisEngine( typeSystem, 1, reportBlocks, extractUnclassified, outputDocumentsLocation); aeList.add(pdfParserClassifier); } public void run() { logger.info("Running Pipeline..."); try { AnalysisEngine[] aeArray = aeList.toArray(new AnalysisEngine[0]); PipelineLauncher.runPipeline(documentCollectionReader, aeArray); // SimplePipeline.runPipeline(articleCollectionReader, aeArray); } catch (ResourceInitializationException e) { e.printStackTrace(); } catch (UIMAException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }