/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.uima.examples; import java.io.File; import java.util.Iterator; import org.apache.uima.UIMAException; import org.apache.uima.UIMAFramework; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionProcessingEngine; import org.apache.uima.collection.EntityProcessStatus; import org.apache.uima.collection.StatusCallbackListener; import org.apache.uima.collection.impl.metadata.cpe.CpeDescriptorFactory; import org.apache.uima.collection.metadata.CasProcessorConfigurationParameterSettings; import org.apache.uima.collection.metadata.CpeCasProcessor; import org.apache.uima.collection.metadata.CpeCollectionReader; import org.apache.uima.collection.metadata.CpeComponentDescriptor; import org.apache.uima.collection.metadata.CpeDescription; import org.apache.uima.collection.metadata.CpeSofaMapping; import org.apache.uima.collection.metadata.CpeSofaMappings; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.tools.components.FileSystemCollectionReader; import 
org.apache.uima.tools.components.InlineXmlCasConsumer; import org.apache.uima.tools.components.XmlDetagger; import org.apache.uima.util.AnalysisEnginePerformanceReports; /** * An example application that reads documents from the file system, sends them through an Analysis * Engine (AE), and produces XML files with inline annotations. This application uses a * {@link CollectionProcessingEngine} to drive the processing. For a simpler introduction to using * AEs in an application, see {@link ExampleApplication}. * <p> * <code>Usage: java org.apache.uima.examples.RunAE [OPTIONS] * &lt;AE descriptor or JAR file name&gt; &lt;input dir&gt; * [&lt;output dir&gt;]</code> * <p> * If <code>output dir</code> is not specified, the analysis results will not be output. This can * be useful when only interested in performance statistics. * <p> * <u>OPTIONS</u> * <p> * -t &lt;TagName&gt; (XML Text Tag) - specifies the name of an XML tag, found within the input * documents, that contains the text to be analyzed. The text will also be detagged. If this option * is not specified, the entire document will be processed. <br> * -l &lt;ISO code&gt; (Language) - specifies the ISO code for the language of the input documents. * Some AEs (e.g. PersonTitleAnnotator) require this. <br> * -e &lt;Encoding&gt; - specifies character encoding of the input documents. The default is UTF-8. * <br> * -q (Quiet) - suppresses progress messages that are normally printed as each document is processed. * <br> * -s&lt;x&gt; (Stats level) - determines the verbosity of performance statistics. s0=none, * s1=brief, s2=full. The default is brief. <br> * -x - process input files as XCAS files. <br> * -xmi - process input files as XMI CAS files. <br> * -lenient - ignore out-of-typesystem content when deserializing XML files. 
*/ public class RunAE implements StatusCallbackListener { // Values read from cmd line args private File aeSpecifierFile = null; private File inputDir = null; private File outputDir = null; private String xmlTagName = null; private String language; private String encoding; private boolean genProgressMessages = true; private int statsLevel = 1; private boolean xcasInput = false; private boolean xmiInput = false; private boolean xLenient = false; int docsProcessed; private CollectionProcessingEngine mCPE; /** * Constructor. Sets up and runs an Analysis Engine. */ public RunAE(String[] args) { try { // Read and validate command line arguments if (!processCmdLineArgs(args)) { printUsageMessage(); return; } // Enable schema validation (omit this to speed up initialization) // UIMAFramework.getXMLParser().enableSchemaValidation(true); // build a Collection Processing Engine descriptor that will drive processing CpeDescription cpeDesc = CpeDescriptorFactory.produceDescriptor(); // add collection reader that will read input docs cpeDesc.addCollectionReader(FileSystemCollectionReader.getDescriptorURL().toString()); // specify configuration parameters for collection reader CasProcessorConfigurationParameterSettings crSettings = CpeDescriptorFactory .produceCasProcessorConfigurationParameterSettings(); CpeCollectionReader cpeCollRdr = cpeDesc.getAllCollectionCollectionReaders()[0]; cpeCollRdr.setConfigurationParameterSettings(crSettings); crSettings.setParameterValue(FileSystemCollectionReader.PARAM_INPUTDIR, inputDir .getAbsolutePath()); crSettings.setParameterValue(FileSystemCollectionReader.PARAM_ENCODING, encoding); crSettings.setParameterValue(FileSystemCollectionReader.PARAM_LANGUAGE, language); if (xcasInput) { crSettings.setParameterValue(FileSystemCollectionReader.PARAM_XCAS, "XCAS"); } else if (xmiInput) { crSettings.setParameterValue(FileSystemCollectionReader.PARAM_XCAS, "XMI"); } if (xLenient) { 
crSettings.setParameterValue(FileSystemCollectionReader.PARAM_LENIENT, "true"); } // if XML tag was specified, configure XmlDetagger annotator and add to CPE CpeCasProcessor xmlDetaggerCasProc = null; if (xmlTagName != null && xmlTagName.length() > 0) { xmlDetaggerCasProc = CpeDescriptorFactory.produceCasProcessor("XmlDetagger"); CpeComponentDescriptor cpeComponentDescriptor = CpeDescriptorFactory.produceComponentDescriptor(XmlDetagger.getDescriptorURL().toString()); xmlDetaggerCasProc.setCpeComponentDescriptor(cpeComponentDescriptor); CasProcessorConfigurationParameterSettings detaggerSettings = CpeDescriptorFactory .produceCasProcessorConfigurationParameterSettings(); xmlDetaggerCasProc.setConfigurationParameterSettings(detaggerSettings); detaggerSettings.setParameterValue(XmlDetagger.PARAM_TEXT_TAG, xmlTagName); xmlDetaggerCasProc.setMaxErrorCount(0); cpeDesc.addCasProcessor(xmlDetaggerCasProc); } // add user's AE to CPE CpeCasProcessor casProc = CpeDescriptorFactory.produceCasProcessor("UserAE"); CpeComponentDescriptor cpeComponentDescriptor = CpeDescriptorFactory.produceComponentDescriptor(aeSpecifierFile.getAbsolutePath()); casProc.setCpeComponentDescriptor(cpeComponentDescriptor); casProc.setMaxErrorCount(0); cpeDesc.addCasProcessor(casProc); // add CAS Consumer that will write the output // create and configure CAS consumer that will write the output CpeCasProcessor casCon = null; if (outputDir != null) { casCon = CpeDescriptorFactory.produceCasProcessor("CasConsumer"); cpeComponentDescriptor = CpeDescriptorFactory.produceComponentDescriptor(InlineXmlCasConsumer.getDescriptorURL().toString()); casCon.setCpeComponentDescriptor(cpeComponentDescriptor); CasProcessorConfigurationParameterSettings consumerSettings = CpeDescriptorFactory .produceCasProcessorConfigurationParameterSettings(); casCon.setConfigurationParameterSettings(consumerSettings); consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_OUTPUTDIR, outputDir .getAbsolutePath()); if 
(xcasInput) { consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_XCAS, "XCAS"); } else if (xmiInput) { consumerSettings.setParameterValue(InlineXmlCasConsumer.PARAM_XCAS, "XMI"); } casCon.setMaxErrorCount(0); cpeDesc.addCasProcessor(casCon); } // if XML detagger is used, we need to configure sofa mappings for the CPE if (xmlDetaggerCasProc != null) { // For XML detagger map default sofa to "xmlDocument" CpeSofaMapping sofaMapping = CpeDescriptorFactory.produceSofaMapping(); sofaMapping.setComponentSofaName("xmlDocument"); sofaMapping.setCpeSofaName(CAS.NAME_DEFAULT_SOFA); CpeSofaMappings xmlDetaggerSofaMappings = CpeDescriptorFactory.produceSofaMappings(); xmlDetaggerSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { sofaMapping }); xmlDetaggerCasProc.setSofaNameMappings(xmlDetaggerSofaMappings); // User AE and InlineXmlCasConsumer (if present) operate on the "plainTextDocument" // sofa produced by the XmlDetagger CpeSofaMapping aeSofaMapping = CpeDescriptorFactory.produceSofaMapping(); aeSofaMapping.setCpeSofaName("plainTextDocument"); CpeSofaMappings userAeSofaMappings = CpeDescriptorFactory.produceSofaMappings(); userAeSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { aeSofaMapping }); casProc.setSofaNameMappings(userAeSofaMappings); if (casCon != null) { CpeSofaMapping casConSofaMapping = CpeDescriptorFactory.produceSofaMapping(); casConSofaMapping.setCpeSofaName("plainTextDocument"); CpeSofaMappings consumerSofaMappings = CpeDescriptorFactory.produceSofaMappings(); consumerSofaMappings.setSofaNameMappings(new CpeSofaMapping[] { casConSofaMapping }); casCon.setSofaNameMappings(consumerSofaMappings); } } // instantiate CPE mCPE = UIMAFramework.produceCollectionProcessingEngine(cpeDesc); // register callback listener mCPE.addStatusCallbackListener(this); // execute docsProcessed = 0; mCPE.process(); } catch (Exception e) { //special check for using XML detagger with remotes, which will generate an error //since sofa mappings aren't supported 
for remotes if (xmlTagName != null && xmlTagName.length() > 0 && e instanceof UIMAException && ((UIMAException)e).hasMessageKey(ResourceInitializationException.SOFA_MAPPING_NOT_SUPPORTED_FOR_REMOTE)) { System.err.println("The XML detagging feature (-t) is not supported for remote Analysis Engines or for Aggregates containing remotes."); } else { e.printStackTrace(); } } } /** * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#initializationComplete() */ public void initializationComplete() { } /** * @see org.apache.uima.collection.StatusCallbackListener#entityProcessComplete(org.apache.uima.cas.CAS, * org.apache.uima.collection.EntityProcessStatus) */ public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) { if (aStatus.isException()) { Iterator iter = aStatus.getExceptions().iterator(); while (iter.hasNext()) { ((Throwable) iter.next()).printStackTrace(); } } else if (genProgressMessages) { // retrieve the filename of the input file from the CAS // (it was put there by the FileSystemCollectionReader) if (!(xcasInput || xmiInput)) { Type fileLocType = aCas.getTypeSystem().getType( "org.apache.uima.examples.SourceDocumentInformation"); Feature fileNameFeat = fileLocType.getFeatureByBaseName("uri"); FSIterator it = aCas.getAnnotationIndex(fileLocType).iterator(); FeatureStructure fileLoc = it.get(); File inFile = new File(fileLoc.getStringValue(fileNameFeat)); System.out.println("Processed Document " + inFile.getName()); } else { System.out.println("doc" + docsProcessed++ + " processed successfully"); } } } /** * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#aborted() */ public void aborted() { System.out.println("Processing Aborted"); } /** * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#batchProcessComplete() */ public void batchProcessComplete() { } /** * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#collectionProcessComplete() */ public void 
collectionProcessComplete() { // output performance stats if (statsLevel > 0) { AnalysisEnginePerformanceReports performanceReports = new AnalysisEnginePerformanceReports( mCPE.getPerformanceReport()); System.out.println("\n\nPERFORMANCE STATS\n-----------------\n\n"); if (statsLevel > 1) { System.out.println(performanceReports.getFullReport()); System.out.println(); } System.out.println(performanceReports); } } /** * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#paused() */ public void paused() { } /** * @see org.apache.uima.collection.base_cpm.BaseStatusCallbackListener#resumed() */ public void resumed() { } /** * Prints usage message. */ private void printUsageMessage() { System.err.println("\nUsage: java " + this.getClass().getName() + " [OPTIONS] <AE descriptor filename> <input dir> [<output dir>] "); System.err.println("\nIf <output dir> is not specified, the analysis " + "results will not be output. This can be useful when only interested " + "in performance statistics."); System.err.println("\nOPTIONS\n-------"); System.err.println("-t <TagName> (XML Text Tag) - specifies the name of " + "an XML tag, found within the input documents, that contains the text " + "to be analyzed. The text will also be detagged. If this option is not " + "specified, the entire document will be processed."); System.err.println("-q (Quiet) - supresses progress messages that are " + "normally printed as each document is processed."); System.err.println("-s<x> (Stats level) - determines the verboseness of " + "performance statistics. s0=none, s1=brief, s2=full. 
The default is brief."); System.err.println("-x - process input files as XCAS files."); System.err.println("-xmi - process input files as XmiCas files."); System.err.println("-lenient - ignore out-of-typesystem content when deserializing XML files."); System.err.println("-l <ISO language> - specify the ISO Language code to set."); System.err.println("-e <encoding> - specify the character encoding to use."); } /** * Reads command line arguments and sets static class variables appropriately. * * @return true if command line args were valid, false if not */ private boolean processCmdLineArgs(String[] args) { encoding = "UTF-8"; // default int index = 0; while (index < args.length) { String arg = args[index++]; if (arg.equals("-q")) // quiet mode { genProgressMessages = false; } else if (arg.equals("-s0")) // no stats { statsLevel = 0; } else if (arg.equals("-s2")) // full stats { statsLevel = 2; } else if (arg.equals("-t")) // XML tag text { // tag name is next argument if (index >= args.length) { return false; } xmlTagName = args[index++]; } else if (arg.equals("-l")) // Language { // language ISO code is next argument if (index >= args.length) { return false; } language = args[index++]; } else if (arg.equals("-e")) // Encoding { // encoding is next argument if (index >= args.length) { return false; } encoding = args[index++]; } else if (arg.equals("-x")) // XCAS file input { xcasInput = true; } else if (arg.equals("-xmi")) // XMI file input { xmiInput = true; } else if (arg.equals("-lenient")) // lenient XML deserialization { xLenient = true; } else if (arg.startsWith("-")) // invalid option { System.err.println(arg + " is not a valid option"); return false; } else // one of the standard params - whichever we haven't read yet { if (aeSpecifierFile == null) { aeSpecifierFile = new File(arg); if (!aeSpecifierFile.exists() || aeSpecifierFile.isDirectory()) { System.err.println(arg + " does not exist"); System.exit(1); } } else if (inputDir == null) { inputDir = new 
File(arg); if (!inputDir.exists() || !inputDir.isDirectory()) { System.err.println(arg + " does not exist or is not a directory"); System.exit(1); } } else if (outputDir == null) { outputDir = new File(arg); if (!outputDir.exists() && !outputDir.mkdirs()) { System.err.println(arg + " does not exist and could not be created"); System.exit(1); } } } } // make sure required values were specified return (aeSpecifierFile != null) && (inputDir != null); } public static void main(String[] args) { new RunAE(args); } }