/**
* Copyright 2012-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*
*/
package org.opensextant.processing;
import java.util.List;
import java.util.ArrayList;
import org.opensextant.data.DocInput;
import org.opensextant.data.TextInput;
import org.opensextant.extraction.Extractor;
import org.opensextant.extraction.ExtractionException;
import org.opensextant.extraction.ExtractionResult;
import org.opensextant.extraction.TextMatch;
import org.opensextant.output.ResultsFormatter;
import org.opensextant.processing.progress.ProgressListener;
import org.opensextant.processing.progress.ProgressMonitor;
import org.opensextant.processing.progress.ProgressMonitorBase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A Group of Xponent Extractors. An Extractor has a simple interface:
*
* <pre>
* +configure() + extract()
* </pre>
*
* Configure any Extractor; add it to the stack here;
*
* Once you have added Extractors to your XtractorGroup, call
* XtractorGroup.setup()
*
* Because one extractor or formatter may throw an exception while the others
* succeed, this API does not throw exceptions that would fail a document
* completely. If you need access to the exceptions thrown by each extractor or
* formatter, adapt XtractorGroup by re-implementing its internal loops.
*
* @author ubaldino
*/
public class XtractorGroup {

    /** Status: processing could not be attempted (e.g. no input was supplied). */
    public static final int STATUS_FAILURE = -1;

    /** Status: extraction ran but produced no matches; formatters were not invoked. */
    public static final int STATUS_NOTHING_FOUND = 0;

    /** Status: matches were found and at least one formatter completed. */
    public static final int STATUS_FORMATTED = 1;

    /** Status: matches were found but no formatter completed (none registered, or all failed). */
    public static final int STATUS_NOT_FORMATTED = 2;

    /**
     * API: child implementations have access to the core list of extractors.
     */
    protected List<Extractor> extractors = new ArrayList<Extractor>();

    /**
     * API: child implementations have access to the core list of formatters.
     */
    protected List<ResultsFormatter> formatters = new ArrayList<ResultsFormatter>();

    /**
     * API: child implementations should recreate their own logger.
     */
    protected Logger log = LoggerFactory.getLogger(getClass());

    /**
     * API: child implementations have access to accumulated errors; reset()
     * clears errors and other state.
     */
    protected List<String> currErrors = new ArrayList<String>();

    /**
     * Monitor that is told how many extraction steps a document has and when
     * the document is complete; listeners are attached through
     * {@link #addProgressListener(ProgressListener)}.
     */
    protected ProgressMonitor progressMonitor = new ProgressMonitorBase();

    /**
     * Creates an empty group with no extractors and no formatters.
     */
    public XtractorGroup() {
    }

    /**
     * Appends an extractor to the group. Extractors run in the order added.
     * The extractor is not wired to this group's progress monitor.
     *
     * @param xprocessor an already-configured extractor
     */
    public void addExtractor(Extractor xprocessor) {
        extractors.add(xprocessor);
    }

    /**
     * Appends a formatter to the group. Formatters run in the order added.
     *
     * @param formatter formatter that will receive each compiled result
     */
    public void addFormatter(ResultsFormatter formatter) {
        formatters.add(formatter);
    }

    /**
     * Registers a listener on this group's progress monitor.
     *
     * @param listener listener to add
     */
    public void addProgressListener(ProgressListener listener) {
        progressMonitor.addProgressListener(listener);
    }

    /**
     * Unregisters a listener from this group's progress monitor.
     *
     * @param listener listener to remove
     */
    public void removeProgressListener(ProgressListener listener) {
        progressMonitor.removeProgressListener(listener);
    }

    /**
     * Process one input. If you have no need for formatting output at this time
     * use this. If you have complex ExtractionResults where you want to add
     * meta attributes, then you would use this approach.
     *
     * An {@link ExtractionException} from one extractor is logged and recorded
     * in {@link #currErrors}; the remaining extractors still run. This method
     * does not clear previously recorded errors; see {@link #reset()}.
     *
     * @param input the text to run through every registered extractor
     * @return all matches from all extractors on a single list, in extractor
     *         order; empty if nothing was found
     */
    public List<TextMatch> process(TextInput input) {
        List<TextMatch> oneResultSet = new ArrayList<TextMatch>();
        progressMonitor.setNumberOfSteps(extractors.size());

        // Process all extraction and compile on a single list.
        for (Extractor x : extractors) {
            try {
                List<TextMatch> results = x.extract(input);
                if (results != null && !results.isEmpty()) {
                    oneResultSet.addAll(results);
                }
            } catch (ExtractionException loopErr) {
                log.error("Extractor=" + x.getName() + " on Input=" + input.id, loopErr);
                currErrors.add("Extractor=" + x.getName() + " ERR=" + loopErr.getMessage());
            }
        }

        progressMonitor.completeDocument();
        return oneResultSet;
    }

    /**
     * Format each result; some formatters may pass on results. For example,
     * Shapefile formatter accepts only Geocoding-capable TextMatch.
     *
     * A {@link ProcessingException} from one formatter is logged and recorded
     * in {@link #currErrors}; the remaining formatters still run.
     *
     * @param compilation the compiled extraction result to hand to each formatter
     * @return {@link #STATUS_FORMATTED} if at least one formatter completed,
     *         otherwise {@link #STATUS_NOT_FORMATTED}
     */
    public int format(ExtractionResult compilation) {
        int status = STATUS_NOT_FORMATTED;
        for (ResultsFormatter fmt : formatters) {
            try {
                fmt.formatResults(compilation);
                status = STATUS_FORMATTED;
            } catch (ProcessingException fmtErr) {
                log.error("Formatter=" + fmt.getOutputType(), fmtErr);
                currErrors.add("Formatter=" + fmt.getOutputType() + " ERR=" + fmtErr.getMessage());
            }
        }
        return status;
    }

    /**
     * Use only if you intend to shutdown. Calls cleanup() on every extractor.
     * A runtime failure in one extractor's cleanup is logged and recorded in
     * {@link #currErrors} so that the remaining extractors are still cleaned up.
     *
     * NOTE(review): formatters are not touched here; confirm whether
     * ResultsFormatter needs its own shutdown step.
     */
    public void cleanupAll() {
        for (Extractor x : extractors) {
            try {
                x.cleanup();
            } catch (RuntimeException cleanupErr) {
                // Keep going: one misbehaving extractor must not block release of the others.
                log.error("Cleanup failed for Extractor=" + x.getName(), cleanupErr);
                currErrors.add("Extractor=" + x.getName() + " ERR=" + cleanupErr.getMessage());
            }
        }
    }

    /**
     * DRAFT: still figuring out the rules for 'reset' between processing or
     * inputs. Currently this only clears the accumulated error list.
     */
    public void reset() {
        currErrors.clear();
    }

    /**
     * Processes input content against all extractors and all formatters. This
     * does not throw exceptions, as some processing may fail, while others
     * succeed. Errors from the previous call are cleared first; errors from
     * this call are accumulated in {@link #currErrors}.
     *
     * TODO: Processing/Formatting details would have to be retrieved
     * by calling some other method that is statefully tracking such things.
     *
     * @param input the input to process; a null input is reported as a failure
     * @return status: {@link #STATUS_FAILURE} (-1) failure,
     *         {@link #STATUS_NOTHING_FOUND} (0) nothing found,
     *         {@link #STATUS_FORMATTED} (1) found matches and formatted them,
     *         {@link #STATUS_NOT_FORMATTED} (2) found content but nothing
     *         formatted.
     */
    public int processAndFormat(TextInput input) {
        reset();

        if (input == null) {
            // Honor the documented -1 status instead of failing with a NullPointerException.
            log.error("No input provided; nothing to process");
            currErrors.add("Input ERR=null input");
            return STATUS_FAILURE;
        }

        ExtractionResult compilation = new ExtractionResult(input.id);
        if (input instanceof DocInput) {
            DocInput doc = (DocInput) input;
            compilation.recordFile = doc.getFilepath();
            compilation.recordTextFile = doc.getTextpath();
        }

        compilation.matches = process(input);
        compilation.input = input;

        if (compilation.matches.isEmpty()) {
            return STATUS_NOTHING_FOUND;
        }

        return format(compilation);
    }
}