package eu.project.ttc.api;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Stream;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import eu.project.ttc.engines.cleaner.TermProperty;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.history.TermHistory;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.readers.TermSuiteJsonCasDeserializer;
import eu.project.ttc.tools.TermSuitePipeline;
import eu.project.ttc.tools.api.internal.FileSystemUtils;
import eu.project.ttc.tools.api.internal.PipelineUtils;
import eu.project.ttc.utils.JCasUtils;
/**
*
* A builder and launcher class for executing a terminology extraction
* pipeline from raw text files or from TermSuite preprocessed files.
*
* @author Damien Cram
*
* @see TermSuitePreprocessor
*
*/
public class TerminoExtractor {

    /**
     * Mode handed to the contextualizer engine. In {@link #execute()},
     * {@code ON_ALL_TERMS} is translated to the boolean flag {@code true}
     * of {@code TermSuitePipeline#aeContextualizer}, any other value to {@code false}.
     */
    public enum ContextualizerMode {ON_ALL_TERMS, ON_SWT_TERMS}

    /*
     * TreeTagger home
     */
    private String treeTaggerHome;

    /*
     * Pipeline language
     */
    private Lang lang = null;

    /*
     * Custom resources
     */
    private Optional<String> customResourceDir = Optional.empty();

    /*
     * Contextualizer properties
     */
    private boolean useContextualizer = false;
    private int contextualizerScope = 3;
    private ContextualizerMode contextualizerMode = ContextualizerMode.ON_SWT_TERMS;

    /*
     * true if the input is preprocessed, false otherwise.
     */
    private boolean preprocessed = false;

    /*
     * The maximum number of terms allowed in memory. empty
     * if maxSizeFiltering is deactivated.
     */
    private Optional<Integer> maxSizeFilter = Optional.empty();

    /*
     * The total number of documents, or -1 when unknown
     * (in which case no document logger is added to the pipeline).
     */
    private long nbDocuments = -1;

    /*
     * The input streams.
     */
    // document stream when the input is not preprocessed
    private Stream<Document> documentStream;
    // jcas stream when the input is preprocessed
    private Stream<JCas> preprocessedCasStream;

    /*
     * true if the extension detector, scorer and ranker engines
     * must be added to the pipeline.
     */
    private boolean scoringEnabled = true;

    /*
     * true if the variation detection engines (suffix derivation,
     * syntactic and graphical variant gatherers) must be added to the pipeline.
     */
    private boolean variationDetectionEnabled = true;

    /*
     * Filter properties
     */
    private Optional<TerminoFilterConfig> postFilterConfig = Optional.empty();
    private Optional<TerminoFilterConfig> preFilterConfig = Optional.empty();

    /*
     * The optional term history set on the pipeline before it runs.
     */
    private Optional<TermHistory> history = Optional.empty();

    /**
     * Creates an extractor over a single in-memory text, wrapped in a
     * {@link Document} whose url is {@code file://inline.text}.
     *
     * @param lang
     *          the language of the text
     * @param text
     *          the raw text to extract the terminology from
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromTextString(Lang lang, String text) {
        return fromSingleDocument(lang, new Document(lang, "file://inline.text", text));
    }

    /**
     * Creates an extractor over a single raw (not preprocessed) document.
     *
     * @param lang
     *          the pipeline language
     * @param document
     *          the only document of the corpus
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromSingleDocument(Lang lang, Document document) {
        return fromDocumentCollection(lang, Lists.newArrayList(document));
    }

    /**
     * Same as {@link #fromPreprocessedJsonFiles(Lang, String, String)}, using
     * the name of the platform default charset as encoding.
     *
     * @param lang
     *          the pipeline language
     * @param directory
     *          the directory searched for {@code **}{@code /*.json} files
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromPreprocessedJsonFiles(Lang lang, String directory) {
        return fromPreprocessedJsonFiles(lang, directory, Charset.defaultCharset().name());
    }

    /**
     * Creates an extractor over the preprocessed JSON cas files matching
     * {@code **}{@code /*.json} under the given directory.
     *
     * Any failure while reading a file is rethrown as a {@link TermSuiteException}
     * by the mapper, i.e. when the stream is consumed.
     *
     * @param lang
     *          the pipeline language, also set as document language on every cas
     * @param directory
     *          the directory searched for json files
     * @param encoding
     *          the encoding passed to the json deserializer
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromPreprocessedJsonFiles(Lang lang, String directory, String encoding) {
        Function<Path, JCas> mapper = path -> {
            try {
                JCas cas = JCasFactory.createJCas();
                cas.setDocumentLanguage(lang.getCode());
                // try-with-resources: the file handle must not outlive the deserialization
                try (FileInputStream input = new FileInputStream(path.toFile())) {
                    TermSuiteJsonCasDeserializer.deserialize(input, cas.getCas(), encoding);
                }
                return cas;
            } catch (Exception e) {
                throw new TermSuiteException("Unable to parse cas file " + path, e);
            }
        };

        return fromPreprocessedDocumentStream(
                lang,
                FileSystemUtils.pathWalker(directory, "**/*.json", mapper),
                FileSystemUtils.pathDocumentCount(directory, "**/*.json")
            );
    }

    /**
     * Creates an extractor over the preprocessed XMI cas files matching
     * {@code **}{@code /*.xmi} under the given directory.
     *
     * WARNING : encoding of XMI file must be UTF-8.
     *
     * Any failure while reading a file is rethrown as a {@link TermSuiteException}
     * by the mapper, i.e. when the stream is consumed.
     *
     * @param lang
     *          the pipeline language, also set as document language on every cas
     * @param directory
     *          the directory searched for xmi files
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromPreprocessedXmiFiles(Lang lang, String directory) {
        Function<Path, JCas> mapper = path -> {
            try {
                JCas cas = JCasFactory.createJCas();
                cas.setDocumentLanguage(lang.getCode());
                // try-with-resources: the file handle must not outlive the deserialization
                try (FileInputStream input = new FileInputStream(path.toFile())) {
                    XmiCasDeserializer.deserialize(input, cas.getCas());
                }
                return cas;
            } catch (Exception e) {
                throw new TermSuiteException("Unable to parse cas file " + path, e);
            }
        };

        return fromPreprocessedDocumentStream(
                lang,
                FileSystemUtils.pathWalker(directory, "**/*.xmi", mapper),
                FileSystemUtils.pathDocumentCount(directory, "**/*.xmi")
            );
    }

    /**
     * Creates an extractor over a stream of already preprocessed cases.
     *
     * @param lang
     *          the pipeline language
     * @param casStream
     *          the preprocessed cases, consumed (and closed) by {@link #execute()}
     * @param streamSize
     *          the number of documents in the stream, or -1 to disable
     *          the document logger
     * @return
     *          a new {@link TerminoExtractor} flagged as preprocessed
     */
    public static TerminoExtractor fromPreprocessedDocumentStream(Lang lang, Stream<JCas> casStream, long streamSize) {
        TerminoExtractor extractor = new TerminoExtractor();
        extractor.lang = lang;
        extractor.preprocessed = true;
        extractor.preprocessedCasStream = casStream;
        extractor.nbDocuments = streamSize;
        return extractor;
    }

    /**
     * Creates an extractor over a stream of raw (not preprocessed) documents.
     *
     * @param lang
     *          the pipeline language
     * @param documentStream
     *          the documents, consumed (and closed) by {@link #execute()}
     * @param streamSize
     *          the number of documents in the stream, or -1 to disable
     *          the document logger
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromDocumentStream(Lang lang, Stream<Document> documentStream, long streamSize) {
        TerminoExtractor extractor = new TerminoExtractor();
        extractor.lang = lang;
        extractor.documentStream = documentStream;
        extractor.nbDocuments = streamSize;
        return extractor;
    }

    /**
     * Creates an extractor over an in-memory collection of raw documents.
     *
     * @param lang
     *          the pipeline language
     * @param documents
     *          the documents of the corpus
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromDocumentCollection(Lang lang, Collection<Document> documents) {
        return fromDocumentStream(lang, documents.stream(), documents.size());
    }

    /**
     * Same as {@link #fromTxtCorpus(Lang, String, String, String)}, using
     * the name of the platform default charset as encoding.
     *
     * @param lang
     *          the pipeline language
     * @param directory
     *          the corpus directory
     * @param pattern
     *          the file pattern handed to the path walker
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromTxtCorpus(Lang lang, String directory, String pattern) {
        return fromTxtCorpus(lang, directory, pattern, Charset.defaultCharset().name());
    }

    /**
     * Creates an extractor over the text files of a directory that match
     * the given pattern.
     *
     * @param lang
     *          the pipeline language
     * @param directory
     *          the corpus directory
     * @param pattern
     *          the file pattern handed to the path walker
     * @param encoding
     *          the encoding handed to the path-to-document mapper
     * @return
     *          a new {@link TerminoExtractor}
     */
    public static TerminoExtractor fromTxtCorpus(Lang lang, String directory, String pattern, String encoding) {
        Stream<Document> documents = FileSystemUtils.pathWalker(
                directory,
                pattern,
                FileSystemUtils.pathToDocumentMapper(lang, encoding));
        return fromDocumentStream(
                lang,
                documents,
                FileSystemUtils.pathDocumentCount(directory, pattern));
    }

    /**
     * Creates an extractor over a single already preprocessed cas.
     *
     * @param lang
     *          the pipeline language
     * @param cas
     *          the only preprocessed cas of the corpus
     * @return
     *          a new {@link TerminoExtractor} flagged as preprocessed
     */
    public static TerminoExtractor fromSinglePreprocessedDocument(Lang lang, JCas cas) {
        return fromPreprocessedDocumentStream(
                lang,
                Lists.newArrayList(cas).stream(),
                1);
    }

    /**
     * Removes the variation detection engines from the pipeline.
     *
     * @return
     *          this {@link TerminoExtractor} launcher class
     */
    public TerminoExtractor disableVariationDetection() {
        this.variationDetectionEnabled = false;
        return this;
    }

    /**
     * Removes the extension detector, scorer and ranker engines from the pipeline.
     *
     * @return
     *          this {@link TerminoExtractor} launcher class
     */
    public TerminoExtractor disableScoring() {
        this.scoringEnabled = false;
        return this;
    }

    /**
     * Sets the TreeTagger home. The value is handed as-is to the pipeline
     * when the input is not preprocessed.
     *
     * @param treeTaggerHome
     *          the TreeTagger home
     * @return
     *          this {@link TerminoExtractor} launcher class
     */
    public TerminoExtractor setTreeTaggerHome(String treeTaggerHome) {
        this.treeTaggerHome = treeTaggerHome;
        return this;
    }

    /**
     * Makes the pipeline use the given directory as resource directory.
     *
     * @param resourceDir
     *          the path to an existing directory
     * @return
     *          this {@link TerminoExtractor} launcher class
     * @throws IllegalArgumentException
     *          if the path does not exist or is not a directory
     */
    public TerminoExtractor usingCustomResources(String resourceDir) {
        File dir = new File(resourceDir);
        Preconditions.checkArgument(dir.exists(), "Directory %s does not exist", resourceDir);
        Preconditions.checkArgument(dir.isDirectory(), "Not a directory: %s", resourceDir);
        // Store the validated directory: it is read back in execute().
        this.customResourceDir = Optional.of(resourceDir);
        return this;
    }

    /**
     * Adds the contextualizer engine to the pipeline.
     *
     * @param scope
     *          the scope handed to the contextualizer engine
     * @param contextualizerMode
     *          the contextualizer mode
     * @return
     *          this {@link TerminoExtractor} launcher class
     */
    public TerminoExtractor useContextualizer(int scope, ContextualizerMode contextualizerMode) {
        this.useContextualizer = true;
        this.contextualizerScope = scope;
        this.contextualizerMode = contextualizerMode;
        return this;
    }

    /**
     * Filters the {@link TermIndex} before the term variant detection phase.
     *
     * This early-stage filtering will result in missing several low-frequency variations
     * during the term variation detection but is often necessary
     * when detecting variants takes too long.
     *
     * @param filterConfig
     *          The filtering configuration
     * @return
     *          this {@link TerminoExtractor} launcher class
     *
     * @see #postFilter(TerminoFilterConfig)
     *
     */
    public TerminoExtractor preFilter(TerminoFilterConfig filterConfig) {
        this.preFilterConfig = Optional.of(filterConfig);
        return this;
    }

    /**
     *
     * Filters the {@link TermIndex} dynamically during the term spotting phase (RegexSpotter)
     * of terminology extraction by cleaning by frequency whenever the number of terms in-memory
     * exceeds a max number of terms allowed.
     *
     *
     * @param maxTermIndexSize
     *          the maximum number of {@link Term} instances allowed to be kept in memory
     *          during the terminology extraction process.
     * @return
     *          this {@link TerminoExtractor} launcher class
     *
     * @see TermSuitePipeline#aeMaxSizeThresholdCleaner(TermProperty, int)
     *
     */
    public TerminoExtractor dynamicMaxSizeFilter(int maxTermIndexSize) {
        this.maxSizeFilter = Optional.of(maxTermIndexSize);
        return this;
    }

    /**
     * Filters the {@link TermIndex} at the end of the pipeline,
     * i.e. after the term variant detection phase.
     *
     * This filtering is loss-less when configured with {@code TerminoFilterConfig#keepVariants(true)}.
     *
     * @param filterConfig
     *          The filtering configuration
     * @return
     *          this {@link TerminoExtractor} launcher class
     *
     * @see #preFilter(TerminoFilterConfig)
     *
     */
    public TerminoExtractor postFilter(TerminoFilterConfig filterConfig) {
        this.postFilterConfig = Optional.of(filterConfig);
        return this;
    }

    /**
     * Sets the {@link TermHistory} that is handed to the pipeline
     * before it runs.
     *
     * @param history
     *          the term history, must not be null
     * @return
     *          this {@link TerminoExtractor} launcher class
     */
    public TerminoExtractor setWatcher(TermHistory history) {
        this.history = Optional.of(history);
        return this;
    }

    /**
     * Builds the pipeline from the current configuration, runs it on every
     * input document and returns the resulting term index.
     *
     * The input stream is consumed and closed by this method.
     *
     * @return
     *          the {@link TermIndex} of the executed pipeline
     * @throws NullPointerException
     *          if no language has been set
     * @throws TermSuiteException
     *          if the analysis engine cannot be created or if the
     *          processing of a document fails
     */
    public TermIndex execute() {
        Preconditions.checkNotNull(this.lang, "Language cannot be null");

        TermSuitePipeline pipeline = buildPipeline();

        ResourceManager resMgr = UIMAFramework.newDefaultResourceManager();
        try {
            // Create AAE
            AnalysisEngineDescription aaeDesc = createEngineDescription(pipeline.createDescription());

            // Instantiate AAE
            final AnalysisEngine aae = UIMAFramework.produceAnalysisEngine(aaeDesc, resMgr, null);

            if (preprocessed) {
                processPreprocessedCases(aae);
            } else {
                processDocuments(aae);
            }

            aae.collectionProcessComplete();
        } catch (ResourceInitializationException | AnalysisEngineProcessException e1) {
            throw new TermSuiteException(e1);
        }

        return pipeline.getTermIndex();
    }

    /*
     * Assembles the pipeline stages. The order of the stages is significant
     * and must be kept as is.
     */
    private TermSuitePipeline buildPipeline() {
        TermSuitePipeline pipeline = TermSuitePipeline
                .create(lang.getCode());

        history.ifPresent(h -> pipeline.setHistory(h));
        customResourceDir.ifPresent(dir -> pipeline.setResourceDir(dir));

        if (preprocessed) {
            pipeline.aeUrlFilter()
                .aeTermOccAnnotationImporter();
        } else {
            pipeline.aeWordTokenizer()
                .setTreeTaggerHome(this.treeTaggerHome)
                .aeTreeTagger()
                .aeUrlFilter()
                .aeStemmer()
                .aeRegexSpotter();
        }

        preFilterConfig.ifPresent(config -> PipelineUtils.filter(pipeline, config));

        if (useContextualizer) {
            pipeline.aeContextualizer(
                    contextualizerScope,
                    contextualizerMode == ContextualizerMode.ON_ALL_TERMS);
        }

        // -1 means that the number of documents is unknown
        if (nbDocuments != -1) {
            pipeline.aeDocumentLogger(this.nbDocuments);
        }

        maxSizeFilter.ifPresent(
                maxSize -> pipeline.aeMaxSizeThresholdCleaner(TermProperty.FREQUENCY, maxSize));

        pipeline
            .aeSpecificityComputer()
            .aeCompostSplitter()
            .aePrefixSplitter();

        if (variationDetectionEnabled) {
            pipeline
                .aeSuffixDerivationDetector()
                .aeSyntacticVariantGatherer()
                .aeGraphicalVariantGatherer();
        }

        if (scoringEnabled) {
            pipeline.aeExtensionDetector()
                .aeScorer()
                .aeRanker(TermProperty.SPECIFICITY, true);
        }

        postFilterConfig.ifPresent(config -> PipelineUtils.filter(pipeline, config));

        return pipeline;
    }

    /*
     * Sends every preprocessed cas to the analysis engine, then closes the stream.
     */
    private void processPreprocessedCases(AnalysisEngine aae) {
        // The stream may hold file system resources: close it once consumed.
        try (Stream<JCas> cases = preprocessedCasStream) {
            cases.forEach(cas -> {
                try {
                    aae.process(cas);
                } catch (UIMAException e) {
                    throw new TermSuiteException(e);
                }
            });
        }
    }

    /*
     * Wraps every raw document into a fresh cas, sends it to the analysis
     * engine, then closes the stream.
     */
    private void processDocuments(AnalysisEngine aae) {
        // The stream may hold file system resources: close it once consumed.
        try (Stream<Document> documents = documentStream) {
            documents.forEach(document -> {
                try {
                    String langCode = document.getLang().getCode();
                    JCas cas = JCasFactory.createJCas();
                    cas.setDocumentLanguage(langCode);
                    cas.setDocumentText(document.getText());
                    JCasUtils.initJCasSDI(
                            cas,
                            langCode,
                            document.getText(),
                            document.getUrl());
                    aae.process(cas);
                } catch (UIMAException e) {
                    throw new TermSuiteException(e);
                }
            });
        }
    }
}