/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package eu.project.ttc.tools;

import java.io.File;
import java.io.Serializable;
import java.math.BigInteger;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.SecureRandom;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;

import org.apache.commons.lang.mutable.MutableInt;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ExternalResourceDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;

import eu.project.ttc.engines.CasStatCounter;
import eu.project.ttc.engines.Contextualizer;
import eu.project.ttc.engines.DocumentLogger;
import eu.project.ttc.engines.EvalEngine;
import eu.project.ttc.engines.ExtensionDetecter;
import eu.project.ttc.engines.FixedExpressionSpotter;
import eu.project.ttc.engines.FixedExpressionTermMarker;
import eu.project.ttc.engines.GraphicalVariantGatherer;
import eu.project.ttc.engines.MateLemmaFixer;
import eu.project.ttc.engines.MateLemmatizerTagger;
import eu.project.ttc.engines.Merger;
import eu.project.ttc.engines.PipelineObserver;
import eu.project.ttc.engines.PrimaryOccurrenceDetector;
import eu.project.ttc.engines.Ranker;
import eu.project.ttc.engines.RegexSpotter;
import eu.project.ttc.engines.ScorerAE;
import eu.project.ttc.engines.StringRegexFilter;
import eu.project.ttc.engines.SyntacticTermGatherer;
import eu.project.ttc.engines.TermClassifier;
import eu.project.ttc.engines.TermIndexBlacklistWordFilterAE;
import eu.project.ttc.engines.TermOccAnnotationImporter;
import eu.project.ttc.engines.TermSpecificityComputer;
import eu.project.ttc.engines.TreeTaggerLemmaFixer;
import eu.project.ttc.engines.cleaner.AbstractTermIndexCleaner;
import eu.project.ttc.engines.cleaner.MaxSizeThresholdCleaner;
import eu.project.ttc.engines.cleaner.TermIndexThresholdCleaner;
import eu.project.ttc.engines.cleaner.TermIndexTopNCleaner;
import eu.project.ttc.engines.cleaner.TermProperty;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.engines.desc.TermSuiteCollection;
import eu.project.ttc.engines.desc.TermSuitePipelineException;
import eu.project.ttc.engines.exporter.CompoundExporterAE;
import eu.project.ttc.engines.exporter.EvalExporterAE;
import eu.project.ttc.engines.exporter.ExportVariationRuleExamplesAE;
import eu.project.ttc.engines.exporter.JsonCasExporter;
import eu.project.ttc.engines.exporter.JsonExporterAE;
import eu.project.ttc.engines.exporter.SpotterTSVWriter;
import eu.project.ttc.engines.exporter.TSVExporterAE;
import eu.project.ttc.engines.exporter.TbxExporterAE;
import eu.project.ttc.engines.exporter.TermsuiteJsonCasExporter;
import eu.project.ttc.engines.exporter.VariantEvalExporterAE;
import eu.project.ttc.engines.exporter.VariationExporterAE;
import eu.project.ttc.engines.exporter.XmiCasExporter;
import eu.project.ttc.engines.morpho.CompostAE;
import eu.project.ttc.engines.morpho.ManualCompositionSetter;
import eu.project.ttc.engines.morpho.ManualPrefixSetter;
import eu.project.ttc.engines.morpho.PrefixSplitter;
import eu.project.ttc.engines.morpho.SuffixDerivationDetecter;
import eu.project.ttc.engines.morpho.SuffixDerivationExceptionSetter;
import eu.project.ttc.history.TermHistory;
import eu.project.ttc.history.TermHistoryResource;
import eu.project.ttc.metrics.LogLikelihood;
import eu.project.ttc.models.OccurrenceStore;
import eu.project.ttc.models.OccurrenceType;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.models.index.MemoryTermIndex;
import eu.project.ttc.models.occstore.MemoryOccurrenceStore;
import eu.project.ttc.models.occstore.MongoDBOccurrenceStore;
import eu.project.ttc.readers.AbstractToTxtSaxHandler;
import eu.project.ttc.readers.CollectionDocument;
import eu.project.ttc.readers.EmptyCollectionReader;
import eu.project.ttc.readers.GenericXMLToTxtCollectionReader;
import eu.project.ttc.readers.IstexCollectionReader;
import eu.project.ttc.readers.JsonCollectionReader;
import eu.project.ttc.readers.QueueRegistry;
import eu.project.ttc.readers.StreamingCollectionReader;
import eu.project.ttc.readers.StringCollectionReader;
import eu.project.ttc.readers.TeiCollectionReader;
import eu.project.ttc.readers.TxtCollectionReader;
import eu.project.ttc.readers.XmiCollectionReader;
import eu.project.ttc.resources.CharacterFootprintTermFilter;
import eu.project.ttc.resources.CompostInflectionRules;
import eu.project.ttc.resources.EvalTrace;
import eu.project.ttc.resources.FixedExpressionResource;
import eu.project.ttc.resources.GeneralLanguageResource;
import eu.project.ttc.resources.ManualSegmentationResource;
import eu.project.ttc.resources.MateLemmatizerModel;
import eu.project.ttc.resources.MateTaggerModel;
import eu.project.ttc.resources.ObserverResource;
import eu.project.ttc.resources.PrefixTree;
import eu.project.ttc.resources.ReferenceTermList;
import eu.project.ttc.resources.SimpleWordSet;
import eu.project.ttc.resources.SuffixDerivationList;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.resources.TermSuitePipelineObserver;
import eu.project.ttc.resources.YamlVariantRules;
import eu.project.ttc.stream.CasConsumer;
import eu.project.ttc.stream.ConsumerRegistry;
import eu.project.ttc.stream.DocumentProvider;
import eu.project.ttc.stream.DocumentStream;
import eu.project.ttc.stream.StreamingCasConsumer;
import eu.project.ttc.types.FixedExpression;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.FileUtils;
import eu.project.ttc.utils.OccurrenceBuffer;
import fr.free.rocheteau.jerome.engines.Stemmer;
import fr.univnantes.julestar.uima.resources.MultimapFlatResource;
import fr.univnantes.lina.uima.ChineseSegmenterResourceHelper;
import fr.univnantes.lina.uima.engines.ChineseSegmenter;
import fr.univnantes.lina.uima.engines.TreeTaggerWrapper;
import fr.univnantes.lina.uima.models.ChineseSegmentResource;
import fr.univnantes.lina.uima.models.TreeTaggerParameter;
import fr.univnantes.lina.uima.tkregex.ae.RegexListResource;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import uima.sandbox.filter.resources.DefaultFilterResource;
import uima.sandbox.filter.resources.FilterResource;
import uima.sandbox.lexer.engines.Lexer;
import uima.sandbox.lexer.resources.SegmentBank;
import uima.sandbox.lexer.resources.SegmentBankResource;
import uima.sandbox.mapper.engines.Mapper;
import uima.sandbox.mapper.resources.Mapping;
import uima.sandbox.mapper.resources.MappingResource;

/*
 * TODO Integrate frozen expressions
 * TODO Integrate Sonar runner
 * TODO Add functional pipeline TestCases for each collection type and for different pipeline configs
 */

/**
 * A collection reader and AE aggregator (builder pattern) that
 * creates and runs a full pipeline.
 *
 * @author Damien Cram
 */
public class TermSuitePipeline {

    /* The Logger */
    private static final Logger LOGGER = LoggerFactory.getLogger(TermSuitePipeline.class);

    /* ******************************
     * MAIN PIPELINE PARAMETERS
     */
    private OccurrenceStore occurrenceStore = new MemoryOccurrenceStore();
    private Optional<? extends TermIndex> termIndex = Optional.absent();
    private Lang lang;
    private CollectionReaderDescription crDescription;
    private String pipelineObserverName;
    private AggregateBuilder aggregateBuilder;
    private String termHistoryResourceName = "PipelineHistory";

    /*
     * POS Tagger parameters
     */
    private Optional<String> mateModelsPath = Optional.absent();
    private Optional<String> treeTaggerPath = Optional.absent();

    /*
     * Regex Spotter params
     */
    private boolean addSpottedAnnoToTermIndex = true;
    private boolean spotWithOccurrences = true;
    private Optional<Boolean> logOverlappingRules = Optional.absent();
    private Optional<String> postProcessingStrategy = Optional.absent();
    private boolean enableSyntacticLabels = false;

    /*
     * Contextualizer options
     */
    private OccurrenceType contextualizeCoTermsType = OccurrenceType.SINGLE_WORD;
    private boolean contextualizeWithTermClasses = false;
    private int contextualizeWithCoOccurrenceFrequencyThreshhold = 1;
    private String contextAssocRateMeasure = LogLikelihood.class.getName();

    /*
     * Cleaner properties
     */
    private boolean keepVariantsWhileCleaning = false;

    /*
     * Compost params
     */
    private Optional<Float> alpha = Optional.absent();
    private Optional<Float> beta = Optional.absent();
    private Optional<Float> gamma = Optional.absent();
    private Optional<Float> delta = Optional.absent();
    private Optional<Float> compostScoreThreshold = Optional.absent();
    private Optional<Integer> compostMinComponentSize = Optional.absent();
    private Optional<Integer> compostMaxComponentNum = Optional.absent();
    private Optional<Float> compostSegmentSimilarityThreshold = Optional.of(1f);

    /*
     * Graphical Variant Gatherer parameters
     */
    private Optional<Float> graphicalVariantSimilarityThreshold = Optional.absent();

    /* JSON */
    private boolean exportJsonWithOccurrences = true;
    private boolean exportJsonWithContext = false;
    private boolean linkMongoStore = false;

    /* TSV */
    private String tsvExportProperties = "groupingKey,wr";
    private boolean tsvWithVariantScores = false;
    private boolean tsvWithHeaders = true;

    /*
     * Streaming parameters
     */
    private Thread streamThread = null;
    private DocumentProvider documentProvider;

    /* *******************
     * CONSTRUCTORS
     */
    private TermSuitePipeline(String lang, String urlPrefix) {
        this.lang = Lang.forName(lang);
        this.aggregateBuilder = new AggregateBuilder();
        this.pipelineObserverName = PipelineObserver.class.getSimpleName() + "-" + Thread.currentThread().getId() + "-" + System.currentTimeMillis();
        TermSuiteResourceManager.getInstance().register(pipelineObserverName, new TermSuitePipelineObserver(2, 1));
        this.termHistoryResourceName = TermHistory.class.getSimpleName() + "-" + Thread.currentThread().getId() + "-" + System.currentTimeMillis();
        TermSuiteResourceManager.getInstance().register(termHistoryResourceName, new TermHistory());
        initUIMALogging();
    }

    private void initUIMALogging() {
        System.setProperty("org.apache.uima.logger.class", UIMASlf4jWrapperLogger.class.getName());
    }

    /**
     * Starts a chaining {@link TermSuitePipeline} builder.
     *
     * @param lang
     *          The language of the pipeline, as a name or code understood by {@link Lang#forName(String)}
     * @return
     *          The chaining builder.
     */
    public static TermSuitePipeline create(String lang) {
        return new TermSuitePipeline(lang, null);
    }

    public static TermSuitePipeline create(TermIndex termIndex) {
        Preconditions.checkNotNull(termIndex.getName(), "The term index must have a name before it can be used in TermSuitePipeline");

        if(!TermSuiteResourceManager.getInstance().contains(termIndex.getName()))
            TermSuiteResourceManager.getInstance().register(termIndex.getName(), termIndex);

        TermSuitePipeline pipeline = create(termIndex.getLang().getCode());
        pipeline.emptyCollection();
        pipeline.setTermIndex(termIndex);
        return pipeline;
    }
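    /*
     * Illustrative usage sketch of this builder (the corpus and TreeTagger
     * paths are hypothetical, and the chain of AEs shown is only one possible
     * configuration; any aeXXX()/haeXXX() method of this class can be chained
     * the same way):
     *
     *   TermSuitePipeline.create("en")
     *       .setResourceDir("/path/to/termsuite/resources/")
     *       .setCollection(TermSuiteCollection.TXT, "/path/to/corpus", "UTF-8")
     *       .aeWordTokenizer()
     *       .setTreeTaggerHome("/path/to/treetagger")
     *       .aeTreeTagger()
     *       .aeRegexSpotter()
     *       .run();
     */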
    /* *******************************
     * RUNNERS
     */

    /**
     * Runs the pipeline with {@link SimplePipeline} on the {@link CollectionReader} that must have been defined.
     *
     * @throws TermSuitePipelineException if no {@link CollectionReader} has been declared on this pipeline
     */
    public TermSuitePipeline run() {
        checkCR();
        runPipeline();
        return this;
    }

    private void runPipeline() {
        try {
            SimplePipeline.runPipeline(this.crDescription, createDescription());
            terminates();
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public DocumentStream stream(CasConsumer consumer) {
        try {
            String id = new BigInteger(130, new SecureRandom()).toString(8);
            String casConsumerName = "pipeline-" + id + "-consumer";
            ConsumerRegistry.getInstance().registerConsumer(casConsumerName, consumer);
            String queueName = "pipeline-" + id + "-queue";
            final BlockingQueue<CollectionDocument> q = QueueRegistry.getInstance().registerQueue(queueName, 10);

            /*
             * 1- Creates the streaming collection reader desc
             */
            this.crDescription = CollectionReaderFactory.createReaderDescription(
                    StreamingCollectionReader.class,
                    StreamingCollectionReader.PARAM_LANGUAGE, this.lang.getCode(),
                    StreamingCollectionReader.PARAM_NAME, queueName,
                    StreamingCollectionReader.PARAM_QUEUE_NAME, queueName
                );

            /*
             * 2- Aggregate the consumer AE
             */
            AnalysisEngineDescription consumerAE = AnalysisEngineFactory.createEngineDescription(
                    StreamingCasConsumer.class,
                    StreamingCasConsumer.PARAM_CONSUMER_NAME, casConsumerName
                );
            this.aggregateBuilder.add(consumerAE);

            /*
             * 3- Starts the pipeline in a separate Thread
             */
            this.streamThread = new Thread() {
                @Override
                public void run() {
                    runPipeline();
                }
            };
            this.streamThread.start();

            /*
             * 4- Bind user inputs to the queue
             */
            documentProvider = new DocumentProvider() {
                @Override
                public void provide(CollectionDocument doc) {
                    try {
                        q.put(doc);
                    } catch (InterruptedException e) {
                        LOGGER.warn("Interrupted while there were more documents waiting.");
                    }
                }
            };

            return new DocumentStream(streamThread, documentProvider, consumer, queueName);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public Thread getStreamThread() {
        return streamThread;
    }

    private void checkCR() {
        if(crDescription == null)
            throw new TermSuitePipelineException("No collection reader has been declared on this pipeline.");
    }

    private void terminates() {
        if(termIndex.isPresent() && termIndex.get().getOccurrenceStore() instanceof MongoDBOccurrenceStore)
            ((MongoDBOccurrenceStore) termIndex.get().getOccurrenceStore()).close();
    }

    /**
     * Registers a pipeline listener.
     *
     * @param pipelineListener
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline addPipelineListener(PipelineListener pipelineListener) {
        TermSuiteResourceManager manager = TermSuiteResourceManager.getInstance();
        ((TermSuitePipelineObserver) manager.get(pipelineObserverName)).registerListener(pipelineListener);
        return this;
    }
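    /*
     * Streaming-mode sketch (illustrative; the CasConsumer implementation
     * "myCasConsumer" is assumed, not part of this file). Documents handed to
     * the returned stream's DocumentProvider are queued (capacity 10, see
     * stream(CasConsumer) above), processed asynchronously by the pipeline
     * thread, and each resulting CAS is handed to the consumer:
     *
     *   DocumentStream stream = pipeline.stream(myCasConsumer);
     */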
    /**
     * Runs the pipeline with {@link SimplePipeline} without requiring a {@link CollectionReader}
     * to be defined.
     *
     * @param cas the {@link JCas} on which the pipeline operates.
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline run(JCas cas) {
        try {
            SimplePipeline.runPipeline(cas, createDescription());
            terminates();
            return this;
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline setInlineString(String text) {
        try {
            this.crDescription = CollectionReaderFactory.createReaderDescription(
                    StringCollectionReader.class,
                    StringCollectionReader.PARAM_TEXT, text,
                    StringCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
                );
            return this;
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline setIstexCollection(String apiURL, List<String> documentsIds) {
        try {
            this.crDescription = CollectionReaderFactory.createReaderDescription(
                    IstexCollectionReader.class,
                    IstexCollectionReader.PARAM_IGNORE_LANGUAGE_ERRORS, true,
                    IstexCollectionReader.PARAM_LANGUAGE, this.lang.getCode(),
                    IstexCollectionReader.PARAM_ID_LIST, Joiner.on(",").join(documentsIds),
                    IstexCollectionReader.PARAM_API_URL, apiURL
                );
            return this;
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Creates a collection reader for this pipeline.
     *
     * @param termSuiteCollection
     * @param collectionPath
     * @param collectionEncoding
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline setCollection(TermSuiteCollection termSuiteCollection, String collectionPath, String collectionEncoding) {
        Preconditions.checkNotNull(termSuiteCollection);
        Preconditions.checkNotNull(collectionPath);
        Preconditions.checkNotNull(collectionEncoding);
        try {
            switch(termSuiteCollection) {
            case TEI:
                this.crDescription = CollectionReaderFactory.createReaderDescription(
                        TeiCollectionReader.class,
                        TeiCollectionReader.PARAM_INPUTDIR, collectionPath,
                        TxtCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
                        TeiCollectionReader.PARAM_ENCODING, collectionEncoding,
                        TeiCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
                    );
                break;
            case TXT:
                this.crDescription = CollectionReaderFactory.createReaderDescription(
                        TxtCollectionReader.class,
                        TxtCollectionReader.PARAM_INPUTDIR, collectionPath,
                        TxtCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
                        TxtCollectionReader.PARAM_ENCODING, collectionEncoding,
                        TxtCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
                    );
                break;
            case XMI:
                this.crDescription = CollectionReaderFactory.createReaderDescription(
                        XmiCollectionReader.class,
                        XmiCollectionReader.PARAM_INPUTDIR, collectionPath,
                        XmiCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
                        XmiCollectionReader.PARAM_ENCODING, collectionEncoding,
                        XmiCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
                    );
                break;
            case JSON:
                this.crDescription = CollectionReaderFactory.createReaderDescription(
                        JsonCollectionReader.class,
                        JsonCollectionReader.PARAM_INPUTDIR, collectionPath,
                        JsonCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
                        JsonCollectionReader.PARAM_ENCODING, collectionEncoding,
                        JsonCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
                    );
                break;
            case EMPTY:
                this.crDescription = CollectionReaderFactory.createReaderDescription(EmptyCollectionReader.class);
                break;
            default:
                throw new IllegalArgumentException("No such collection: " + termSuiteCollection);
            }
            return this;
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
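    /*
     * Example (illustrative; the corpus location is hypothetical): read a
     * directory of UTF-8 plain-text documents with the method above:
     *
     *   pipeline.setCollection(TermSuiteCollection.TXT, "/path/to/corpus/txt", "UTF-8");
     */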
    /**
     * Creates a collection reader of type {@link GenericXMLToTxtCollectionReader} for this pipeline.
     *
     * Requires a list of dropped tags and txt tags for collection parsing.
     *
     * @see AbstractToTxtSaxHandler
     *
     * @param termSuiteCollection
     * @param collectionPath
     * @param collectionEncoding
     * @param droppedTags
     * @param txtTags
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline setCollection(TermSuiteCollection termSuiteCollection, String collectionPath, String collectionEncoding, String droppedTags, String txtTags) {
        try {
            this.crDescription = CollectionReaderFactory.createReaderDescription(
                    GenericXMLToTxtCollectionReader.class,
                    GenericXMLToTxtCollectionReader.PARAM_COLLECTION_TYPE, termSuiteCollection,
                    GenericXMLToTxtCollectionReader.PARAM_DROPPED_TAGS, droppedTags,
                    GenericXMLToTxtCollectionReader.PARAM_TXT_TAGS, txtTags,
                    GenericXMLToTxtCollectionReader.PARAM_INPUTDIR, collectionPath,
                    GenericXMLToTxtCollectionReader.PARAM_ENCODING, collectionEncoding,
                    GenericXMLToTxtCollectionReader.PARAM_LANGUAGE, this.lang.getCode()
                );
            return this;
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Invoke this method if TermSuite resources are accessible via
     * a "file:/path/to/res/" URL, i.e. they can be found locally.
     *
     * @param resourceDir
     * @return
     */
    public TermSuitePipeline setResourceDir(String resourceDir) {
        Preconditions.checkArgument(new File(resourceDir).isDirectory(), "Not a directory: %s", resourceDir);
        if(!resourceDir.endsWith(File.separator))
            resourceDir = resourceDir + File.separator;
        // TermSuiteUtils.addToClasspath(resourceDir);
        try {
            this.resourceUrlPrefix = Optional.of(new URL("file:" + resourceDir));
            LOGGER.info("Resource URL prefix is: {}", this.resourceUrlPrefix.get());
        } catch (MalformedURLException e) {
            throw new TermSuitePipelineException(e);
        }
        return this;
    }

    public TermSuitePipeline setResourceJar(String resourceJar) {
        Preconditions.checkArgument(FileUtils.isJar(resourceJar), "Not a jar file: %s", resourceJar);
        try {
            this.resourceUrlPrefix = Optional.of(new URL("jar:file:" + resourceJar + "!/"));
            LOGGER.info("Resource URL prefix is: {}", this.resourceUrlPrefix.get());
        } catch (MalformedURLException e) {
            throw new TermSuitePipelineException(e);
        }
        return this;
    }

    private Optional<URL> resourceUrlPrefix = Optional.absent();

    public TermSuitePipeline setResourceUrlPrefix(String urlPrefix) {
        try {
            this.resourceUrlPrefix = Optional.of(new URL(urlPrefix));
        } catch (MalformedURLException e) {
            throw new TermSuitePipelineException("Bad url: " + urlPrefix, e);
        }
        return this;
    }

    public TermSuitePipeline setContextAssocRateMeasure(String contextAssocRateMeasure) {
        this.contextAssocRateMeasure = contextAssocRateMeasure;
        return this;
    }

    public TermSuitePipeline emptyCollection() {
        return setCollection(TermSuiteCollection.EMPTY, "", "UTF-8");
    }

    public AnalysisEngineDescription createDescription() {
        try {
            return this.aggregateBuilder.createAggregateDescription();
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline setHistory(TermHistory history) {
        TermSuiteResourceManager.getInstance().remove(termHistoryResourceName);
        TermSuiteResourceManager.getInstance().register(termHistoryResourceName, history);
        return this;
    }
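    /*
     * The three ways of pointing the pipeline at its resources, per the
     * setters above (all paths and the URL are hypothetical examples):
     *
     *   pipeline.setResourceDir("/opt/termsuite/resources/");      // "file:" prefix
     *   pipeline.setResourceJar("/opt/termsuite-resources.jar");   // "jar:file:...!/" prefix
     *   pipeline.setResourceUrlPrefix("file:/mnt/shared/res/");    // any well-formed URL prefix
     */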
    public TermSuitePipeline watch(String... termKeys) {
        TermHistory termHistory = (TermHistory) TermSuiteResourceManager.getInstance().get(termHistoryResourceName);
        termHistory.addWatchedTerms(termKeys);
        return this;
    }

    public String getHistoryResourceName() {
        return termHistoryResourceName;
    }

    public TermSuitePipeline aeWordTokenizer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    Lexer.class,
                    Lexer.PARAM_TYPE, "eu.project.ttc.types.WordAnnotation"
                );
            ExternalResourceDescription segmentBank = ExternalResourceFactory.createExternalResourceDescription(
                    SegmentBankResource.class,
                    getResUrl(TermSuiteResource.SEGMENT_BANK)
                );
            ExternalResourceFactory.bindResource(ae, SegmentBank.KEY_SEGMENT_BANK, segmentBank);
            return aggregateAndReturn(ae, "Word tokenizer", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    // private TermSuitePipeline aggregateAndReturn(AnalysisEngineDescription ae) {
    //     return aggregateAndReturn(ae, null, 0);
    // }

    private Map<String, MutableInt> taskNumbers = Maps.newHashMap();

    private String getNumberedTaskName(String taskName) {
        if(!taskNumbers.containsKey(taskName))
            taskNumbers.put(taskName, new MutableInt(0));
        taskNumbers.get(taskName).increment();
        return String.format("%s-%d", taskName, taskNumbers.get(taskName).intValue());
    }

    private TermSuitePipeline aggregateAndReturn(AnalysisEngineDescription ae, String taskName, int ccWeight) {
        Preconditions.checkNotNull(taskName);

        // Add the pre-task observer
        this.aggregateBuilder.add(aeObserver(taskName, ccWeight, PipelineObserver.TASK_STARTED));

        // Add the ae itself
        this.aggregateBuilder.add(ae);

        // Add the post-task observer
        this.aggregateBuilder.add(aeObserver(taskName, ccWeight, PipelineObserver.TASK_ENDED));

        return this;
    }

    private AnalysisEngineDescription aeObserver(String taskName, int weight, String hook) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    PipelineObserver.class,
                    PipelineObserver.TASK_NAME, taskName,
                    PipelineObserver.HOOK, hook,
                    PipelineObserver.WEIGHT, weight
                );
            ExternalResourceFactory.bindResource(ae, resObserver());
            return ae;
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline aeTreeTagger() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TreeTaggerWrapper.class,
                    TreeTaggerWrapper.PARAM_ANNOTATION_TYPE, "eu.project.ttc.types.WordAnnotation",
                    TreeTaggerWrapper.PARAM_TAG_FEATURE, "tag",
                    TreeTaggerWrapper.PARAM_LEMMA_FEATURE, "lemma",
                    TreeTaggerWrapper.PARAM_UPDATE_ANNOTATION_FEATURES, true,
                    TreeTaggerWrapper.PARAM_TT_HOME_DIRECTORY, this.treeTaggerPath.get()
                );
            ExternalResourceDescription ttParam = ExternalResourceFactory.createExternalResourceDescription(
                    TreeTaggerParameter.class,
                    getResUrl(TermSuiteResource.TREETAGGER_CONFIG, Tagger.TREE_TAGGER)
                );
            ExternalResourceFactory.bindResource(ae, TreeTaggerParameter.KEY_TT_PARAMETER, ttParam);
            return aggregateAndReturn(ae, "POS Tagging (TreeTagger)", 0).ttLemmaFixer().ttNormalizer();
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /*
     * Builds the resource url for this pipeline
     */
    private URL getResUrl(TermSuiteResource tsResource, Tagger tagger) {
        if(!resourceUrlPrefix.isPresent())
            return tsResource.fromClasspath(lang, tagger);
        else
            return tsResource.fromUrlPrefix(this.resourceUrlPrefix.get(), lang, tagger);
    }

    /*
     * Builds the resource url for this pipeline
     */
    private URL getResUrl(TermSuiteResource tsResource) {
        if(!resourceUrlPrefix.isPresent())
            return tsResource.fromClasspath(lang);
        else
            return tsResource.fromUrlPrefix(this.resourceUrlPrefix.get(), lang);
    }
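    /*
     * Note on resource resolution: when no URL prefix has been set via
     * setResourceDir, setResourceJar or setResourceUrlPrefix, the two
     * getResUrl variants above fall back to loading resources from the
     * classpath for the pipeline's language.
     */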
    public TermSuitePipeline setMateModelPath(String path) {
        Preconditions.checkArgument(Files.exists(Paths.get(path)), "Directory %s does not exist", path);
        Preconditions.checkArgument(Files.isDirectory(Paths.get(path)), "File %s is not a directory", path);
        this.mateModelsPath = Optional.of(path);
        return this;
    }

    public TermSuitePipeline aeMateTaggerLemmatizer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(MateLemmatizerTagger.class);

            Preconditions.checkState(mateModelsPath.isPresent(), "The path to mate models must be explicitly given. See method #setMateModelPath");
            String lemmatizerModel = Paths.get(mateModelsPath.get(), "mate-lemma-" + lang.getCode() + ".model").toString();
            String taggerModel = Paths.get(mateModelsPath.get(), "mate-pos-" + lang.getCode() + ".model").toString();
            Preconditions.checkArgument(Files.exists(Paths.get(lemmatizerModel)), "Lemmatizer model does not exist: %s", lemmatizerModel);
            Preconditions.checkArgument(Files.exists(Paths.get(taggerModel)), "Tagger model does not exist: %s", taggerModel);

            ExternalResourceFactory.createDependencyAndBind(ae, MateLemmatizerTagger.LEMMATIZER, MateLemmatizerModel.class, lemmatizerModel);
            ExternalResourceFactory.createDependencyAndBind(ae, MateLemmatizerTagger.TAGGER, MateTaggerModel.class, taggerModel);

            return aggregateAndReturn(ae, "POS Tagging (Mate)", 0)
                    .mateLemmaFixer()
                    .mateNormalizer();
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Defines the term properties that appear in the tsv export file.
     *
     * @see #haeTsvExporter(String)
     * @param properties
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline setTsvExportProperties(TermProperty... properties) {
        this.tsvExportProperties = Joiner.on(",").join(properties);
        return this;
    }

    /**
     * Exports the {@link TermIndex} in tsv format.
     *
     * @see #setTsvExportProperties(TermProperty...)
     * @param toFilePath
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeTsvExporter(String toFilePath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TSVExporterAE.class,
                    TSVExporterAE.TO_FILE_PATH, toFilePath,
                    TSVExporterAE.TERM_PROPERTIES, this.tsvExportProperties,
                    TSVExporterAE.SHOW_HEADERS, tsvWithHeaders,
                    TSVExporterAE.SHOW_SCORES, tsvWithVariantScores
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, getNumberedTaskName("Exporting the terminology to " + toFilePath), 1);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Exports examples of matching pairs for each variation rule.
     *
     * @param toFilePath
     *          the file path where to write the examples for each variation rule
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeExportVariationRuleExamples(String toFilePath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    ExportVariationRuleExamplesAE.class,
                    ExportVariationRuleExamplesAE.TO_FILE_PATH, toFilePath
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resSyntacticVariantRules());
            return aggregateAndReturn(ae, "Exporting variation rules examples", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
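    /*
     * TSV export example (illustrative property list and output path):
     *
     *   pipeline.setTsvExportProperties(TermProperty.WR)
     *           .haeTsvExporter("/tmp/terminology.tsv");
     */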
    /**
     * Exports all compound words of the terminology to the given file path.
     *
     * @param toFilePath
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeCompoundExporter(String toFilePath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    CompoundExporterAE.class,
                    CompoundExporterAE.TO_FILE_PATH, toFilePath
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Exporting compounds", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline haeVariationExporter(String toFilePath, VariationType... vTypes) {
        try {
            String typeStrings = Joiner.on(",").join(vTypes);
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    VariationExporterAE.class,
                    VariationExporterAE.TO_FILE_PATH, toFilePath,
                    VariationExporterAE.VARIATION_TYPES, typeStrings
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            String taskName = "Exporting variations " + typeStrings + " to file " + toFilePath;
            return aggregateAndReturn(ae, taskName, 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline haeTbxExporter(String toFilePath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TbxExporterAE.class,
                    TbxExporterAE.TO_FILE_PATH, toFilePath
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, getNumberedTaskName("Exporting the terminology to " + toFilePath), 1);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline haeEvalExporter(String toFilePath, boolean withVariants) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    EvalExporterAE.class,
                    EvalExporterAE.TO_FILE_PATH, toFilePath,
                    EvalExporterAE.WITH_VARIANTS, withVariants
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Exporting evaluation files", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline setExportJsonWithOccurrences(boolean exportJsonWithOccurrences) {
        this.exportJsonWithOccurrences = exportJsonWithOccurrences;
        return this;
    }

    public TermSuitePipeline setExportJsonWithContext(boolean b) {
        this.exportJsonWithContext = b;
        return this;
    }

    public TermSuitePipeline haeJsonExporter(String toFilePath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    JsonExporterAE.class,
                    JsonExporterAE.TO_FILE_PATH, toFilePath,
                    JsonExporterAE.WITH_OCCURRENCE, exportJsonWithOccurrences,
                    JsonExporterAE.WITH_CONTEXTS, exportJsonWithContext,
                    JsonExporterAE.LINKED_MONGO_STORE, this.linkMongoStore
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, getNumberedTaskName("Exporting the terminology to " + toFilePath), 1);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
    /**
     * Creates a tsv output with:
     *  - the occurrence list of each term and their in-text contexts,
     *  - a json structure for the evaluation of each variant.
     *
     * @param toFilePath
     *          The output file path
     * @param topN
     *          The number of variants to keep in the file
     * @param maxVariantsPerTerm
     *          The maximum number of variants to eval for each term
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeVariantEvalExporter(String toFilePath, int topN, int maxVariantsPerTerm) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    VariantEvalExporterAE.class,
                    VariantEvalExporterAE.TO_FILE_PATH, toFilePath,
                    VariantEvalExporterAE.TOP_N, topN,
                    VariantEvalExporterAE.NB_VARIANTS_PER_TERM, maxVariantsPerTerm
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Exporting variant evaluation files", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private void addParameters(AnalysisEngineDescription ae, Object... parameters) {
        if(parameters.length % 2 == 1)
            throw new IllegalArgumentException("Expecting an even number of arguments for key-value pairs: " + parameters.length);
        for(int i = 0; i < parameters.length; i += 2)
            ae.getMetaData().getConfigurationParameterSettings().setParameterValue((String) parameters[i], parameters[i + 1]);
    }

    private TermSuitePipeline subNormalizer(String target, URL mappingFile) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    Mapper.class,
                    Mapper.PARAM_SOURCE, "eu.project.ttc.types.WordAnnotation:tag",
                    Mapper.PARAM_TARGET, target,
                    Mapper.PARAM_UPDATE, true
                );
            ExternalResourceDescription mappingRes = ExternalResourceFactory.createExternalResourceDescription(
                    MappingResource.class,
                    mappingFile
                );
            ExternalResourceFactory.bindResource(ae, Mapping.KEY_MAPPING, mappingRes);
            return aggregateAndReturn(ae, "Normalizing " + mappingFile, 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private TermSuitePipeline caseNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:case", getResUrl(TermSuiteResource.TAGGER_CASE_MAPPING, tagger));
    }

    private TermSuitePipeline categoryNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:category", getResUrl(TermSuiteResource.TAGGER_CATEGORY_MAPPING, tagger));
    }

    private TermSuitePipeline tenseNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:tense", getResUrl(TermSuiteResource.TAGGER_TENSE_MAPPING, tagger));
    }

    private TermSuitePipeline subCategoryNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:subCategory", getResUrl(TermSuiteResource.TAGGER_SUBCATEGORY_MAPPING, tagger));
    }

    private TermSuitePipeline moodNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:mood", getResUrl(TermSuiteResource.TAGGER_MOOD_MAPPING, tagger));
    }

    private TermSuitePipeline numberNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:number", getResUrl(TermSuiteResource.TAGGER_NUMBER_MAPPING, tagger));
    }

    private TermSuitePipeline genderNormalizer(Tagger tagger) {
        return subNormalizer("eu.project.ttc.types.WordAnnotation:gender", getResUrl(TermSuiteResource.TAGGER_GENDER_MAPPING, tagger));
    }

    private TermSuitePipeline mateNormalizer() {
        return normalizer(Tagger.MATE);
    }

    private TermSuitePipeline ttNormalizer() {
        return normalizer(Tagger.TREE_TAGGER);
    }
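    /*
     * The normalizer(Tagger) method below runs the full chain of feature
     * normalizers defined above in a fixed order (category, subCategory,
     * mood, tense, gender, number, case), each backed by its tagger-specific
     * mapping resource.
     */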
    private TermSuitePipeline normalizer(Tagger tagger) {
        categoryNormalizer(tagger);
        subCategoryNormalizer(tagger);
        moodNormalizer(tagger);
        tenseNormalizer(tagger);
        genderNormalizer(tagger);
        numberNormalizer(tagger);
        return caseNormalizer(tagger);
    }

    public TermSuitePipeline aeStemmer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    Stemmer.class,
                    Stemmer.PARAM_FEATURE, "eu.project.ttc.types.WordAnnotation:stem",
                    Stemmer.PARAM_LANGUAGE, lang,
                    Stemmer.PARAM_UPDATE, true
                );
            return aggregateAndReturn(ae, "Stemming", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private TermSuitePipeline ttLemmaFixer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TreeTaggerLemmaFixer.class,
                    TreeTaggerLemmaFixer.LANGUAGE, lang.getCode()
                );
            return aggregateAndReturn(ae, "Fixing lemmas", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private TermSuitePipeline mateLemmaFixer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    MateLemmaFixer.class,
                    MateLemmaFixer.LANGUAGE, lang.getCode()
                );
            return aggregateAndReturn(ae, "Fixing lemmas", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Iterates over the {@link TermIndex} and marks terms as
     * "fixed expressions" when their lemmas are found in the
     * {@link FixedExpressionResource}.
     *
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeFixedExpressionTermMarker() {
        /*
         * TODO Check if resource is present for the current language.
         */
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(FixedExpressionTermMarker.class);
            ExternalResourceDescription fixedExprRes = ExternalResourceFactory.createExternalResourceDescription(
                    FixedExpressionResource.class,
                    getResUrl(TermSuiteResource.FIXED_EXPRESSIONS));
            ExternalResourceFactory.bindResource(ae, FixedExpressionResource.FIXED_EXPRESSION_RESOURCE, fixedExprRes);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Marking fixed expression terms", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Spots fixed expressions in the CAS and creates a {@link FixedExpression}
     * annotation whenever one is found.
     *
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeFixedExpressionSpotter() {
        /*
         * TODO Check if resource is present for the current language.
         */
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    FixedExpressionSpotter.class,
                    FixedExpressionSpotter.FIXED_EXPRESSION_MAX_SIZE, 5,
                    FixedExpressionSpotter.REMOVE_WORD_ANNOTATIONS_FROM_CAS, false,
                    FixedExpressionSpotter.REMOVE_TERM_OCC_ANNOTATIONS_FROM_CAS, true
                );
            ExternalResourceDescription fixedExprRes = ExternalResourceFactory.createExternalResourceDescription(
                    FixedExpressionResource.class,
                    getResUrl(TermSuiteResource.FIXED_EXPRESSIONS));
            ExternalResourceFactory.bindResource(ae, FixedExpressionResource.FIXED_EXPRESSION_RESOURCE, fixedExprRes);
            return aggregateAndReturn(ae, "Spotting fixed expressions", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
    /**
     * The single-word and multi-word term spotter AE
     * based on UIMA Tokens Regex.
     *
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeRegexSpotter() {
        try {
            Serializable postProcStrategy = this.postProcessingStrategy.isPresent() ?
                    this.postProcessingStrategy.get() : lang.getRegexPostProcessingStrategy();
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    RegexSpotter.class,
                    TokenRegexAE.PARAM_ALLOW_OVERLAPPING_OCCURRENCES, true,
                    RegexSpotter.POST_PROCESSING_STRATEGY, postProcStrategy
                );

            if(enableSyntacticLabels)
                addParameters(ae, TokenRegexAE.PARAM_SET_LABELS, "labels");
            if(logOverlappingRules.isPresent())
                addParameters(ae, RegexSpotter.LOG_OVERLAPPING_RULES, logOverlappingRules.get());

            ExternalResourceDescription mwtRules = ExternalResourceFactory.createExternalResourceDescription(
                    RegexListResource.class,
                    getResUrl(TermSuiteResource.MWT_RULES));
            ExternalResourceFactory.bindResource(ae, RegexListResource.KEY_TOKEN_REGEX_RULES, mwtRules);
            ExternalResourceFactory.bindResource(ae, resHistory());

            ExternalResourceDescription allowedCharsRes = ExternalResourceFactory.createExternalResourceDescription(
                    CharacterFootprintTermFilter.class,
                    getResUrl(TermSuiteResource.ALLOWED_CHARS));
            ExternalResourceFactory.bindResource(ae, RegexSpotter.CHARACTER_FOOTPRINT_TERM_FILTER, allowedCharsRes);

            if(this.addSpottedAnnoToTermIndex)
                ExternalResourceFactory.bindResource(ae, resTermIndex());

            ExternalResourceDescription stopWordsRes = ExternalResourceFactory.createExternalResourceDescription(
                    DefaultFilterResource.class,
                    getResUrl(TermSuiteResource.STOP_WORDS_FILTER));
            ExternalResourceFactory.bindResource(ae, RegexSpotter.STOP_WORD_FILTER, stopWordsRes);

            return aggregateAndReturn(ae, "Spotting terms", 0).aeTermOccAnnotationImporter();
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * An AE that imports all {@link TermOccAnnotation}s in the CAS into a {@link TermIndex}.
     *
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeTermOccAnnotationImporter() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TermOccAnnotationImporter.class,
                    TermOccAnnotationImporter.KEEP_OCCURRENCES_IN_TERM_INDEX, spotWithOccurrences
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resHistory());
            return aggregateAndReturn(ae, "TermOccAnnotation importer", 0);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Naive morphological analysis of prefix compounds based on a
     * prefix dictionary resource.
     *
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aePrefixSplitter() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(PrefixSplitter.class);
            ExternalResourceDescription prefixTreeRes = ExternalResourceFactory.createExternalResourceDescription(
                    PrefixTree.class,
                    getResUrl(TermSuiteResource.PREFIX_BANK));
            ExternalResourceFactory.bindResource(ae, PrefixTree.PREFIX_TREE, prefixTreeRes);
            ExternalResourceFactory.bindResource(ae, resHistory());
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Splitting prefixes", 0)
                    .aePrefixExceptionsSetter();
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
    public TermSuitePipeline aeSuffixDerivationDetector() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(SuffixDerivationDetecter.class);
            ExternalResourceDescription suffixDerivationsRes = ExternalResourceFactory.createExternalResourceDescription(
                    SuffixDerivationList.class,
                    getResUrl(TermSuiteResource.SUFFIX_DERIVATIONS));
            ExternalResourceFactory.bindResource(ae, SuffixDerivationList.SUFFIX_DERIVATIONS, suffixDerivationsRes);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resHistory());
            return aggregateAndReturn(ae, "Detecting suffix derivations", 0)
                    .aeSuffixDerivationException();
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private TermSuitePipeline aeSuffixDerivationException() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(SuffixDerivationExceptionSetter.class);
            ExternalResourceDescription suffixDerivationsExceptionsRes = ExternalResourceFactory.createExternalResourceDescription(
                    MultimapFlatResource.class,
                    getResUrl(TermSuiteResource.SUFFIX_DERIVATION_EXCEPTIONS));
            ExternalResourceFactory.bindResource(ae, SuffixDerivationExceptionSetter.SUFFIX_DERIVATION_EXCEPTION, suffixDerivationsExceptionsRes);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resHistory());
            return aggregateAndReturn(ae, "Setting suffix derivation exceptions", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private TermSuitePipeline aeManualCompositionSetter() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(ManualCompositionSetter.class);
            ExternalResourceDescription manualCompositionListRes = ExternalResourceFactory.createExternalResourceDescription(
                    ManualSegmentationResource.class,
                    getResUrl(TermSuiteResource.MANUAL_COMPOSITIONS));
            ExternalResourceFactory.bindResource(ae, ManualCompositionSetter.MANUAL_COMPOSITION_LIST, manualCompositionListRes);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Setting manual composition", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private TermSuitePipeline aePrefixExceptionsSetter() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(ManualPrefixSetter.class);
            ExternalResourceDescription prefixExceptionsRes = ExternalResourceFactory.createExternalResourceDescription(
                    ManualSegmentationResource.class,
                    getResUrl(TermSuiteResource.PREFIX_EXCEPTIONS));
            ExternalResourceFactory.bindResource(ae, ManualPrefixSetter.PREFIX_EXCEPTIONS, prefixExceptionsRes);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resHistory());
            return aggregateAndReturn(ae, "Setting prefix exceptions", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Removes from the term index any term having a
     * stop word at its boundaries.
     *
     * @see TermIndexBlacklistWordFilterAE
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeStopWordsFilter() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(TermIndexBlacklistWordFilterAE.class);
            ExternalResourceDescription stopWordsFilterResourceRes = ExternalResourceFactory.createExternalResourceDescription(
                    DefaultFilterResource.class,
                    getResUrl(TermSuiteResource.STOP_WORDS_FILTER));
            ExternalResourceFactory.bindResource(ae, FilterResource.KEY_FILTERS, stopWordsFilterResourceRes);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Filtering stop words", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
    /**
     * Exports all CAS as XMI files to a given directory.
     *
     * @param toDirectoryPath
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeXmiCasExporter(String toDirectoryPath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    XmiCasExporter.class,
                    XmiCasExporter.OUTPUT_DIRECTORY, toDirectoryPath
                );
            return aggregateAndReturn(ae, "Exporting XMI Cas files", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Exports all CAS as JSON files to a given directory.
     *
     * @param toDirectoryPath
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeTermsuiteJsonCasExporter(String toDirectoryPath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TermsuiteJsonCasExporter.class,
                    TermsuiteJsonCasExporter.OUTPUT_DIRECTORY, toDirectoryPath
                );
            return aggregateAndReturn(ae, "Exporting Json Cas files", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    /**
     * Exports all CAS in TSV format to a given directory.
     *
     * @see SpotterTSVWriter
     * @param toDirectoryPath
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline haeSpotterTSVWriter(String toDirectoryPath) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    SpotterTSVWriter.class,
                    XmiCasExporter.OUTPUT_DIRECTORY, toDirectoryPath
                );
            return aggregateAndReturn(ae, "Exporting annotations in TSV to " + toDirectoryPath, 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline aeDocumentLogger(long nbDocument) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    DocumentLogger.class,
                    DocumentLogger.NB_DOCUMENTS, nbDocument
                );
            return aggregateAndReturn(ae, "Document logging", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }
    /**
     * Tokenizer for Chinese collections.
     *
     * @see ChineseSegmenter
     *
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeChineseTokenizer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    ChineseSegmenter.class,
                    ChineseSegmenter.ANNOTATION_TYPE, "eu.project.ttc.types.WordAnnotation"
                );
            ExternalResourceFactory.createDependencyAndBind(ae, ChineseSegmenter.CHINESE_WORD_SEGMENTS, ChineseSegmentResource.class, ChineseSegmenterResourceHelper.getChineseWordSegments());
            ExternalResourceFactory.createDependencyAndBind(ae, ChineseSegmenter.CHINESE_FOREIGN_NAME_SEGMENTS, ChineseSegmentResource.class, ChineseSegmenterResourceHelper.getForeignNameSegments());
            ExternalResourceFactory.createDependencyAndBind(ae, ChineseSegmenter.CHINESE_NUMBER_SEGMENTS, ChineseSegmentResource.class, ChineseSegmenterResourceHelper.getNumberSegments());
            return aggregateAndReturn(ae, "Word tokenizing", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private ExternalResourceDescription termIndexResourceDesc;

    public ExternalResourceDescription resTermIndex() {
        if(termIndexResourceDesc == null) {
            if(!termIndex.isPresent())
                emptyTermIndex(UUID.randomUUID().toString());

            termIndexResourceDesc = ExternalResourceFactory.createExternalResourceDescription(
                    TermIndexResource.class,
                    termIndex.get().getName());

            TermSuiteResourceManager manager = TermSuiteResourceManager.getInstance();

            // register the term index if not in term index manager
            if(!manager.contains(termIndex.get().getName()))
                manager.register(termIndex.get().getName(), termIndex.get());
        }
        return termIndexResourceDesc;
    }

    private ExternalResourceDescription pipelineObserverResource;

    public ExternalResourceDescription resObserver() {
        if(pipelineObserverResource == null) {
            pipelineObserverResource = ExternalResourceFactory.createExternalResourceDescription(
                    ObserverResource.class,
                    this.pipelineObserverName);
        }
        return pipelineObserverResource;
    }

    private ExternalResourceDescription termHistoryResource;

    public ExternalResourceDescription resHistory() {
        if(termHistoryResource == null) {
            termHistoryResource = ExternalResourceFactory.createExternalResourceDescription(
                    TermHistoryResource.class,
                    this.termHistoryResourceName);
        }
        return termHistoryResource;
    }

    private ExternalResourceDescription syntacticVariantRules;

    public ExternalResourceDescription resSyntacticVariantRules() {
        if(syntacticVariantRules == null) {
            syntacticVariantRules = ExternalResourceFactory.createExternalResourceDescription(
                    YamlVariantRules.class,
                    getResUrl(TermSuiteResource.VARIANTS));
        }
        return syntacticVariantRules;
    }

    /**
     * Returns the term index produced (or last modified) by this pipeline.
     *
     * @return
     *          The term index processed by this pipeline
     */
    public TermIndex getTermIndex() {
        return this.termIndex.get();
    }

    /**
     * Sets the term index on which this pipeline will run.
     *
     * @param termIndex
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline setTermIndex(TermIndex termIndex) {
        this.termIndex = Optional.of(termIndex);
        return this;
    }
    /**
     * Creates a new in-memory {@link TermIndex} on which this
     * pipeline will run.
     *
     * @param name
     *          the name of the new term index
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline emptyTermIndex(String name) {
        MemoryTermIndex termIndex = new MemoryTermIndex(name, this.lang, this.occurrenceStore);
        LOGGER.info("Creating TermIndex {}", termIndex.getName());
        this.termIndex = Optional.of(termIndex);
        return this;
    }

    private ExternalResourceDescription generalLanguageResourceDesc;

    private ExternalResourceDescription resGeneralLanguage() {
        if(generalLanguageResourceDesc == null)
            generalLanguageResourceDesc = ExternalResourceFactory.createExternalResourceDescription(
                    GeneralLanguageResource.class,
                    getResUrl(TermSuiteResource.GENERAL_LANGUAGE));
        return generalLanguageResourceDesc;
    }

    /**
     * Computes {@link TermProperty#WR} values (and additional
     * term properties of type {@link TermProperty} in the future).
     *
     * @see TermSpecificityComputer
     * @see TermProperty
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeSpecificityComputer() {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(TermSpecificityComputer.class);
            ExternalResourceFactory.bindResource(ae, resGeneralLanguage());
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resHistory());
            return aggregateAndReturn(ae, "Computing term specificities", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline setContextualizeCoTermsType(OccurrenceType contextualizeCoTermsType) {
        this.contextualizeCoTermsType = contextualizeCoTermsType;
        return this;
    }

    public TermSuitePipeline setContextualizeWithTermClasses(boolean contextualizeWithTermClasses) {
        this.contextualizeWithTermClasses = contextualizeWithTermClasses;
        return this;
    }

    public TermSuitePipeline setContextualizeWithCoOccurrenceFrequencyThreshhold(int contextualizeWithCoOccurrenceFrequencyThreshhold) {
        this.contextualizeWithCoOccurrenceFrequencyThreshhold = contextualizeWithCoOccurrenceFrequencyThreshhold;
        return this;
    }
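    /*
     * Illustrative use of aeContextualizer(int, boolean) defined below:
     * build context vectors over a 3-word window, for single-word terms
     * only (the scope value is an arbitrary example):
     *
     *   pipeline.aeContextualizer(3, false);
     */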
    /**
     * Computes the {@link Contextualizer} vector of all
     * single-word terms in the term index.
     *
     * @see Contextualizer
     * @param scope
     * @param allTerms
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeContextualizer(int scope, boolean allTerms) {
        AnalysisEngineDescription ae;
        try {
            ae = AnalysisEngineFactory.createEngineDescription(
                    Contextualizer.class,
                    Contextualizer.NORMALIZE_ASSOC_RATE, true,
                    Contextualizer.SCOPE, scope,
                    Contextualizer.CO_TERMS_TYPE, contextualizeCoTermsType,
                    Contextualizer.COMPUTE_CONTEXTS_FOR_ALL_TERMS, allTerms,
                    Contextualizer.ASSOCIATION_RATE, contextAssocRateMeasure,
                    Contextualizer.USE_TERM_CLASSES, contextualizeWithTermClasses,
                    Contextualizer.MINIMUM_COOCC_FREQUENCY_THRESHOLD, contextualizeWithCoOccurrenceFrequencyThreshhold
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Build context vectors", 1);
        } catch (Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline aeMaxSizeThresholdCleaner(TermProperty property, int maxSize) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    MaxSizeThresholdCleaner.class,
                    AbstractTermIndexCleaner.CLEANING_PROPERTY, property,
                    MaxSizeThresholdCleaner.MAX_SIZE, maxSize
                );
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Cleaning TermIndex on property " + property.toString().toLowerCase() + " with maxSize=" + maxSize, 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline aeThresholdCleaner(TermProperty property, float threshold, boolean isPeriodic, int cleaningPeriod, int termIndexSizeTrigger) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
                    TermIndexThresholdCleaner.class,
                    AbstractTermIndexCleaner.CLEANING_PROPERTY, property,
                    AbstractTermIndexCleaner.NUM_TERMS_CLEANING_TRIGGER, termIndexSizeTrigger,
                    AbstractTermIndexCleaner.KEEP_VARIANTS, this.keepVariantsWhileCleaning,
                    TermIndexThresholdCleaner.THRESHOLD, threshold
                );
            setPeriodic(isPeriodic, cleaningPeriod, ae);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            ExternalResourceFactory.bindResource(ae, resHistory());
            return aggregateAndReturn(ae, getNumberedTaskName("Cleaning"), 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    public TermSuitePipeline aePrimaryOccurrenceDetector(int detectionStrategy) {
        try {
            AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(PrimaryOccurrenceDetector.class);
            ExternalResourceFactory.bindResource(ae, resTermIndex());
            return aggregateAndReturn(ae, "Detecting primary occurrences", 0);
        } catch(Exception e) {
            throw new TermSuitePipelineException(e);
        }
    }

    private void setPeriodic(boolean isPeriodic, int cleaningPeriod, AnalysisEngineDescription ae) {
        if(isPeriodic)
            addParameters(ae,
                    AbstractTermIndexCleaner.PERIODIC_CAS_CLEAN_ON, true,
                    AbstractTermIndexCleaner.CLEANING_PERIOD, cleaningPeriod
                );
    }

    /**
     * @param property
     * @param threshold
     * @param cleaningPeriod
     * @return
     *          This chaining {@link TermSuitePipeline} builder object
     */
    public TermSuitePipeline aeThresholdCleanerPeriodic(TermProperty property, float threshold, int cleaningPeriod) {
        return aeThresholdCleaner(property, threshold, true, cleaningPeriod, 0);
    }

    public TermSuitePipeline aeThresholdCleanerSizeTrigger(TermProperty property, float threshold, int termIndexSizeTrigger) {
        return aeThresholdCleaner(property, threshold, false, 0, termIndexSizeTrigger);
    }
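    /*
     * Cleaner usage sketch (illustrative; the property and threshold values
     * are arbitrary examples, and TermProperty.FREQUENCY is assumed to be one
     * of the available TermProperty constants): drop terms whose frequency
     * falls below 2, re-checking every 10000 documents:
     *
     *   pipeline.aeThresholdCleanerPeriodic(TermProperty.FREQUENCY, 2f, 10000);
     */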
	public TermSuitePipeline setKeepVariantsWhileCleaning(boolean keepVariantsWhileCleaning) {
		this.keepVariantsWhileCleaning = keepVariantsWhileCleaning;
		return this;
	}

	public TermSuitePipeline aeThresholdCleaner(TermProperty property, float threshold) {
		return aeThresholdCleaner(property, threshold, false, 0, 0);
	}

	public TermSuitePipeline aeTopNCleaner(TermProperty property, int n) {
		return aeTopNCleanerPeriodic(property, n, false, 0);
	}

	/**
	 * Keeps only the top <code>n</code> terms of the {@link TermIndex},
	 * ordered by the given {@link TermProperty}.
	 *
	 * @param property
	 * 			the {@link TermProperty} used to order terms
	 * @param n
	 * 			the number of terms to keep
	 * @param isPeriodic
	 * 			if <code>true</code>, the cleaning is re-run periodically
	 * @param cleaningPeriod
	 * 			the number of documents between two consecutive cleanings
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeTopNCleanerPeriodic(TermProperty property, int n, boolean isPeriodic, int cleaningPeriod) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					TermIndexTopNCleaner.class,
					AbstractTermIndexCleaner.CLEANING_PROPERTY, property,
					TermIndexTopNCleaner.TOP_N, n
				);
			setPeriodic(isPeriodic, cleaningPeriod, ae);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resHistory());
			return aggregateAndReturn(ae, "Cleaning TermIndex. Keeping only top " + n + " terms on property " + property.toString().toLowerCase(), 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	public TermSuitePipeline setGraphicalVariantSimilarityThreshold(float th) {
		this.graphicalVariantSimilarityThreshold = Optional.of(th);
		return this;
	}

	public TermSuitePipeline aeGraphicalVariantGatherer() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					GraphicalVariantGatherer.class,
					GraphicalVariantGatherer.LANG, lang.getCode(),
					GraphicalVariantGatherer.SIMILARITY_THRESHOLD, graphicalVariantSimilarityThreshold.isPresent() ? graphicalVariantSimilarityThreshold.get() : 0.9f
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resObserver());
			ExternalResourceFactory.bindResource(ae, resHistory());
			return aggregateAndReturn(ae, GraphicalVariantGatherer.TASK_NAME, 1);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Filters out URLs from the CAS.
	 *
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeUrlFilter() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					StringRegexFilter.class
				);
			return aggregateAndReturn(ae, "Filtering URLs", 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Gathers terms according to their syntactic structures.
	 *
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeSyntacticVariantGatherer() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					SyntacticTermGatherer.class
				);
			ExternalResourceFactory.bindResource(ae, resSyntacticVariantRules());
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resObserver());
			ExternalResourceFactory.bindResource(ae, resHistory());
			return aggregateAndReturn(ae, SyntacticTermGatherer.TASK_NAME, 1);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Detects all inclusion/extension relations between terms of size >= 2.
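	 * <p>
	 * Illustrative chaining sketch (the engine order shown is an assumption
	 * for the example, not a required order; {@code pipeline} is assumed to
	 * exist):
	 * <pre>
	 * pipeline.aeSyntacticVariantGatherer()
	 *         .aeGraphicalVariantGatherer()
	 *         .aeExtensionDetector();
	 * </pre>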
	 *
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeExtensionDetector() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					ExtensionDetecter.class
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resHistory());
			return aggregateAndReturn(ae, "Detecting term extensions", 1);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Transforms the {@link TermIndex} into a flat one-to-n scored model.
	 *
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeScorer() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					ScorerAE.class
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resObserver());
			ExternalResourceFactory.bindResource(ae, resHistory());
			return aggregateAndReturn(ae, ScorerAE.TASK_NAME, 1);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Merges the variants of a term (only those that are extensions of the
	 * base term) by graphical variation.
	 *
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeMerger() {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					Merger.class,
					Merger.SIMILARITY_THRESHOLD, 0.9f
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resObserver());
			return aggregateAndReturn(ae, Merger.TASK_NAME, 1);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Sets the rank ({@link Term#setRank(int)}) of all terms of the
	 * {@link TermIndex} given a {@link TermProperty}.
	 *
	 * @param property
	 * 			the {@link TermProperty} to rank terms on
	 * @param desc
	 * 			if <code>true</code>, ranks in descending order
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeRanker(TermProperty property, boolean desc) {
		Preconditions.checkArgument(property != TermProperty.RANK, "Cannot rank on property %s", TermProperty.RANK);
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					Ranker.class,
					Ranker.RANKING_PROPERTY, property,
					Ranker.DESC, desc
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resObserver());
			ExternalResourceFactory.bindResource(ae, resHistory());
			return aggregateAndReturn(ae, Ranker.TASK_NAME, 1);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	public TermSuitePipeline setTreeTaggerHome(String treeTaggerPath) {
		this.treeTaggerPath = Optional.of(treeTaggerPath);
		return this;
	}

	public TermSuitePipeline haeLogOverlappingRules() {
		this.logOverlappingRules = Optional.of(true);
		return this;
	}

	public TermSuitePipeline enableSyntacticLabels() {
		this.enableSyntacticLabels = true;
		return this;
	}

	public TermSuitePipeline setCompostCoeffs(float alpha, float beta, float gamma, float delta) {
		// compare against a small tolerance rather than exact float equality,
		// so that rounding in the sum does not reject valid coefficients
		Preconditions.checkArgument(Math.abs(alpha + beta + gamma + delta - 1.0f) < 0.0001f,
				"The sum of the coefficients must be 1.0");
		this.alpha = Optional.of(alpha);
		this.beta = Optional.of(beta);
		this.gamma = Optional.of(gamma);
		this.delta = Optional.of(delta);
		return this;
	}

	public TermSuitePipeline setCompostMaxComponentNum(int compostMaxComponentNum) {
		this.compostMaxComponentNum = Optional.of(compostMaxComponentNum);
		return this;
	}

	public TermSuitePipeline setCompostMinComponentSize(int compostMinComponentSize) {
		this.compostMinComponentSize = Optional.of(compostMinComponentSize);
		return this;
	}
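	/*
	 * Illustrative configuration sketch for the Compost splitter defined
	 * below. The values are assumptions chosen for the example; the four
	 * coefficients must sum to 1.0, as checked by setCompostCoeffs():
	 *
	 *   pipeline.setCompostCoeffs(0.25f, 0.25f, 0.25f, 0.25f)
	 *           .setCompostMaxComponentNum(3)
	 *           .aeCompostSplitter();
	 */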
	public TermSuitePipeline setCompostScoreThreshold(float compostScoreThreshold) {
		this.compostScoreThreshold = Optional.of(compostScoreThreshold);
		return this;
	}

	public TermSuitePipeline setCompostSegmentSimilarityThreshold(float compostSegmentSimilarityThreshold) {
		this.compostSegmentSimilarityThreshold = Optional.of(compostSegmentSimilarityThreshold);
		return this;
	}

	public TermSuitePipeline aeCompostSplitter() {
		try {
			// fall back on the language-specific defaults for any parameter not set explicitly
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					CompostAE.class,
					CompostAE.SCORE_THRESHOLD, this.compostScoreThreshold.isPresent() ? this.compostScoreThreshold.get() : this.lang.getCompostScoreThreshold(),
					CompostAE.ALPHA, alpha.isPresent() ? alpha.get() : lang.getCompostAlpha(),
					CompostAE.BETA, beta.isPresent() ? beta.get() : lang.getCompostBeta(),
					CompostAE.GAMMA, gamma.isPresent() ? gamma.get() : lang.getCompostGamma(),
					CompostAE.DELTA, delta.isPresent() ? delta.get() : lang.getCompostDelta(),
					CompostAE.MIN_COMPONENT_SIZE, this.compostMinComponentSize.isPresent() ? this.compostMinComponentSize.get() : this.lang.getCompostMinComponentSize(),
					CompostAE.MAX_NUMBER_OF_COMPONENTS, this.compostMaxComponentNum.isPresent() ? this.compostMaxComponentNum.get() : this.lang.getCompostMaxComponentNumber(),
					CompostAE.SEGMENT_SIMILARITY_THRESHOLD, this.compostSegmentSimilarityThreshold.get()
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.bindResource(ae, resObserver());

			// bind the Compost language dictionary resource
			ExternalResourceDescription langDicoRes = ExternalResourceFactory.createExternalResourceDescription(
					SimpleWordSet.class,
					getResUrl(TermSuiteResource.DICO));
			ExternalResourceFactory.bindResource(ae, CompostAE.LANGUAGE_DICO, langDicoRes);

			// bind the Compost inflection rules resource
			ExternalResourceDescription compostInflectionRulesRes = ExternalResourceFactory.createExternalResourceDescription(
					CompostInflectionRules.class,
					getResUrl(TermSuiteResource.COMPOST_INFLECTION_RULES));
			ExternalResourceFactory.bindResource(ae, CompostAE.INFLECTION_RULES, compostInflectionRulesRes);

			// bind the Compost transformation rules resource (same resource type as inflection rules)
			ExternalResourceDescription transformationRulesRes = ExternalResourceFactory.createExternalResourceDescription(
					CompostInflectionRules.class,
					getResUrl(TermSuiteResource.COMPOST_TRANSFORMATION_RULES));
			ExternalResourceFactory.bindResource(ae, CompostAE.TRANSFORMATION_RULES, transformationRulesRes);

			// bind the Compost stop list resource
			ExternalResourceDescription compostStopListRes = ExternalResourceFactory.createExternalResourceDescription(
					SimpleWordSet.class,
					getResUrl(TermSuiteResource.COMPOST_STOP_LIST));
			ExternalResourceFactory.bindResource(ae, CompostAE.STOP_LIST, compostStopListRes);

			// bind the neoclassical prefixes resource
			ExternalResourceDescription neoClassicalPrefixesRes = ExternalResourceFactory.createExternalResourceDescription(
					SimpleWordSet.class,
					getResUrl(TermSuiteResource.NEOCLASSICAL_PREFIXES));
			ExternalResourceFactory.bindResource(ae, CompostAE.NEOCLASSICAL_PREFIXES, neoClassicalPrefixesRes);

			ExternalResourceFactory.bindResource(ae, resHistory());

			return aeManualCompositionSetter()
					.aggregateAndReturn(ae, CompostAE.TASK_NAME, 2);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	public TermSuitePipeline haeCasStatCounter(String statName) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					CasStatCounter.class,
					CasStatCounter.STAT_NAME, statName
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			return aggregateAndReturn(ae, getNumberedTaskName("Counting stats [" + statName + "]"), 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}
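	/*
	 * Illustrative usage sketch: counting CAS stats at a given point of the
	 * pipeline. The stat name is an arbitrary label chosen for the example:
	 *
	 *   pipeline.haeCasStatCounter("post-spotting");
	 */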
	/**
	 * Exports time progress to a TSV file.
	 *
	 * Columns are:
	 * <ul>
	 * <li>elapsed time since initialization, in milliseconds</li>
	 * <li>number of documents processed</li>
	 * <li>cumulated size of data processed</li>
	 * <li>number of terms in the term index</li>
	 * <li>number of {@link WordAnnotation}s processed</li>
	 * </ul>
	 *
	 * @param toFile
	 * 			the path to the output trace file
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline haeTraceTimePerf(String toFile) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					CasStatCounter.class,
					CasStatCounter.DOCUMENT_PERIOD, 1,
					CasStatCounter.TO_TRACE_FILE, toFile
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			return aggregateAndReturn(ae, "Exporting time performances to file " + toFile, 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Classifies the terms of the {@link TermIndex} into term classes.
	 *
	 * @see TermClassifier
	 * @param sortingProperty
	 * 			the term property used to order terms before they are classified.
	 * 			The first term of a class, given this order, is considered
	 * 			the head of the class.
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline aeTermClassifier(TermProperty sortingProperty) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					TermClassifier.class,
					TermClassifier.CLASSIFYING_PROPERTY, sortingProperty
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			return aggregateAndReturn(ae, "Classifying terms on property " + sortingProperty.toString().toLowerCase(), 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Evaluates the extracted terminology against a reference term list.
	 *
	 * @param refFileURI
	 * 			The path to the reference terminology
	 * @param outputFile
	 * 			The path to the output log file
	 * @param customLogHeader
	 * 			A custom string added to the header of the output log file
	 * @param rFile
	 * 			The path to the output R file
	 * @param evalTraceName
	 * 			The name of the eval trace
	 * @param rtlWithVariants
	 * 			true if variants of the reference terminology should be kept during the eval
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline haeEval(String refFileURI, String outputFile, String customLogHeader, String rFile, String evalTraceName, boolean rtlWithVariants) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					EvalEngine.class,
					EvalEngine.OUTPUT_LOG_FILE, outputFile,
					EvalEngine.OUTPUT_R_FILE, rFile,
					EvalEngine.CUSTOM_LOG_HEADER_STRING, customLogHeader,
//					EvalEngine.LC_WITH_VARIANTS, extractedTerminoWithVariants,
					EvalEngine.RTL_WITH_VARIANTS, rtlWithVariants
				);
			ExternalResourceFactory.bindResource(ae, resTermIndex());
			ExternalResourceFactory.createDependencyAndBind(
					ae,
					EvalEngine.EVAL_TRACE,
					EvalTrace.class,
					evalTraceName);
			ExternalResourceFactory.createDependencyAndBind(
					ae,
					EvalEngine.REFERENCE_LIST,
					ReferenceTermList.class,
					"file:" + refFileURI);
			return aggregateAndReturn(ae, "Evaluating " + evalTraceName, 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Stores occurrences in MongoDB.
	 *
	 * @param mongoDBUri
	 * 			the MongoDB connection URI
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline setMongoDBOccurrenceStore(String mongoDBUri) {
		this.occurrenceStore = new MongoDBOccurrenceStore(mongoDBUri);
		return this;
	}
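	/*
	 * Illustrative usage sketch: storing occurrences in MongoDB and
	 * referencing them from the JSON export (see linkMongoStore() below).
	 * The connection URI is a placeholder:
	 *
	 *   pipeline.setMongoDBOccurrenceStore("mongodb://localhost:27017/termsuite")
	 *           .linkMongoStore();
	 */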
	/**
	 * @deprecated Use TermSuitePipeline#setOccurrenceStoreMode instead.
	 *
	 * @param activate
	 * 			whether spotted occurrences should be stored
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	@Deprecated
	public TermSuitePipeline setSpotWithOccurrences(boolean activate) {
		this.spotWithOccurrences = activate;
		return this;
	}

	/**
	 * Configures {@link RegexSpotter}. If <code>true</code>,
	 * adds all spotted occurrences to the {@link TermIndex}.
	 *
	 * @see #aeRegexSpotter()
	 *
	 * @param addToTermIndex
	 * 			the value of the flag
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline setAddSpottedAnnoToTermIndex(boolean addToTermIndex) {
		this.addSpottedAnnoToTermIndex = addToTermIndex;
		return this;
	}

	/**
	 * Sets the post-processing strategy of the {@link RegexSpotter} analysis engine.
	 *
	 * @see #aeRegexSpotter()
	 * @see OccurrenceBuffer#NO_CLEANING
	 * @see OccurrenceBuffer#KEEP_PREFIXES
	 * @see OccurrenceBuffer#KEEP_SUFFIXES
	 *
	 * @param postProcessingStrategy
	 * 			the name of the strategy
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline setPostProcessingStrategy(String postProcessingStrategy) {
		this.postProcessingStrategy = Optional.of(postProcessingStrategy);
		return this;
	}

	/**
	 * Configures the TSV exporter to (not) show headers on the
	 * first line.
	 *
	 * @param tsvWithHeaders
	 * 			the flag
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline setTsvShowHeaders(boolean tsvWithHeaders) {
		this.tsvWithHeaders = tsvWithHeaders;
		return this;
	}

	/**
	 * Configures the TSV exporter to (not) show variant scores with the
	 * "V" label.
	 *
	 * @param tsvWithVariantScores
	 * 			the flag
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline setTsvShowScores(boolean tsvWithVariantScores) {
		this.tsvWithVariantScores = tsvWithVariantScores;
		return this;
	}

	public TermSuitePipeline haeJsonCasExporter(String toDirectoryPath) {
		try {
			AnalysisEngineDescription ae = AnalysisEngineFactory.createEngineDescription(
					JsonCasExporter.class,
					JsonCasExporter.OUTPUT_DIRECTORY, toDirectoryPath
				);
			return aggregateAndReturn(ae, getNumberedTaskName("Exporting CAS to JSON files"), 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}

	/**
	 * Configures the {@link JsonExporterAE} not to embed the occurrences
	 * in the JSON file, but to reference the MongoDB occurrence store instead.
	 *
	 * @see #haeJsonExporter(String)
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline linkMongoStore() {
		this.linkMongoStore = true;
		return this;
	}

	/**
	 * Aggregates a custom AE to the TS pipeline.
	 *
	 * @param ae
	 * 			the description of the added analysis engine
	 * @param taskName
	 * 			a user-readable name for the AE task (intended to
	 * 			be displayed in progress views)
	 * @return
	 * 			This chaining {@link TermSuitePipeline} builder object
	 */
	public TermSuitePipeline customAE(AnalysisEngineDescription ae, String taskName) {
		try {
			return aggregateAndReturn(ae, taskName, 0);
		} catch(Exception e) {
			throw new TermSuitePipelineException(e);
		}
	}
}
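/*
 * Overall usage sketch for this builder (illustrative only: the way the
 * pipeline instance is obtained, the engine order and the ranking property
 * are assumptions, not prescribed usage):
 *
 *   TermSuitePipeline pipeline = ...; // obtained from this class's entry point
 *   pipeline.aeUrlFilter()
 *           .aeSpecificityComputer()
 *           .aeRanker(TermProperty.WR, true)
 *           .customAE(myEngineDescription, "my custom task"); // myEngineDescription: hypothetical AnalysisEngineDescription
 */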