package eu.project.ttc.api;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;

import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Stream;

import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import eu.project.ttc.engines.cleaner.TermProperty;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.history.TermHistory;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.readers.TermSuiteJsonCasDeserializer;
import eu.project.ttc.tools.TermSuitePipeline;
import eu.project.ttc.tools.api.internal.FileSystemUtils;
import eu.project.ttc.tools.api.internal.PipelineUtils;
import eu.project.ttc.utils.JCasUtils;

/**
 * A builder and launcher class for executing a terminology extraction
 * pipeline from raw text files or from TermSuite preprocessed files.
 *
 * @author Damien Cram
 *
 * @see TermSuitePreprocessor
 */
public class TerminoExtractor {

    public enum ContextualizerMode {
        ON_ALL_TERMS, ON_SWT_TERMS
    }

    /*
     * TreeTagger home
     */
    private String treeTaggerHome;

    /*
     * Pipeline language
     */
    private Lang lang = null;

    /*
     * Custom resources
     */
    private Optional<String> customResourceDir = Optional.empty();

    /*
     * Contextualizer properties
     */
    private boolean useContextualizer = false;
    private int contextualizerScope = 3;
    private ContextualizerMode contextualizerMode = ContextualizerMode.ON_SWT_TERMS;

    /*
     * True if the input is preprocessed, false otherwise.
     */
    private boolean preprocessed = false;

    /*
     * The maximum number of terms allowed in memory. Empty
     * if max-size filtering is deactivated.
     */
    private Optional<Integer> maxSizeFilter = Optional.empty();

    /*
     * The total number of documents
     */
    private long nbDocuments = -1;

    /*
     * The input streams.
     */
    // document stream when the input is not preprocessed
    private Stream<Document> documentStream;

    // jcas stream when the input is preprocessed
    private Stream<JCas> preprocessedCasStream;

    /*
     * True if term scoring and ranking should run at the end of the pipeline.
     */
    private boolean scoringEnabled = true;

    /*
     * True if term variant detection should run.
     */
    private boolean variationDetectionEnabled = true;

    /*
     * Filter properties
     */
    private Optional<TerminoFilterConfig> postFilterConfig = Optional.empty();
    private Optional<TerminoFilterConfig> preFilterConfig = Optional.empty();

    /*
     * The optional term history watcher.
     */
    private Optional<TermHistory> history = Optional.empty();
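    // A minimal usage sketch of this builder, assuming an English corpus of
    // raw .txt files and a local TreeTagger installation (both paths below
    // are hypothetical):
    //
    //   TermIndex termIndex = TerminoExtractor
    //       .fromTxtCorpus(Lang.EN, "/path/to/corpus/txt", "**/*.txt")
    //       .setTreeTaggerHome("/path/to/treetagger")
    //       .execute();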
    public static TerminoExtractor fromTextString(Lang lang, String text) {
        return fromSingleDocument(lang, new Document(lang, "file://inline.text", text));
    }

    public static TerminoExtractor fromSingleDocument(Lang lang, Document document) {
        return fromDocumentCollection(lang, Lists.newArrayList(document));
    }

    public static TerminoExtractor fromPreprocessedJsonFiles(Lang lang, String directory) {
        return fromPreprocessedJsonFiles(lang, directory, Charset.defaultCharset().name());
    }

    public static TerminoExtractor fromPreprocessedJsonFiles(Lang lang, String directory, String encoding) {
        Function<Path, JCas> mapper = path -> {
            try {
                JCas cas = JCasFactory.createJCas();
                cas.setDocumentLanguage(lang.getCode());
                TermSuiteJsonCasDeserializer.deserialize(
                        new FileInputStream(path.toFile()),
                        cas.getCas(),
                        encoding);
                return cas;
            } catch (Exception e) {
                throw new TermSuiteException("Unable to parse cas file " + path, e);
            }
        };
        return fromPreprocessedDocumentStream(
                lang,
                FileSystemUtils.pathWalker(directory, "**/*.json", mapper),
                FileSystemUtils.pathDocumentCount(directory, "**/*.json"));
    }

    /**
     * WARNING : the encoding of XMI files must be UTF-8.
     */
    public static TerminoExtractor fromPreprocessedXmiFiles(Lang lang, String directory) {
        Function<Path, JCas> mapper = path -> {
            try {
                JCas cas = JCasFactory.createJCas();
                cas.setDocumentLanguage(lang.getCode());
                XmiCasDeserializer.deserialize(new FileInputStream(path.toFile()), cas.getCas());
                return cas;
            } catch (Exception e) {
                throw new TermSuiteException("Unable to parse cas file " + path, e);
            }
        };
        return fromPreprocessedDocumentStream(
                lang,
                FileSystemUtils.pathWalker(directory, "**/*.xmi", mapper),
                FileSystemUtils.pathDocumentCount(directory, "**/*.xmi"));
    }

    public static TerminoExtractor fromPreprocessedDocumentStream(Lang lang, Stream<JCas> casStream, long streamSize) {
        TerminoExtractor extractor = new TerminoExtractor();
        extractor.preprocessedCasStream = casStream;
        extractor.nbDocuments = streamSize;
        extractor.lang = lang;
        extractor.preprocessed = true;
        return extractor;
    }

    public static TerminoExtractor fromDocumentStream(Lang lang, Stream<Document> documentStream, long streamSize) {
        TerminoExtractor extractor = new TerminoExtractor();
        extractor.documentStream = documentStream;
        extractor.lang = lang;
        extractor.nbDocuments = streamSize;
        return extractor;
    }

    public static TerminoExtractor fromDocumentCollection(Lang lang, Collection<Document> documents) {
        return fromDocumentStream(lang, documents.stream(), documents.size());
    }

    public static TerminoExtractor fromTxtCorpus(Lang lang, String directory, String pattern) {
        return fromTxtCorpus(lang, directory, pattern, Charset.defaultCharset().name());
    }

    public static TerminoExtractor fromTxtCorpus(Lang lang, String directory, String pattern, String encoding) {
        return fromDocumentStream(
                lang,
                FileSystemUtils.pathWalker(
                        directory,
                        pattern,
                        FileSystemUtils.pathToDocumentMapper(lang, encoding)),
                FileSystemUtils.pathDocumentCount(directory, pattern));
    }

    public static TerminoExtractor fromSinglePreprocessedDocument(Lang lang, JCas cas) {
        return fromPreprocessedDocumentStream(lang, Lists.newArrayList(cas).stream(), 1);
    }
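    // Example: resuming extraction from files produced by a previous TermSuite
    // preprocessing run, here assuming a directory of UTF-8 XMI files at a
    // hypothetical path:
    //
    //   TermIndex termIndex = TerminoExtractor
    //       .fromPreprocessedXmiFiles(Lang.FR, "/path/to/preprocessed/xmi")
    //       .execute();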
    public TerminoExtractor disableVariationDetection() {
        this.variationDetectionEnabled = false;
        return this;
    }

    public TerminoExtractor disableScoring() {
        this.scoringEnabled = false;
        return this;
    }

    public TerminoExtractor setTreeTaggerHome(String treeTaggerHome) {
        this.treeTaggerHome = treeTaggerHome;
        return this;
    }

    public TerminoExtractor usingCustomResources(String resourceDir) {
        Preconditions.checkArgument(new File(resourceDir).exists(), "Directory %s does not exist", resourceDir);
        Preconditions.checkArgument(new File(resourceDir).isDirectory(), "Not a directory: %s", resourceDir);
        this.customResourceDir = Optional.of(resourceDir);
        return this;
    }

    public TerminoExtractor useContextualizer(int scope, ContextualizerMode contextualizerMode) {
        this.useContextualizer = true;
        this.contextualizerScope = scope;
        this.contextualizerMode = contextualizerMode;
        return this;
    }

    /**
     * Filters the {@link TermIndex} before the term variant detection phase.
     *
     * This early-stage filtering will cause several low-frequency variations
     * to be missed during term variant detection, but it is often necessary
     * when variant detection takes too long.
     *
     * @param filterConfig
     *          The filtering configuration
     * @return
     *          this {@link TerminoExtractor} launcher class
     *
     * @see #postFilter(TerminoFilterConfig)
     */
    public TerminoExtractor preFilter(TerminoFilterConfig filterConfig) {
        this.preFilterConfig = Optional.of(filterConfig);
        return this;
    }

    /**
     * Filters the {@link TermIndex} dynamically during the term spotting phase
     * (RegexSpotter) of terminology extraction, by cleaning on term frequency
     * whenever the number of terms held in memory exceeds the maximum number
     * of terms allowed.
     *
     * @param maxTermIndexSize
     *          the maximum number of {@link Term} instances allowed to be kept in memory
     *          during the terminology extraction process.
     * @return
     *          this {@link TerminoExtractor} launcher class
     *
     * @see TermSuitePipeline#aeMaxSizeThresholdCleaner(TermProperty, int)
     */
    public TerminoExtractor dynamicMaxSizeFilter(int maxTermIndexSize) {
        this.maxSizeFilter = Optional.of(maxTermIndexSize);
        return this;
    }

    /**
     * Filters the {@link TermIndex} at the end of the pipeline,
     * i.e. after the term variant detection phase.
     *
     * This filtering is loss-less when configured with {@code keepVariants(true)}
     * (see {@link TerminoFilterConfig}).
     *
     * @param filterConfig
     *          The filtering configuration
     * @return
     *          this {@link TerminoExtractor} launcher class
     *
     * @see #preFilter(TerminoFilterConfig)
     */
    public TerminoExtractor postFilter(TerminoFilterConfig filterConfig) {
        this.postFilterConfig = Optional.of(filterConfig);
        return this;
    }
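    // Example of a more heavily configured run (a sketch only: the
    // TerminoFilterConfig setup depends on that class's own builder API,
    // the max-size value shown is arbitrary, and paths are hypothetical):
    //
    //   TerminoFilterConfig postFilter = ...; // e.g. keep variants, as documented above
    //   TermIndex termIndex = TerminoExtractor
    //       .fromTxtCorpus(Lang.EN, "/path/to/corpus/txt", "**/*.txt")
    //       .setTreeTaggerHome("/path/to/treetagger")
    //       .useContextualizer(3, TerminoExtractor.ContextualizerMode.ON_SWT_TERMS)
    //       .dynamicMaxSizeFilter(100000)
    //       .postFilter(postFilter)
    //       .execute();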
    public TermIndex execute() {
        Preconditions.checkNotNull(this.lang, "Language cannot be null");

        TermSuitePipeline pipeline = TermSuitePipeline.create(lang.getCode());

        if (history.isPresent())
            pipeline.setHistory(history.get());

        if (customResourceDir.isPresent())
            pipeline.setResourceDir(this.customResourceDir.get());

        if (!preprocessed) {
            // raw text input: tokenize, POS-tag, and spot term occurrences
            pipeline.aeWordTokenizer()
                .setTreeTaggerHome(this.treeTaggerHome)
                .aeTreeTagger()
                .aeUrlFilter()
                .aeStemmer()
                .aeRegexSpotter();
        } else {
            // preprocessed input: term occurrences are imported from the CASes
            pipeline.aeUrlFilter()
                .aeTermOccAnnotationImporter();
        }

        if (preFilterConfig.isPresent())
            PipelineUtils.filter(pipeline, preFilterConfig.get());

        if (useContextualizer)
            pipeline.aeContextualizer(
                    contextualizerScope,
                    contextualizerMode == ContextualizerMode.ON_ALL_TERMS);

        if (nbDocuments != -1)
            pipeline.aeDocumentLogger(this.nbDocuments);

        if (maxSizeFilter.isPresent())
            pipeline.aeMaxSizeThresholdCleaner(TermProperty.FREQUENCY, maxSizeFilter.get());

        pipeline
            .aeSpecificityComputer()
            .aeCompostSplitter()
            .aePrefixSplitter();

        if (variationDetectionEnabled)
            pipeline
                .aeSuffixDerivationDetector()
                .aeSyntacticVariantGatherer()
                .aeGraphicalVariantGatherer();

        if (scoringEnabled)
            pipeline.aeExtensionDetector()
                .aeScorer()
                .aeRanker(TermProperty.SPECIFICITY, true);

        if (postFilterConfig.isPresent())
            PipelineUtils.filter(pipeline, postFilterConfig.get());

        ResourceManager resMgr = UIMAFramework.newDefaultResourceManager();
        try {
            // create the aggregate analysis engine (AAE) description
            AnalysisEngineDescription aaeDesc = createEngineDescription(pipeline.createDescription());

            // instantiate the AAE
            final AnalysisEngine aae = UIMAFramework.produceAnalysisEngine(aaeDesc, resMgr, null);

            if (preprocessed) {
                preprocessedCasStream.forEach(cas -> {
                    try {
                        aae.process(cas);
                    } catch (UIMAException e) {
                        throw new TermSuiteException(e);
                    }
                });
            } else {
                documentStream.forEach(document -> {
                    JCas cas;
                    try {
                        cas = JCasFactory.createJCas();
                        cas.setDocumentLanguage(document.getLang().getCode());
                        cas.setDocumentText(document.getText());
                        JCasUtils.initJCasSDI(
                                cas,
                                document.getLang().getCode(),
                                document.getText(),
                                document.getUrl());
                        aae.process(cas);
                    } catch (UIMAException e) {
                        throw new TermSuiteException(e);
                    }
                });
            }
            aae.collectionProcessComplete();
        } catch (ResourceInitializationException | AnalysisEngineProcessException e1) {
            throw new TermSuiteException(e1);
        }

        return pipeline.getTermIndex();
    }

    public TerminoExtractor setWatcher(TermHistory history) {
        this.history = Optional.of(history);
        return this;
    }
}
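// Example of consuming the extraction result (a sketch; the exact accessor
// names should be checked against eu.project.ttc.models.TermIndex, and the
// TreeTagger path is hypothetical):
//
//   TermIndex termIndex = TerminoExtractor
//       .fromTextString(Lang.EN, "Wind turbines produce wind energy.")
//       .setTreeTaggerHome("/path/to/treetagger")
//       .execute();
//   for (Term term : termIndex.getTerms())
//       System.out.println(term);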