/*
 * TermSuitePreprocessor.java example
 *
 * Explorer
 * termsuite-core-master
 * - src
 */
package eu.project.ttc.api;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Stream;

import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.readers.TermSuiteJsonCasSerializer;
import eu.project.ttc.tools.TermSuitePipeline;
import eu.project.ttc.tools.api.internal.FileSystemUtils;
import eu.project.ttc.utils.FileUtils;
import eu.project.ttc.utils.JCasUtils;

/**
 * Fluent entry point that runs the TermSuite preprocessing engines (word
 * tokenizer, TreeTagger, stemmer and regex spotter) over a stream of
 * {@link Document}s and exposes the resulting {@link JCas} objects, optionally
 * exporting each of them to a file.
 *
 * <p>Instances are created through the static {@code from...} factories,
 * configured with {@link #setTreeTaggerHome(String)} (mandatory) and
 * {@link #toJson(String, String)} (optional), then run with {@link #stream()}
 * or {@link #execute()}. The wrapped document stream can be consumed only
 * once, so an instance supports a single run.
 */
public class TermSuitePreprocessor {

	/** Serialization formats supported when exporting a processed CAS. */
	public enum OutputFormat { JSON, XMI }

	/** Value of {@link #nbDocuments} meaning that the number of documents is not known. */
	private static final long UNKNOWN_DOCUMENT_COUNT = -1;

	private Lang lang;

	private Stream<Document> documentStream = null;

	/** Root directory of the input corpus; replaced by the output directory in export paths. */
	private String inputDirectory = "/";

	private String treeTaggerHome = null;

	/** Charset used for character-based (JSON) exports. */
	private Charset outputCharset = Charset.defaultCharset();

	/** Export directory; when empty, processed CASes are not written to disk. */
	private Optional<String> outputDirectory = Optional.empty();
	private OutputFormat outputFormat = OutputFormat.JSON;

	private long nbDocuments = UNKNOWN_DOCUMENT_COUNT;

	private TermSuitePreprocessor() {}

	/**
	 * Creates a preprocessor for a single in-memory text, wrapped in a document
	 * whose url is {@code file://inline.text}.
	 *
	 * @param lang the language of the text
	 * @param text the text to process
	 * @return a new preprocessor
	 */
	public static TermSuitePreprocessor fromTextString(Lang lang, String text) {
		return fromSingleDocument(lang, new Document(lang, "file://inline.text", text));
	}

	/**
	 * Creates a preprocessor for a single document.
	 *
	 * @param lang     the language used to build the pipeline
	 * @param document the document to process
	 * @return a new preprocessor
	 */
	public static TermSuitePreprocessor fromSingleDocument(Lang lang, Document document) {
		return fromDocumentCollection(lang, Lists.newArrayList(document));
	}

	/**
	 * Creates a preprocessor for an arbitrary stream of documents.
	 *
	 * @param lang           the language used to build the pipeline
	 * @param documentStream the documents to process
	 * @param nbDocuments    the number of documents in the stream, or {@code -1}
	 *                       when unknown (no document logger is then added to
	 *                       the pipeline)
	 * @return a new preprocessor
	 * @throws NullPointerException if {@code lang} or {@code documentStream} is null
	 */
	public static TermSuitePreprocessor fromDocumentStream(Lang lang, Stream<Document> documentStream, long nbDocuments) {
		TermSuitePreprocessor preprocessor = new TermSuitePreprocessor();
		preprocessor.lang = Preconditions.checkNotNull(lang, "lang must not be null");
		preprocessor.documentStream = Preconditions.checkNotNull(documentStream, "documentStream must not be null");
		preprocessor.nbDocuments = nbDocuments;
		return preprocessor;
	}

	/**
	 * Creates a preprocessor for a collection of documents.
	 *
	 * @param lang      the language used to build the pipeline
	 * @param documents the documents to process
	 * @return a new preprocessor
	 */
	public static TermSuitePreprocessor fromDocumentCollection(Lang lang, Collection<Document> documents) {
		return fromDocumentStream(lang, documents.stream(), documents.size());
	}

	/**
	 * Creates a preprocessor for the files of a directory matching the pattern
	 * "**\/*.txt" (the backslash only escapes the comment terminator), read
	 * with the platform default charset.
	 *
	 * @param lang      the language of the corpus
	 * @param directory the root directory of the corpus
	 * @return a new preprocessor
	 */
	public static TermSuitePreprocessor fromTxtCorpus(Lang lang, String directory) {
		return fromTxtCorpus(lang, directory, "**/*.txt", Charset.defaultCharset().name());
	}

	/**
	 * Creates a preprocessor for the files of a directory matching a pattern,
	 * read with the platform default charset.
	 *
	 * Example: "**\/*.{txt,data}"
	 *
	 * @param lang      the language of the corpus
	 * @param directory the root directory of the corpus
	 * @param pattern   the pattern selecting the files to process
	 * @return a new preprocessor
	 */
	public static TermSuitePreprocessor fromTxtCorpus(Lang lang, String directory, String pattern) {
		return fromTxtCorpus(lang, directory, pattern, Charset.defaultCharset().name());
	}

	/**
	 * Creates a preprocessor for the files of a directory matching a pattern,
	 * read with the given encoding. The directory is also remembered as the
	 * input root that gets replaced by the output directory on export.
	 *
	 * @param lang      the language of the corpus
	 * @param directory the root directory of the corpus
	 * @param pattern   the pattern selecting the files to process
	 * @param encoding  the name of the charset used to read the files
	 * @return a new preprocessor
	 */
	public static TermSuitePreprocessor fromTxtCorpus(Lang lang, String directory, String pattern, String encoding) {
		TermSuitePreprocessor preprocessor = fromDocumentStream(
				lang,
				FileSystemUtils.pathWalker(
						directory,
						pattern,
						FileSystemUtils.pathToDocumentMapper(lang, encoding)),
				FileSystemUtils.pathDocumentCount(directory, pattern)
			);
		preprocessor.inputDirectory = directory;
		return preprocessor;
	}

	/**
	 * Requests that every processed CAS be exported as JSON.
	 *
	 * @param outputDirectory the directory receiving the exported files
	 * @param encoding        the name of the charset used to write the files;
	 *                        {@code null} selects the platform default charset
	 * @return this preprocessor
	 * @throws NullPointerException     if {@code outputDirectory} is null
	 * @throws IllegalArgumentException if {@code encoding} is not a legal or
	 *                                  supported charset name
	 */
	public TermSuitePreprocessor toJson(String outputDirectory, String encoding) {
		// Resolve the charset first so that a bad name leaves this instance untouched.
		Charset charset = encoding == null ? Charset.defaultCharset() : Charset.forName(encoding);
		this.outputDirectory = Optional.of(outputDirectory);
		this.outputCharset = charset;
		this.outputFormat = OutputFormat.JSON;
		return this;
	}

	/**
	 * Sets the TreeTagger installation directory, required by {@link #stream()}.
	 *
	 * @param treeTaggerHome the path of the TreeTagger home directory
	 * @return this preprocessor
	 */
	public TermSuitePreprocessor setTreeTaggerHome(String treeTaggerHome) {
		this.treeTaggerHome = treeTaggerHome;
		return this;
	}

	/**
	 * Builds the analysis engine and returns a stream that processes one
	 * document per element. Processing (and export, when an output directory
	 * is set) happens as the returned stream is consumed.
	 *
	 * <p>The returned stream is derived from the document stream given at
	 * creation; callers should close it once done so that any resource held by
	 * that source stream is released.
	 *
	 * @return the stream of processed CASes
	 * @throws IllegalStateException if the TreeTagger home has not been set
	 * @throws TermSuiteException    if the analysis engine cannot be created;
	 *                               also thrown while consuming the stream if a
	 *                               document cannot be processed or exported
	 */
	public Stream<JCas> stream() {
		Preconditions.checkState(treeTaggerHome != null, "TreeTagger home is null. Please use #setTreeTaggerHome()");

		TermSuitePipeline pipeline = TermSuitePipeline.create(lang.getCode());

		if (nbDocuments != UNKNOWN_DOCUMENT_COUNT) {
			pipeline.aeDocumentLogger(nbDocuments);
		}

		pipeline.aeWordTokenizer()
				.setTreeTaggerHome(treeTaggerHome)
				.aeTreeTagger()
				.aeStemmer()
				.setAddSpottedAnnoToTermIndex(false)
				.aeRegexSpotter();

		AnalysisEngine aae = createAnalysisEngine(pipeline);
		return documentStream.map(document -> process(aae, document));
	}

	/**
	 * Processes every document, discarding the resulting CASes. Only useful for
	 * the side effects of the pipeline, e.g. the export configured with
	 * {@link #toJson(String, String)}.
	 *
	 * @throws IllegalStateException if the TreeTagger home has not been set
	 * @throws TermSuiteException    if the engine cannot be created or a
	 *                               document cannot be processed or exported
	 */
	public void execute() {
		// Closing the derived stream also closes the source document stream.
		try (Stream<JCas> casStream = stream()) {
			casStream.forEach(cas -> {});
		}
	}

	/** Instantiates the aggregate analysis engine described by the given pipeline. */
	private static AnalysisEngine createAnalysisEngine(TermSuitePipeline pipeline) {
		ResourceManager resMgr = UIMAFramework.newDefaultResourceManager();
		try {
			AnalysisEngineDescription aaeDesc = createEngineDescription(pipeline.createDescription());
			return UIMAFramework.produceAnalysisEngine(aaeDesc, resMgr, null);
		} catch (ResourceInitializationException e) {
			throw new TermSuiteException(e);
		}
	}

	/** Creates a CAS for the document, runs the engine on it and exports it when requested. */
	private JCas process(AnalysisEngine aae, Document document) {
		try {
			JCas cas = JCasFactory.createJCas();
			cas.setDocumentLanguage(document.getLang().getCode());
			cas.setDocumentText(document.getText());
			JCasUtils.initJCasSDI(
					cas,
					document.getLang().getCode(),
					document.getText(),
					document.getUrl());
			aae.process(cas);

			if (outputDirectory.isPresent()) {
				exportCas(document, cas);
			}

			return cas;
		} catch (UIMAException e) {
			throw new TermSuiteException(e);
		}
	}

	/**
	 * Computes the export path of a document: its url with the input root
	 * replaced by the output directory and the extension replaced by the
	 * lower-cased output format name.
	 */
	private String resolveExportPath(Document document) {
		try {
			String toFilePath = FileUtils.replaceRootDir(
					document.getUrl(),
					new File(inputDirectory).getCanonicalPath(),
					outputDirectory.get());
			// Locale.ROOT keeps the extension stable whatever the default locale
			// (a Turkish locale would otherwise turn "XMI" into "xm\u0131").
			return FileUtils.replaceExtensionWith(
					toFilePath,
					outputFormat.name().toLowerCase(Locale.ROOT));
		} catch (IOException e) {
			throw new TermSuiteException("Could not export cas " + document.getUrl(), e);
		}
	}

	/**
	 * Writes the CAS to its export file, in the configured output format.
	 * JSON is written with the configured output charset; XMI is written
	 * through the byte stream directly.
	 */
	private void exportCas(Document document, JCas cas) {
		String toFilePath = resolveExportPath(document);

		// A path without a parent component has no directory to create.
		File parentDirectory = new File(toFilePath).getParentFile();
		if (parentDirectory != null) {
			parentDirectory.mkdirs();
		}

		// A single stream is opened on the target file and always closed.
		try (FileOutputStream out = new FileOutputStream(toFilePath)) {
			if (outputFormat == OutputFormat.JSON) {
				try (Writer writer = new OutputStreamWriter(out, outputCharset)) {
					TermSuiteJsonCasSerializer.serialize(writer, cas);
				}
			} else if (outputFormat == OutputFormat.XMI) {
				XmiCasSerializer.serialize(cas.getCas(),
						cas.getTypeSystem(),
						out);
			}
		} catch (Exception e) {
			throw new TermSuiteException("Could not export cas to " + toFilePath + " for cas " + document.getUrl(), e);
		}
	}
}