package eu.project.ttc.api;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Stream;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.readers.TermSuiteJsonCasSerializer;
import eu.project.ttc.tools.TermSuitePipeline;
import eu.project.ttc.tools.api.internal.FileSystemUtils;
import eu.project.ttc.utils.FileUtils;
import eu.project.ttc.utils.JCasUtils;
/**
 * Fluent entry point that runs a stream of {@link Document}s through a
 * TermSuite pipeline (word tokenizer, TreeTagger, stemmer, regex spotter)
 * and exposes the resulting {@link JCas} objects as a {@link Stream}.
 *
 * <p>Instances are obtained through the static {@code from*} factories. A
 * TreeTagger home must be set with {@link #setTreeTaggerHome(String)} before
 * {@link #stream()} or {@link #execute()} is called. When an output directory
 * has been configured with {@link #toJson(String, String)}, every processed
 * CAS is also written to disk.
 */
public class TermSuitePreprocessor {

    /** Serialization formats supported when exporting a processed CAS. */
    public enum OutputFormat { JSON, XMI }

    private Lang lang;
    private Stream<Document> documentStream = null;
    // Root that is swapped for the output directory when computing export paths.
    private String inputDirectory = "/";
    private String treeTaggerHome = null;
    private String outputEncoding = Charset.defaultCharset().name();
    private Optional<String> outputDirectory = Optional.empty();
    private OutputFormat outputFormat = OutputFormat.JSON;
    // -1 means "unknown": no document logger is added to the pipeline.
    private long nbDocuments = -1;

    private TermSuitePreprocessor() {}

    /**
     * Creates a preprocessor for a single in-memory text, wrapped in a
     * document whose url is {@code file://inline.text}.
     *
     * @param lang the language of the text
     * @param text the text to process
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromTextString(Lang lang, String text) {
        return fromSingleDocument(lang, new Document(lang, "file://inline.text", text));
    }

    /**
     * Creates a preprocessor for exactly one document.
     *
     * @param lang the pipeline language
     * @param document the document to process
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromSingleDocument(Lang lang, Document document) {
        return fromDocumentCollection(lang, Lists.newArrayList(document));
    }

    /**
     * Creates a preprocessor reading its documents from the given stream.
     *
     * @param lang the pipeline language
     * @param documentStream the documents to process
     * @param nbDocuments the number of documents in the stream, or {@code -1}
     *        to disable the document logger
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromDocumentStream(Lang lang, Stream<Document> documentStream, long nbDocuments) {
        TermSuitePreprocessor preprocessor = new TermSuitePreprocessor();
        preprocessor.documentStream = documentStream;
        preprocessor.lang = lang;
        preprocessor.nbDocuments = nbDocuments;
        return preprocessor;
    }

    /**
     * Creates a preprocessor for an in-memory collection of documents.
     *
     * @param lang the pipeline language
     * @param documents the documents to process
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromDocumentCollection(Lang lang, Collection<Document> documents) {
        return fromDocumentStream(lang, documents.stream(), documents.size());
    }

    /**
     * Creates a preprocessor for the files under {@code directory} matching
     * <code>**&#47;*.txt</code>, read with the platform default charset.
     *
     * @param lang the pipeline language
     * @param directory the root directory of the corpus
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromTxtCorpus(Lang lang, String directory) {
        return fromTxtCorpus(lang, directory, "**/*.txt", Charset.defaultCharset().name());
    }

    /**
     * Creates a preprocessor for the files under {@code directory} matching
     * {@code pattern}, read with the platform default charset.
     *
     * <p>Example pattern: <code>**&#47;*.{txt,data}</code>
     *
     * @param lang the pipeline language
     * @param directory the root directory of the corpus
     * @param pattern the file pattern selecting the documents
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromTxtCorpus(Lang lang, String directory, String pattern) {
        return fromTxtCorpus(lang, directory, pattern, Charset.defaultCharset().name());
    }

    /**
     * Creates a preprocessor for the files under {@code directory} matching
     * {@code pattern}, read with the given encoding. The directory is also
     * remembered as the input root used to compute export paths.
     *
     * @param lang the pipeline language
     * @param directory the root directory of the corpus
     * @param pattern the file pattern selecting the documents
     * @param encoding the charset name used to read the files
     * @return a new preprocessor
     */
    public static TermSuitePreprocessor fromTxtCorpus(Lang lang, String directory, String pattern, String encoding) {
        TermSuitePreprocessor preprocessor = fromDocumentStream(
                lang,
                FileSystemUtils.pathWalker(
                        directory,
                        pattern,
                        FileSystemUtils.pathToDocumentMapper(lang, encoding)),
                FileSystemUtils.pathDocumentCount(directory, pattern));
        preprocessor.inputDirectory = directory;
        return preprocessor;
    }

    /**
     * Requests that every processed CAS be exported as JSON.
     *
     * @param outputDirectory the directory that replaces the input root in
     *        exported file paths; must not be null
     * @param encoding the charset name used to write the JSON files; when
     *        null the platform default charset is used
     * @return this preprocessor, for chaining
     */
    public TermSuitePreprocessor toJson(String outputDirectory, String encoding) {
        this.outputDirectory = Optional.of(outputDirectory);
        this.outputEncoding = encoding != null ? encoding : Charset.defaultCharset().name();
        this.outputFormat = OutputFormat.JSON;
        return this;
    }

    /**
     * Sets the TreeTagger installation directory handed to the pipeline.
     *
     * @param treeTaggerHome the TreeTagger home path
     * @return this preprocessor, for chaining
     */
    public TermSuitePreprocessor setTreeTaggerHome(String treeTaggerHome) {
        this.treeTaggerHome = treeTaggerHome;
        return this;
    }

    /**
     * Builds the analysis engine and returns a stream that processes one
     * document per element. The mapping is attached to the source document
     * stream, so documents are only processed when the returned stream is
     * consumed.
     *
     * @return the stream of processed CASes
     * @throws IllegalStateException if no TreeTagger home has been set
     * @throws TermSuiteException if the analysis engine cannot be created, or
     *         (while the stream is consumed) if a document cannot be processed
     *         or exported
     */
    public Stream<JCas> stream() {
        Preconditions.checkState(treeTaggerHome != null, "TreeTagger home is null. Please use #setTreeTaggerHome()");
        TermSuitePipeline pipeline = TermSuitePipeline.create(lang.getCode());
        if (nbDocuments != -1) {
            pipeline.aeDocumentLogger(nbDocuments);
        }
        pipeline.aeWordTokenizer()
                .setTreeTaggerHome(treeTaggerHome)
                .aeTreeTagger()
                .aeStemmer()
                .setAddSpottedAnnoToTermIndex(false)
                .aeRegexSpotter();

        ResourceManager resMgr = UIMAFramework.newDefaultResourceManager();
        try {
            // Create, then instantiate, the aggregate analysis engine.
            AnalysisEngineDescription aaeDesc = createEngineDescription(pipeline.createDescription());
            final AnalysisEngine aae = UIMAFramework.produceAnalysisEngine(aaeDesc, resMgr, null);
            return documentStream.map(document -> processDocument(aae, document));
        } catch (ResourceInitializationException e) {
            throw new TermSuiteException(e);
        }
    }

    /** Runs one document through the engine and exports the CAS when an output directory is set. */
    private JCas processDocument(AnalysisEngine aae, Document document) {
        try {
            JCas cas = JCasFactory.createJCas();
            String langCode = document.getLang().getCode();
            cas.setDocumentLanguage(langCode);
            cas.setDocumentText(document.getText());
            JCasUtils.initJCasSDI(
                    cas,
                    langCode,
                    document.getText(),
                    document.getUrl());
            aae.process(cas);
            if (outputDirectory.isPresent()) {
                exportCas(document, cas);
            }
            return cas;
        } catch (UIMAException e) {
            throw new TermSuiteException(e);
        }
    }

    /**
     * Writes the CAS to a file whose path is the document url with the input
     * root replaced by the output directory and the extension replaced by the
     * lower-cased output format name.
     */
    private void exportCas(Document document, JCas cas) {
        String toFilePath;
        try {
            toFilePath = FileUtils.replaceRootDir(
                    document.getUrl(),
                    new File(inputDirectory).getCanonicalPath(),
                    outputDirectory.get());
            // Locale.ROOT keeps the extension stable whatever the default locale
            // (a Turkish locale would otherwise turn "XMI" into "xmı").
            toFilePath = FileUtils.replaceExtensionWith(
                    toFilePath,
                    outputFormat.name().toLowerCase(Locale.ROOT));
            File outputFile = new File(toFilePath);
            File parentDirectory = outputFile.getParentFile();
            if (parentDirectory != null) {
                parentDirectory.mkdirs();
            }
            try {
                if (outputFormat == OutputFormat.JSON) {
                    // Honour the configured encoding; FileWriter would silently
                    // use the platform default charset.
                    try (OutputStream out = new FileOutputStream(outputFile);
                            Writer writer = new OutputStreamWriter(out, outputEncoding)) {
                        TermSuiteJsonCasSerializer.serialize(writer, cas);
                    }
                } else if (outputFormat == OutputFormat.XMI) {
                    // A single stream, closed deterministically, owns the file.
                    try (OutputStream out = new FileOutputStream(outputFile)) {
                        XmiCasSerializer.serialize(cas.getCas(), cas.getTypeSystem(), out);
                    }
                }
            } catch (Exception e) {
                throw new TermSuiteException("Could not export cas to " + toFilePath + " for cas " + document.getUrl(), e);
            }
        } catch (IOException e1) {
            throw new TermSuiteException("Could not export cas " + document.getUrl(), e1);
        }
    }

    /**
     * Consumes the whole stream, processing (and exporting, when configured)
     * every document while discarding the resulting CASes.
     */
    public void execute() {
        stream().forEach(cas -> {});
    }
}