package eu.project.ttc.tools;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Optional;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ExternalResourceDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.tools.cli.TermSuiteCLIUtils;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.StringUtils;
import fr.univnantes.lina.uima.engines.TreeTaggerWrapper;
import fr.univnantes.lina.uima.models.TreeTaggerParameter;
import uima.sandbox.lexer.engines.Lexer;
import uima.sandbox.lexer.resources.SegmentBank;
import uima.sandbox.lexer.resources.SegmentBankResource;
public class PreProcessor {
private static final Logger LOGGER = LoggerFactory.getLogger(PreProcessor.class);
/** Short usage description of the CLI */
private static final String USAGE = "java [-DconfigFile=<file>] -cp termsuite-core-x.x.jar eu.project.ttc.tools.PreProcessor";
private Charset charset = Charset.forName("UTF-8");
private Lang lang;
private Path resourceJar;
private Path inputFile;
private Path outputFile;
private Optional<Path> taggerPath = Optional.empty();
public PreProcessor(Lang lang, Path resourceJar, Path inputFile, Path outputFile) {
super();
this.lang = lang;
this.resourceJar = resourceJar;
this.inputFile = inputFile;
this.outputFile = outputFile;
}
public void setTaggerPath(Path taggerPath) {
this.taggerPath = Optional.of(taggerPath);
}
public void setCharset(Charset charset) {
this.charset = charset;
}
private static final char BLANK = ' ';
private static final String INPUT_FILE = "input";
private static final String OUTPUT_FILE = "output";
private static final String RESOURCE_JAR = "resources";
private static final String LANG = "lang";
private static final String ENCODING = "encoding";
private static final String TREE_TAGGER_PATH = "treetagger";
private static final char UNDERSCORE = '_';
private static final String TAG_UNKOWN = "UNK";
public void run() {
try {
AggregateBuilder aggregateBuilder = new AggregateBuilder();
/*
* Tokenizer AE
*/
AnalysisEngineDescription tokenizerAe = AnalysisEngineFactory.createEngineDescription(
Lexer.class,
Lexer.PARAM_TYPE, "eu.project.ttc.types.WordAnnotation"
);
URL jarURI = new URL("jar:"+resourceJar.toUri()+"!/");
String segmentBankURI = TermSuiteResource.SEGMENT_BANK.fromUrlPrefix(jarURI, lang).toString();
ExternalResourceDescription segmentBank = ExternalResourceFactory.createExternalResourceDescription(
SegmentBankResource.class,
segmentBankURI);
ExternalResourceFactory.bindResource(tokenizerAe, SegmentBank.KEY_SEGMENT_BANK, segmentBank);
aggregateBuilder.add(tokenizerAe);
/*
* TreeTagger AE
*/
if(taggerPath.isPresent()) {
String treeTaggerHome = this.taggerPath.get().toString();
AnalysisEngineDescription taggerAe = AnalysisEngineFactory.createEngineDescription(
TreeTaggerWrapper.class,
TreeTaggerWrapper.PARAM_ANNOTATION_TYPE, "eu.project.ttc.types.WordAnnotation",
TreeTaggerWrapper.PARAM_TAG_FEATURE, "tag",
TreeTaggerWrapper.PARAM_LEMMA_FEATURE, "lemma",
TreeTaggerWrapper.PARAM_UPDATE_ANNOTATION_FEATURES, true,
TreeTaggerWrapper.PARAM_TT_HOME_DIRECTORY, treeTaggerHome
);
URL taggerConfigURI = TermSuiteResource.TREETAGGER_CONFIG.fromUrlPrefix(jarURI, lang, Tagger.TREE_TAGGER);
ExternalResourceFactory.createDependencyAndBind(
taggerAe,
TreeTaggerParameter.KEY_TT_PARAMETER,
TreeTaggerParameter.class,
taggerConfigURI.toString());
aggregateBuilder.add(taggerAe);
}
JCas cas = createCas();
SimplePipeline.runPipeline(cas, aggregateBuilder.createAggregate());
FSIterator<Annotation> it = cas.getAnnotationIndex(WordAnnotation.type).iterator();
try(
FileOutputStream fos = new FileOutputStream(outputFile.toFile());
OutputStreamWriter w = new OutputStreamWriter(fos, charset)) {
while(it.hasNext()) {
WordAnnotation a = (WordAnnotation)it.next();
w.write(a.getCoveredText());
if(this.taggerPath.isPresent()) {
w.write(UNDERSCORE);
w.write(a.getTag() == null ? TAG_UNKOWN : a.getTag());
}
w.write(BLANK);
}
w.flush();
}
} catch (Exception e) {
e.printStackTrace(System.err);
System.exit(1);
}
}
private JCas createCas() throws UIMAException, IOException {
JCas cas = JCasFactory.createJCas();
cas.setDocumentLanguage(lang.getCode());
LOGGER.info("Reading file {} with charset {}", inputFile, charset);
String text = StringUtils.readFile(inputFile.toString(), charset);
text = StringUtils.toOnelineSentences(text);
cas.setDocumentText(text);
return cas;
}
public static void main(String[] args) throws ParseException {
CommandLine line = readArgs(args);
TermSuiteCLIUtils.logCommandLineOptions(line);
Lang lang = Lang.fromCode(line.getOptionValue(LANG));
Path inputFile = Paths.get(line.getOptionValue(INPUT_FILE));
Path outputFile = Paths.get(line.getOptionValue(OUTPUT_FILE));
Path resourceJar = Paths.get(line.getOptionValue(RESOURCE_JAR));
PreProcessor preProcessor = new PreProcessor(lang, resourceJar, inputFile, outputFile);
if(line.hasOption(TREE_TAGGER_PATH))
preProcessor.setTaggerPath(Paths.get(line.getOptionValue(TREE_TAGGER_PATH)));
if(line.hasOption(ENCODING))
preProcessor.setCharset(Charset.forName(line.getOptionValue(ENCODING)));
preProcessor.run();
}
private static CommandLine readArgs(String[] args) throws ParseException {
Options options = new Options();
options.addOption(TermSuiteCLIUtils.createOption(
"i",
INPUT_FILE,
true,
"Path to input file",
true));
options.addOption(TermSuiteCLIUtils.createOption(
"o",
OUTPUT_FILE,
true,
"Path to output file",
false));
options.addOption(TermSuiteCLIUtils.createOption(
"t",
TREE_TAGGER_PATH,
true,
"Path to TreeTagger home directory",
false));
options.addOption(TermSuiteCLIUtils.createOption(
"l",
LANG,
true,
"Language code (fr, en, ry, de, es, etc)",
false));
options.addOption(TermSuiteCLIUtils.createOption(
"r",
RESOURCE_JAR,
true,
"Path to TermSuite resource jar",
false));
options.addOption(TermSuiteCLIUtils.createOption(
"e",
ENCODING,
true,
"Encoding to use for input file",
false));
try {
PosixParser parser = new PosixParser();
// Parse and set CL options
CommandLine line = parser.parse(options, args, false);
return line;
} catch (ParseException e) {
TermSuiteCLIUtils.printUsage(e, USAGE, options);
System.exit(1);
return null;
}
}
}