package eu.project.ttc.tools; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.net.URL; import java.nio.charset.Charset; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Optional; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.cas.FSIterator; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.ExternalResourceFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ExternalResourceDescription; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.project.ttc.engines.desc.Lang; import eu.project.ttc.tools.cli.TermSuiteCLIUtils; import eu.project.ttc.types.WordAnnotation; import eu.project.ttc.utils.StringUtils; import fr.univnantes.lina.uima.engines.TreeTaggerWrapper; import fr.univnantes.lina.uima.models.TreeTaggerParameter; import uima.sandbox.lexer.engines.Lexer; import uima.sandbox.lexer.resources.SegmentBank; import uima.sandbox.lexer.resources.SegmentBankResource; public class PreProcessor { private static final Logger LOGGER = LoggerFactory.getLogger(PreProcessor.class); /** Short usage description of the CLI */ private static final String USAGE = "java [-DconfigFile=<file>] -cp termsuite-core-x.x.jar eu.project.ttc.tools.PreProcessor"; private Charset charset = Charset.forName("UTF-8"); private Lang lang; private Path resourceJar; private Path inputFile; private Path outputFile; private Optional<Path> taggerPath = Optional.empty(); public PreProcessor(Lang lang, Path resourceJar, Path inputFile, Path outputFile) { super(); this.lang = lang; this.resourceJar = resourceJar; this.inputFile = inputFile; this.outputFile = outputFile; } public void setTaggerPath(Path taggerPath) { this.taggerPath = Optional.of(taggerPath); } public void setCharset(Charset charset) { this.charset = charset; } private static final char BLANK = ' '; private static final String INPUT_FILE = "input"; private static final String OUTPUT_FILE = "output"; private static final String RESOURCE_JAR = "resources"; private static final String LANG = "lang"; private static final String ENCODING = "encoding"; private static final String TREE_TAGGER_PATH = "treetagger"; private static final char UNDERSCORE = '_'; private static final String TAG_UNKOWN = "UNK"; public void run() { try { AggregateBuilder aggregateBuilder = new AggregateBuilder(); /* * Tokenizer AE */ AnalysisEngineDescription tokenizerAe = AnalysisEngineFactory.createEngineDescription( Lexer.class, Lexer.PARAM_TYPE, "eu.project.ttc.types.WordAnnotation" ); URL jarURI = new URL("jar:"+resourceJar.toUri()+"!/"); String segmentBankURI = TermSuiteResource.SEGMENT_BANK.fromUrlPrefix(jarURI, lang).toString(); ExternalResourceDescription segmentBank = ExternalResourceFactory.createExternalResourceDescription( SegmentBankResource.class, segmentBankURI); ExternalResourceFactory.bindResource(tokenizerAe, SegmentBank.KEY_SEGMENT_BANK, segmentBank); aggregateBuilder.add(tokenizerAe); /* * TreeTagger AE */ if(taggerPath.isPresent()) { String treeTaggerHome = this.taggerPath.get().toString(); AnalysisEngineDescription taggerAe = AnalysisEngineFactory.createEngineDescription( TreeTaggerWrapper.class, TreeTaggerWrapper.PARAM_ANNOTATION_TYPE, "eu.project.ttc.types.WordAnnotation", TreeTaggerWrapper.PARAM_TAG_FEATURE, "tag", TreeTaggerWrapper.PARAM_LEMMA_FEATURE, "lemma", TreeTaggerWrapper.PARAM_UPDATE_ANNOTATION_FEATURES, true, TreeTaggerWrapper.PARAM_TT_HOME_DIRECTORY, treeTaggerHome ); URL taggerConfigURI = TermSuiteResource.TREETAGGER_CONFIG.fromUrlPrefix(jarURI, lang, Tagger.TREE_TAGGER); ExternalResourceFactory.createDependencyAndBind( taggerAe, TreeTaggerParameter.KEY_TT_PARAMETER, TreeTaggerParameter.class, taggerConfigURI.toString()); aggregateBuilder.add(taggerAe); } JCas cas = createCas(); SimplePipeline.runPipeline(cas, aggregateBuilder.createAggregate()); FSIterator<Annotation> it = cas.getAnnotationIndex(WordAnnotation.type).iterator(); try( FileOutputStream fos = new FileOutputStream(outputFile.toFile()); OutputStreamWriter w = new OutputStreamWriter(fos, charset)) { while(it.hasNext()) { WordAnnotation a = (WordAnnotation)it.next(); w.write(a.getCoveredText()); if(this.taggerPath.isPresent()) { w.write(UNDERSCORE); w.write(a.getTag() == null ? TAG_UNKOWN : a.getTag()); } w.write(BLANK); } w.flush(); } } catch (Exception e) { e.printStackTrace(System.err); System.exit(1); } } private JCas createCas() throws UIMAException, IOException { JCas cas = JCasFactory.createJCas(); cas.setDocumentLanguage(lang.getCode()); LOGGER.info("Reading file {} with charset {}", inputFile, charset); String text = StringUtils.readFile(inputFile.toString(), charset); text = StringUtils.toOnelineSentences(text); cas.setDocumentText(text); return cas; } public static void main(String[] args) throws ParseException { CommandLine line = readArgs(args); TermSuiteCLIUtils.logCommandLineOptions(line); Lang lang = Lang.fromCode(line.getOptionValue(LANG)); Path inputFile = Paths.get(line.getOptionValue(INPUT_FILE)); Path outputFile = Paths.get(line.getOptionValue(OUTPUT_FILE)); Path resourceJar = Paths.get(line.getOptionValue(RESOURCE_JAR)); PreProcessor preProcessor = new PreProcessor(lang, resourceJar, inputFile, outputFile); if(line.hasOption(TREE_TAGGER_PATH)) preProcessor.setTaggerPath(Paths.get(line.getOptionValue(TREE_TAGGER_PATH))); if(line.hasOption(ENCODING)) preProcessor.setCharset(Charset.forName(line.getOptionValue(ENCODING))); preProcessor.run(); } private static CommandLine readArgs(String[] args) throws ParseException { Options options = new Options(); options.addOption(TermSuiteCLIUtils.createOption( "i", INPUT_FILE, true, "Path to input file", true)); options.addOption(TermSuiteCLIUtils.createOption( "o", OUTPUT_FILE, true, "Path to output file", false)); options.addOption(TermSuiteCLIUtils.createOption( "t", TREE_TAGGER_PATH, true, "Path to TreeTagger home directory", false)); options.addOption(TermSuiteCLIUtils.createOption( "l", LANG, true, "Language code (fr, en, ry, de, es, etc)", false)); options.addOption(TermSuiteCLIUtils.createOption( "r", RESOURCE_JAR, true, "Path to TermSuite resource jar", false)); options.addOption(TermSuiteCLIUtils.createOption( "e", ENCODING, true, "Encoding to use for input file", false)); try { PosixParser parser = new PosixParser(); // Parse and set CL options CommandLine line = parser.parse(options, args, false); return line; } catch (ParseException e) { TermSuiteCLIUtils.printUsage(e, USAGE, options); System.exit(1); return null; } } }