/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.bin; import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import org.kohsuke.args4j.Option; import edu.emory.clir.clearnlp.component.AbstractComponent; import edu.emory.clir.clearnlp.component.configuration.DecodeConfiguration; import edu.emory.clir.clearnlp.component.mode.dep.DEPConfiguration; import edu.emory.clir.clearnlp.component.mode.srl.SRLConfiguration; import edu.emory.clir.clearnlp.component.utils.GlobalLexica; import edu.emory.clir.clearnlp.component.utils.NLPMode; import edu.emory.clir.clearnlp.component.utils.NLPUtils; import edu.emory.clir.clearnlp.dependency.DEPNode; import edu.emory.clir.clearnlp.dependency.DEPTree; import edu.emory.clir.clearnlp.reader.AbstractReader; import edu.emory.clir.clearnlp.reader.LineReader; import edu.emory.clir.clearnlp.reader.RawReader; import edu.emory.clir.clearnlp.reader.TReader; import edu.emory.clir.clearnlp.reader.TSVReader; import edu.emory.clir.clearnlp.tokenization.AbstractTokenizer; import edu.emory.clir.clearnlp.util.BinUtils; import edu.emory.clir.clearnlp.util.FileUtils; import edu.emory.clir.clearnlp.util.IOUtils; import edu.emory.clir.clearnlp.util.constant.StringConst; import edu.emory.clir.clearnlp.util.lang.TLanguage; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class NLPDecode { @Option(name="-c", usage="confinguration file (required)", required=true, metaVar="<string>") protected String s_configurationFile; @Option(name="-i", usage="input path (required)", required=true, metaVar="<filepath>") protected String s_inputPath; @Option(name="-ie", usage="input file extension (default: *)", required=false, metaVar="<string>") protected String s_inputExt = "*"; @Option(name="-oe", usage="output file extension (default: cnlp)", required=false, metaVar="<string>") protected String s_outputExt = "cnlp"; @Option(name="-mode", usage="pos|morph|dep|ner", required=true, metaVar="<string>") protected String s_mode; @Option(name="-threads", usage="number of threads (default: 1)", required=false, metaVar="<integer>") protected int n_threads = 1; // private long time = 0, tokens = 0, trees = 0; public NLPDecode() {} public NLPDecode(String[] args) { BinUtils.initArgs(args, this); NLPMode mode = NLPMode.valueOf(s_mode); List<String> inputFiles = FileUtils.getFileList(s_inputPath, s_inputExt, false); if (n_threads > 2) decode(inputFiles, s_outputExt, s_configurationFile, n_threads, mode); else decode(inputFiles, s_outputExt, s_configurationFile, mode); // System.out.printf("Tokens / Sec.: %d\n", Math.round(MathUtils.divide(tokens*1000, time))); // System.out.printf("Sents. / Sec.: %d\n", Math.round(MathUtils.divide(trees *1000, time))); } public void decode(List<String> inputFiles, String outputExt, String configurationFile, NLPMode mode) { DecodeConfiguration config = new DecodeConfiguration(IOUtils.createFileInputStream(configurationFile));; GlobalLexica.init(IOUtils.createFileInputStream(configurationFile)); AbstractReader<?> reader = config.getReader(); AbstractTokenizer tokenizer = null; AbstractComponent[] components; PrintStream fout; if (reader.isReaderType(TReader.TSV)) { components = getComponents((TSVReader)reader, config.getLanguage(), mode, config); } else { tokenizer = NLPUtils.getTokenizer(config.getLanguage()); components = getComponents(config.getLanguage(), mode, config); } BinUtils.LOG.info("Decoding:\n"); for (String inputFile : inputFiles) { BinUtils.LOG.info(FileUtils.getBaseName(inputFile)+"\n"); reader.open(IOUtils.createFileInputStream(inputFile)); fout = IOUtils.createBufferedPrintStream(inputFile + StringConst.PERIOD + outputExt); switch (reader.getReaderType()) { case TSV : process((TSVReader) reader, fout, mode, components); break; case RAW : process((RawReader) reader, fout, mode, components, tokenizer); break; case LINE: process((LineReader)reader, fout, mode, components, tokenizer); break; } reader.close(); fout.close(); } } public void decode(List<String> inputFiles, String outputExt, String configurationFile, int nThreads, NLPMode mode) { DecodeConfiguration config = new DecodeConfiguration(IOUtils.createFileInputStream(s_configurationFile));; GlobalLexica.init(IOUtils.createFileInputStream(configurationFile)); ExecutorService executor = Executors.newFixedThreadPool(nThreads); AbstractReader<?> reader = config.getReader(); AbstractTokenizer tokenizer = null; AbstractComponent[] components; String outputFile; if (reader.isReaderType(TReader.TSV)) { components = getComponents((TSVReader)reader, config.getLanguage(), mode, config); } else { tokenizer = NLPUtils.getTokenizer(config.getLanguage()); components = getComponents(config.getLanguage(), mode, config); } BinUtils.LOG.info("Decoding:\n"); for (String inputFile : inputFiles) { outputFile = inputFile + StringConst.PERIOD + outputExt; executor.submit(new NLPTask(tokenizer, components, reader, mode, inputFile, outputFile)); } executor.shutdown(); } class NLPTask implements Runnable { private AbstractComponent[] components; private AbstractTokenizer tokenizer; private AbstractReader<?> reader; private String input_file; private PrintStream fout; private NLPMode mode; public NLPTask(AbstractTokenizer tokenizer, AbstractComponent[] components, AbstractReader<?> reader, NLPMode mode, String inputFile, String outputFile) { this.mode = mode; this.tokenizer = tokenizer; this.input_file = inputFile; this.components = components; this.reader = reader.clone(); this.reader.open(IOUtils.createFileInputStream(inputFile)); this.fout = IOUtils.createBufferedPrintStream(outputFile); } @Override public void run() { try { BinUtils.LOG.info(FileUtils.getBaseName(input_file)+"\n"); switch (reader.getReaderType()) { case TSV : process((TSVReader) reader, fout, mode, components); break; case RAW : process((RawReader) reader, fout, mode, components, tokenizer); break; case LINE: process((LineReader)reader, fout, mode, components, tokenizer); break; } reader.close(); fout.close(); } catch (Exception e) {e.printStackTrace();} } } public void process(RawReader reader, PrintStream fout, NLPMode mode, AbstractComponent[] components, AbstractTokenizer tokenizer) { List<List<String>> tokens = tokenizer.segmentize(reader.getInputStream()); int i, size = tokens.size(); DEPTree tree; for (i=0; i<size; i++) { tree = new DEPTree(tokens.get(i)); process(tree, fout, mode, components); } } public void process(LineReader reader, PrintStream fout, NLPMode mode, AbstractComponent[] components, AbstractTokenizer tokenizer) { DEPTree tree; String line; while ((line = reader.next()) != null) { tree = new DEPTree(tokenizer.tokenize(line)); process(tree, fout, mode, components); } } public void process(TSVReader reader, PrintStream fout, NLPMode mode, AbstractComponent[] components) { DEPTree tree; while ((tree = reader.next()) != null) process(tree, fout, mode, components); } public void process(DEPTree tree, PrintStream fout, NLPMode mode, AbstractComponent[] components) { // long st, et; for (AbstractComponent component : components) { // st = System.currentTimeMillis(); component.process(tree); // et = System.currentTimeMillis(); // time += et - st; } // tokens += tree.size() - 1; // trees++; fout.println(toString(tree, mode)+StringConst.NEW_LINE); } private AbstractComponent[] getComponents(TLanguage language, NLPMode mode, DecodeConfiguration config) { List<AbstractComponent> list = new ArrayList<>(); switch (mode) { case ner : list.add(NLPUtils.getNERecognizer(language, config.getModelPath(NLPMode.ner))); case srl : list.add(NLPUtils.getSRLabeler(language, config.getModelPath(NLPMode.srl), new SRLConfiguration(IOUtils.createFileInputStream(s_configurationFile)))); case dep : list.add(NLPUtils.getDEPParser(language, config.getModelPath(NLPMode.dep), new DEPConfiguration(IOUtils.createFileInputStream(s_configurationFile)))); case morph: list.add(NLPUtils.getMPAnalyzer(language)); case pos : list.add(NLPUtils.getPOSTagger(language, config.getModelPath(NLPMode.pos))); } return toReverseArray(list); } private AbstractComponent[] getComponents(TSVReader reader, TLanguage language, NLPMode mode, DecodeConfiguration config) { List<AbstractComponent> list = new ArrayList<>(); switch (mode) { case ner: if (!reader.hasNamedEntityTags()) list.add(NLPUtils.getNERecognizer(language, config.getModelPath(NLPMode.ner))); case srl: if (!reader.hasSemanticHeads()) list.add(NLPUtils.getSRLabeler(language, config.getModelPath(NLPMode.srl), new SRLConfiguration(IOUtils.createFileInputStream(s_configurationFile)))); case dep: if (!reader.hasDependencyHeads()) list.add(NLPUtils.getDEPParser(language, config.getModelPath(NLPMode.dep), new DEPConfiguration(IOUtils.createFileInputStream(s_configurationFile)))); case morph: if (!reader.hasLemmas()) list.add(NLPUtils.getMPAnalyzer(language)); case pos: if (!reader.hasPOSTags()) list.add(NLPUtils.getPOSTagger(language, config.getModelPath(NLPMode.pos))); } return toReverseArray(list); } private AbstractComponent[] toReverseArray(List<AbstractComponent> list) { AbstractComponent[] array = new AbstractComponent[list.size()]; Collections.reverse(list); return list.toArray(array); } private String toString(DEPTree tree, NLPMode mode) { switch (mode) { case ner : return tree.toString(); case srl : return tree.toString(DEPNode::toStringSRL); case dep : return tree.toString(DEPNode::toStringDEP); case morph: return tree.toString(DEPNode::toStringMorph); case pos : return tree.toString(DEPNode::toStringPOS); } throw new IllegalArgumentException("Invalid mode: "+mode.toString()); } static public void main(String[] args) { new NLPDecode(args); } }