package edu.stanford.nlp.sentiment; import edu.stanford.nlp.util.logging.Redwood; import java.io.BufferedReader; import java.io.IOException; import java.io.FileOutputStream; import java.io.PrintStream; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Properties; import edu.stanford.nlp.ling.SentenceUtils; import org.ejml.simple.SimpleMatrix; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.Label; import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.trees.MemoryTreebank; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Generics; /** * A wrapper class which creates a suitable pipeline for the sentiment * model and processes raw text. * * The main program has the following options: <br> * <code>-parserModel</code> Which parser model to use, defaults to englishPCFG.ser.gz <br> * <code>-sentimentModel</code> Which sentiment model to use, defaults to sentiment.ser.gz <br> * <code>-file</code> Which file to process. <br> * <code>-fileList</code> A comma separated list of files to process. <br> * <code>-stdin</code> Read one line at a time from stdin. <br> * <code>-output</code> pennTrees: Output trees with scores at each binarized node. vectors: Number tree nodes and print out the vectors. probabilities: Output the scores for different labels for each node. Defaults to printing just the root. <br> * <code>-filterUnknown</code> Remove unknown trees from the input. Only applies to TREES input, in which case the trees must be binarized with sentiment labels <br> * <code>-help</code> Print out help <br> * * @author John Bauer */ public class SentimentPipeline { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(SentimentPipeline.class); private static final NumberFormat NF = new DecimalFormat("0.0000"); enum Output { PENNTREES, VECTORS, ROOT, PROBABILITIES } enum Input { TEXT, TREES } private SentimentPipeline() {} // static methods /** * Sets the labels on the tree (except the leaves) to be the integer * value of the sentiment prediction. Makes it easy to print out * with Tree.toString() */ static void setSentimentLabels(Tree tree) { if (tree.isLeaf()) { return; } for (Tree child : tree.children()) { setSentimentLabels(child); } Label label = tree.label(); if (!(label instanceof CoreLabel)) { throw new IllegalArgumentException("Required a tree with CoreLabels"); } CoreLabel cl = (CoreLabel) label; cl.setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree))); } /** * Sets the labels on the tree to be the indices of the nodes. * Starts counting at the root and does a postorder traversal. */ static int setIndexLabels(Tree tree, int index) { if (tree.isLeaf()) { return index; } tree.label().setValue(Integer.toString(index)); index++; for (Tree child : tree.children()) { index = setIndexLabels(child, index); } return index; } /** * Outputs the vectors from the tree. Counts the tree nodes the * same as setIndexLabels. */ static int outputTreeVectors(PrintStream out, Tree tree, int index) { if (tree.isLeaf()) { return index; } out.print(" " + index + ":"); SimpleMatrix vector = RNNCoreAnnotations.getNodeVector(tree); for (int i = 0; i < vector.getNumElements(); ++i) { out.print(" " + NF.format(vector.get(i))); } out.println(); index++; for (Tree child : tree.children()) { index = outputTreeVectors(out, child, index); } return index; } /** * Outputs the scores from the tree. Counts the tree nodes the * same as setIndexLabels. */ static int outputTreeScores(PrintStream out, Tree tree, int index) { if (tree.isLeaf()) { return index; } out.print(" " + index + ":"); SimpleMatrix vector = RNNCoreAnnotations.getPredictions(tree); for (int i = 0; i < vector.getNumElements(); ++i) { out.print(" " + NF.format(vector.get(i))); } out.println(); index++; for (Tree child : tree.children()) { index = outputTreeScores(out, child, index); } return index; } /** * Outputs a tree using the output style requested */ static void outputTree(PrintStream out, CoreMap sentence, List<Output> outputFormats) { Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class); for (Output output : outputFormats) { switch (output) { case PENNTREES: { Tree copy = tree.deepCopy(); setSentimentLabels(copy); out.println(copy); break; } case VECTORS: { Tree copy = tree.deepCopy(); setIndexLabels(copy, 0); out.println(copy); outputTreeVectors(out, tree, 0); break; } case ROOT: { out.println(" " + sentence.get(SentimentCoreAnnotations.SentimentClass.class)); break; } case PROBABILITIES: { Tree copy = tree.deepCopy(); setIndexLabels(copy, 0); out.println(copy); outputTreeScores(out, tree, 0); break; } default: throw new IllegalArgumentException("Unknown output format " + output); } } } static final String DEFAULT_TLPP_CLASS = "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams"; static void help() { log.info("Known command line arguments:"); log.info(" -sentimentModel <model>: Which model to use"); log.info(" -parserModel <model>: Which parser to use"); log.info(" -file <filename>: Which file to process"); log.info(" -fileList <file>,<file>,...: Comma separated list of files to process. Output goes to file.out"); log.info(" -stdin: Process stdin instead of a file"); log.info(" -input <format>: Which format to input, TEXT or TREES. Will not process stdin as trees. If trees are not already binarized, they will be binarized with -tlppClass's headfinder, which means they must have labels in that treebank's tagset."); log.info(" -output <format>: Which format to output, PENNTREES, VECTORS, PROBABILITIES, or ROOT. Multiple formats can be specified as a comma separated list."); log.info(" -filterUnknown: remove unknown trees from the input. Only applies to TREES input, in which case the trees must be binarized with sentiment labels"); log.info(" -tlppClass: a class to use for building the binarizer if using non-binarized TREES as input. Defaults to " + DEFAULT_TLPP_CLASS); } /** * Reads an annotation from the given filename using the requested input. */ public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) { switch (inputFormat) { case TEXT: { String text = IOUtils.slurpFileNoExceptions(filename); Annotation annotation = new Annotation(text); tokenizer.annotate(annotation); List<Annotation> annotations = Generics.newArrayList(); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { Annotation nextAnnotation = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)); nextAnnotation.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence)); annotations.add(nextAnnotation); } return annotations; } case TREES: { List<Tree> trees; if (filterUnknown) { trees = SentimentUtils.readTreesWithGoldLabels(filename); trees = SentimentUtils.filterUnknownRoots(trees); } else { trees = Generics.newArrayList(); MemoryTreebank treebank = new MemoryTreebank("utf-8"); treebank.loadPath(filename, null); for (Tree tree : treebank) { trees.add(tree); } } List<Annotation> annotations = Generics.newArrayList(); for (Tree tree : trees) { CoreMap sentence = new Annotation(SentenceUtils.listToString(tree.yield())); sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree); List<CoreMap> sentences = Collections.singletonList(sentence); Annotation annotation = new Annotation(""); annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences); annotations.add(annotation); } return annotations; } default: throw new IllegalArgumentException("Unknown format " + inputFormat); } } /** Runs the tree-based sentiment model on some text. */ public static void main(String[] args) throws IOException { String parserModel = null; String sentimentModel = null; String filename = null; String fileList = null; boolean stdin = false; boolean filterUnknown = false; List<Output> outputFormats = Collections.singletonList(Output.ROOT); Input inputFormat = Input.TEXT; String tlppClass = DEFAULT_TLPP_CLASS; for (int argIndex = 0; argIndex < args.length; ) { if (args[argIndex].equalsIgnoreCase("-sentimentModel")) { sentimentModel = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-parserModel")) { parserModel = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-file")) { filename = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-fileList")) { fileList = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-stdin")) { stdin = true; argIndex++; } else if (args[argIndex].equalsIgnoreCase("-input")) { inputFormat = Input.valueOf(args[argIndex + 1].toUpperCase()); argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-output")) { String[] formats = args[argIndex + 1].split(","); outputFormats = new ArrayList<>(); for (String format : formats) { outputFormats.add(Output.valueOf(format.toUpperCase())); } argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-filterUnknown")) { filterUnknown = true; argIndex++; } else if (args[argIndex].equalsIgnoreCase("-tlppClass")) { tlppClass = args[argIndex + 1]; argIndex += 2; } else if (args[argIndex].equalsIgnoreCase("-help")) { help(); System.exit(0); } else { log.info("Unknown argument " + args[argIndex + 1]); help(); throw new IllegalArgumentException("Unknown argument " + args[argIndex + 1]); } } // We construct two pipelines. One handles tokenization, if // necessary. The other takes tokenized sentences and converts // them to sentiment trees. Properties pipelineProps = new Properties(); Properties tokenizerProps = null; if (sentimentModel != null) { pipelineProps.setProperty("sentiment.model", sentimentModel); } if (parserModel != null) { pipelineProps.setProperty("parse.model", parserModel); } if (inputFormat == Input.TREES) { pipelineProps.setProperty("annotators", "binarizer, sentiment"); pipelineProps.setProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator"); pipelineProps.setProperty("binarizer.tlppClass", tlppClass); pipelineProps.setProperty("enforceRequirements", "false"); } else { pipelineProps.setProperty("annotators", "parse, sentiment"); pipelineProps.setProperty("parse.binaryTrees", "true"); pipelineProps.setProperty("enforceRequirements", "false"); tokenizerProps = new Properties(); tokenizerProps.setProperty("annotators", "tokenize, ssplit"); } if (stdin && tokenizerProps != null) { tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true"); } int count = 0; if (filename != null) count++; if (fileList != null) count++; if (stdin) count++; if (count > 1) { throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin"); } if (count == 0) { throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin"); } StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps); StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps); if (filename != null) { // Process a file. The pipeline will do tokenization, which // means it will split it into sentences as best as possible // with the tokenizer. List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown); for (Annotation annotation : annotations) { pipeline.annotate(annotation); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { System.out.println(sentence); outputTree(System.out, sentence, outputFormats); } } } else if (fileList != null) { // Process multiple files. The pipeline will do tokenization, // which means it will split it into sentences as best as // possible with the tokenizer. Output will go to filename.out // for each file. for (String file : fileList.split(",")) { List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown); FileOutputStream fout = new FileOutputStream(file + ".out"); PrintStream pout = new PrintStream(fout); for (Annotation annotation : annotations) { pipeline.annotate(annotation); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { pout.println(sentence); outputTree(pout, sentence, outputFormats); } } pout.flush(); fout.close(); } } else { // Process stdin. Each line will be treated as a single sentence. log.info("Reading in text from stdin."); log.info("Please enter one sentence per line."); log.info("Processing will end when EOF is reached."); BufferedReader reader = IOUtils.readerFromStdin("utf-8"); for (String line; (line = reader.readLine()) != null; ) { line = line.trim(); if ( ! line.isEmpty()) { Annotation annotation = tokenizer.process(line); pipeline.annotate(annotation); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { outputTree(System.out, sentence, outputFormats); } } else { // Output blank lines for blank lines so the tool can be // used for line-by-line text processing System.out.println(); } } } } }