TokensRegexRetokenizeDemo.java example

Explorer
Stanford-NLP-master
- CoreNLP-master
package edu.stanford.nlp.ling.tokensregex.demo;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;
import java.util.Properties;

/**
 * Demo illustrating how to use TokensRegexAnnotator for tweaks to tokenization
 */
public class TokensRegexRetokenizeDemo {

  public static void runPipeline(StanfordCoreNLP pipeline, String text, PrintWriter out) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);

    // An Annotation is a Map and you can get and use the various analyses individually.
    out.println();
    // The toString() method on an Annotation just prints the text of the Annotation
    // But you can see what is in it with other methods like toShorterString()
    out.println("The top level annotation");
    out.println(annotation.toShorterString());
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
      // Print out token annotations
      for (CoreLabel token:sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        // Print out words, lemma, ne, and normalized ne
        String word = token.get(CoreAnnotations.TextAnnotation.class);
        String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        String normalized = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
        out.println("token: " + "word="+word + ", lemma="+lemma + ", pos=" + pos + ", ne=" + ne + ", normalized=" + normalized);
      }
    }
    out.flush();
  }

  public static void main(String[] args) throws IOException {
    PrintWriter out;

    String rules;
    if (args.length > 0) {
      rules = args[0];
    } else {
      rules = "edu/stanford/nlp/ling/tokensregex/demo/rules/retokenize.rules.txt";
    }
    if (args.length > 2) {
      out = new PrintWriter(args[2]);
    } else {
      out = new PrintWriter(System.out);
    }

    String text;
    if (args.length > 1) {
      text = IOUtils.slurpFileNoExceptions(args[1]);
    } else {
      text = "Do we tokenize on hyphens? one-two-three-four.  How about dates? 03-16-2015.";
    }

    Properties propertiesDefaultTokenize = new Properties();
    propertiesDefaultTokenize.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipelineDefaultRetokenize = new StanfordCoreNLP();
    out.println("Default tokenization: ");
    runPipeline(pipelineDefaultRetokenize, text, out);

    Properties properties = new Properties();
    properties.setProperty("annotators", "tokenize,retokenize,ssplit,pos,lemma,ner");
    properties.setProperty("customAnnotatorClass.retokenize", "edu.stanford.nlp.pipeline.TokensRegexAnnotator");
    properties.setProperty("retokenize.rules", rules);
    StanfordCoreNLP pipelineWithRetokenize = new StanfordCoreNLP(properties);
    out.println();
    out.println("Always tokenize hyphens: ");
    runPipeline(pipelineWithRetokenize, text, out);
  }

}