package edu.stanford.nlp.tagger.maxent.documentation;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;

import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/** This demo shows user-provided sentences (i.e., {@code List<HasWord>})
 *  being tagged by the tagger. The sentences are generated by direct use
 *  of the DocumentPreprocessor class.
 *
 *  @author Christopher Manning
 */
public class TaggerDemo2 {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TaggerDemo2.class);

  /** Not instantiable: this class only hosts the {@link #main(String[])} entry point. */
  private TaggerDemo2() {}

  /**
   * Tags every sentence of a UTF-8 text file and prints the tagged sentences
   * to standard output, then tags one hard-coded sentence and prints each of
   * its words whose tag starts with {@code "JJ"}.
   *
   * <p>If the argument count is not exactly two, a usage message is logged and
   * the method returns without doing any work.
   *
   * @param args {@code args[0]} is passed to the {@link MaxentTagger}
   *     constructor (the model file); {@code args[1]} is the path of the file
   *     to tag
   * @throws Exception if the input file cannot be opened or read, or if any of
   *     the library calls made here propagates an exception
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      log.info("usage: java TaggerDemo2 modelFile fileToTag");
      return;
    }

    MaxentTagger tagger = new MaxentTagger(args[0]);
    // NOTE(review): "untokenizable=noneKeep" presumably keeps untokenizable
    // characters without logging warnings -- confirm against PTBTokenizer docs.
    TokenizerFactory<CoreLabel> ptbTokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");

    // try-with-resources guarantees the input file handle is released and the
    // output writer is flushed and closed even when tagging throws part-way.
    // As before, closing the writer also closes the underlying System.out.
    try (BufferedReader r = new BufferedReader(
             new InputStreamReader(new FileInputStream(args[1]), StandardCharsets.UTF_8));
         PrintWriter pw = new PrintWriter(
             new OutputStreamWriter(System.out, StandardCharsets.UTF_8))) {

      DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
      documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);

      // Iterating the preprocessor yields one sentence at a time.
      for (List<HasWord> sentence : documentPreprocessor) {
        List<TaggedWord> tSentence = tagger.tagSentence(sentence);
        pw.println(SentenceUtils.listToString(tSentence, false));
      }

      // Print the adjectives in one more sentence. This shows how to get at
      // words and tags in a tagged sentence.
      List<HasWord> sent = SentenceUtils.toWordList(
          "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
      List<TaggedWord> taggedSent = tagger.tagSentence(sent);
      for (TaggedWord tw : taggedSent) {
        if (tw.tag().startsWith("JJ")) {
          pw.println(tw.word());
        }
      }
    }
  }

}