package edu.stanford.nlp.tagger.util;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.io.TaggedFileReader;
import edu.stanford.nlp.tagger.io.TaggedFileRecord;
import edu.stanford.nlp.tagger.maxent.TaggerConfig;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Implements Chris's heuristic for when a closed tag class can be
 * treated as a closed tag.  You count how many different words in the
 * class you see in the first X% of the training data, then make sure
 * you don't see any new words in the rest of the training or test data.
 * <br>
 * This handles tagged training/test data in any format handled by the
 * tagger ({@link edu.stanford.nlp.tagger.maxent.MaxentTagger}).  Files
 * are specified as a comma-separated list via the flag -trainFile or
 * -testFile.  Closed tags are specified as a space-separated list via
 * the flag -closedTags.
 * <br>
 * CountClosedTags first reads each training file to count how many
 * sentences are in it.  It then reads the first -trainingRatio
 * fraction of those sentences and keeps track of which words show up
 * for each closed tag.  Next, it reads the rest of the training file
 * and keeps track of which words show up that did not appear in the
 * first part of the training data.  Finally, it reads all of the test
 * files, once again tracking the words that did not appear in the
 * first part of the training data.
 * <br>
 * CountClosedTags then outputs, for each tag, the number of unique
 * words seen in the first -trainingRatio fraction of the training data
 * and the total number of unique words seen anywhere.  If the
 * -printWords flag is set to true, it also prints out the sets of
 * observed words.
 *
 * @author John Bauer
 */
public class CountClosedTags {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(CountClosedTags.class);

  /**
   * Which tags to look for
   */
  Set<String> closedTags;

  /**
   * Words seen in the first trainingRatio fraction of the trainFiles
   */
  Map<String, Set<String>> trainingWords = Generics.newHashMap();

  /**
   * Words seen in either trainFiles or testFiles
   */
  Map<String, Set<String>> allWords = Generics.newHashMap();

  static final double DEFAULT_TRAINING_RATIO = 2.0 / 3.0;

  /**
   * How much of each training file to count for trainingWords
   */
  final double trainingRatio;

  /**
   * Whether or not the final output should print the words
   */
  final boolean printWords;
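  // Illustrative arithmetic for the trainingRatio split performed in
  // countTrainingTags below (the sentence counts are hypothetical): with the
  // default trainingRatio of 2/3, a training file containing 300 sentences
  // contributes its first (int) (300 * trainingRatio) = 200 sentences to
  // trainingWords, while all 300 sentences are counted in allWords.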
  /**
   * Tag separator.
   */
  private static final String tagSeparator = "_";

  // intended to be a standalone program, not a class
  private CountClosedTags(Properties props) {
    String tagList = props.getProperty(CLOSED_TAGS_PROPERTY);
    if (tagList != null) {
      closedTags = new TreeSet<>();
      String[] pieces = tagList.split("\\s+");
      Collections.addAll(closedTags, pieces);
    } else {
      closedTags = null;
    }
    if (props.containsKey(TRAINING_RATIO_PROPERTY)) {
      trainingRatio = Double.parseDouble(props.getProperty(TRAINING_RATIO_PROPERTY));
    } else {
      trainingRatio = DEFAULT_TRAINING_RATIO;
    }
    printWords = Boolean.parseBoolean(props.getProperty(PRINT_WORDS_PROPERTY, "false"));
  }

  /**
   * Count how many sentences there are in the given file
   */
  private static int countSentences(TaggedFileRecord file) throws IOException {
    int count = 0;
    for (List<TaggedWord> line : file.reader()) {
      ++count;
    }
    return count;
  }

  /**
   * Given a sentence, add each word whose tag is one of the closed
   * tags (or every word, if no closed tags were specified) to the
   * given tagWordMap.
   */
  void addTaggedWords(List<TaggedWord> line, Map<String, Set<String>> tagWordMap) {
    for (TaggedWord taggedWord : line) {
      String word = taggedWord.word();
      String tag = taggedWord.tag();
      if (closedTags == null || closedTags.contains(tag)) {
        if (!tagWordMap.containsKey(tag)) {
          tagWordMap.put(tag, new TreeSet<>());
        }
        tagWordMap.get(tag).add(word);
      }
    }
  }

  /**
   * Count trainingRatio of the sentences for both trainingWords and
   * allWords, and count the rest for just allWords
   */
  void countTrainingTags(TaggedFileRecord file) throws IOException {
    int sentences = countSentences(file);
    int trainSentences = (int) (sentences * trainingRatio);
    TaggedFileReader reader = file.reader();
    List<TaggedWord> line;
    for (int i = 0; i < trainSentences && reader.hasNext(); ++i) {
      line = reader.next();
      addTaggedWords(line, trainingWords);
      addTaggedWords(line, allWords);
    }
    while (reader.hasNext()) {
      line = reader.next();
      addTaggedWords(line, allWords);
    }
  }

  /**
   * Count all the words in the given file for just allWords
   */
  void countTestTags(TaggedFileRecord file) throws IOException {
    for (List<TaggedWord> line : file.reader()) {
      addTaggedWords(line, allWords);
    }
  }

  /**
   * Print out the results found
   */
  void report() {
    List<String> successfulTags = new ArrayList<>();
    Set<String> tags = new TreeSet<>();
    tags.addAll(allWords.keySet());
    tags.addAll(trainingWords.keySet());
    if (closedTags != null) {
      tags.addAll(closedTags);
    }
    for (String tag : tags) {
      int numTraining = (trainingWords.containsKey(tag) ? trainingWords.get(tag).size() : 0);
      int numTotal = (allWords.containsKey(tag) ? allWords.get(tag).size() : 0);
      if (numTraining == numTotal && numTraining > 0) {
        successfulTags.add(tag);
      }
      System.out.println(tag + " " + numTraining + " " + numTotal);
      if (printWords) {
        Set<String> trainingSet = trainingWords.get(tag);
        if (trainingSet == null) {
          trainingSet = Collections.emptySet();
        }
        Set<String> allSet = allWords.get(tag);
        if (allSet == null) {
          // guard against tags listed in closedTags but never seen in any data
          allSet = Collections.emptySet();
        }
        for (String word : trainingSet) {
          System.out.print(" " + word);
        }
        if (trainingSet.size() < allSet.size()) {
          System.out.println();
          System.out.print(" *");
          for (String word : allSet) {
            if (!trainingSet.contains(word)) {
              System.out.print(" " + word);
            }
          }
        }
        System.out.println();
      }
    }
    System.out.println(successfulTags);
  }
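  /*
   * Illustrative output from report() when -printWords is not set (the tags
   * and counts below are hypothetical).  Each line gives the tag, the number
   * of distinct words seen in the first trainingRatio fraction of the
   * training data, and the number of distinct words seen anywhere; the final
   * line lists the tags whose early training words covered every word
   * observed later:
   *
   *   CC 38 38
   *   DT 41 45
   *   [CC]
   */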
  public static final String TEST_FILE_PROPERTY = "testFile";
  public static final String TRAIN_FILE_PROPERTY = "trainFile";
  public static final String CLOSED_TAGS_PROPERTY = "closedTags";
  public static final String TRAINING_RATIO_PROPERTY = "trainingRatio";
  public static final String PRINT_WORDS_PROPERTY = "printWords";

  private static final Set<String> knownArgs =
      Generics.newHashSet(Arrays.asList(TEST_FILE_PROPERTY,
                                        TRAIN_FILE_PROPERTY,
                                        CLOSED_TAGS_PROPERTY,
                                        TRAINING_RATIO_PROPERTY,
                                        PRINT_WORDS_PROPERTY,
                                        TaggerConfig.ENCODING_PROPERTY,
                                        TaggerConfig.TAG_SEPARATOR_PROPERTY));

  private static void help(String error) {
    if (error != null && !error.equals("")) {
      log.info(error);
    }
    System.exit(2);
  }

  private static void checkArgs(Properties props) {
    if (!props.containsKey(TRAIN_FILE_PROPERTY)) {
      help("No " + TRAIN_FILE_PROPERTY + " specified");
    }
    for (String arg : props.stringPropertyNames()) {
      if (!knownArgs.contains(arg)) {
        help("Unknown arg " + arg);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    System.setOut(new PrintStream(System.out, true, "UTF-8"));
    System.setErr(new PrintStream(System.err, true, "UTF-8"));

    Properties config = StringUtils.argsToProperties(args);
    checkArgs(config);

    CountClosedTags cct = new CountClosedTags(config);
    String trainFiles = config.getProperty(TRAIN_FILE_PROPERTY);
    String testFiles = config.getProperty(TEST_FILE_PROPERTY);
    List<TaggedFileRecord> files = TaggedFileRecord.createRecords(config, trainFiles);
    for (TaggedFileRecord file : files) {
      cct.countTrainingTags(file);
    }
    if (testFiles != null) {
      files = TaggedFileRecord.createRecords(config, testFiles);
      for (TaggedFileRecord file : files) {
        cct.countTestTags(file);
      }
    }
    cct.report();
  }

}
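/*
 * Example invocation (the data file names are hypothetical; the flag names
 * come from the property constants above, and -trainingRatio / -printWords
 * may be omitted to use the defaults of 2/3 and false):
 *
 *   java edu.stanford.nlp.tagger.util.CountClosedTags \
 *       -trainFile train1.tagged,train2.tagged \
 *       -testFile test.tagged \
 *       -closedTags "IN DT CC" \
 *       -trainingRatio 0.67 \
 *       -printWords true
 */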