/**
 * Title:        StanfordMaxEnt<p>
 * Description:  A Maximum Entropy Toolkit<p>
 * Copyright:    Copyright (c) Trustees of Leland Stanford Junior University<p>
 */
package edu.stanford.nlp.tagger.maxent;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.stats.IntCounter;
import edu.stanford.nlp.tagger.common.Tagger;
import edu.stanford.nlp.tagger.io.TaggedFileReader;
import edu.stanford.nlp.tagger.io.TaggedFileRecord;
import edu.stanford.nlp.util.Generics;

/**
 * Reads tagged data from a file and creates a dictionary.
 * The tagged data must consist of whitespace-separated items, each containing
 * a word and a tag separated by a delimiter character; the split is made at
 * the last occurrence of the delimiter character in the item.
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class ReadDataTagged {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ReadDataTagged.class);

  private final ArrayList<DataWordTag> v = new ArrayList<>();
  private int numElements = 0;
  private int totalSentences = 0;
  private int totalWords = 0;
  private final PairsHolder pairs;
  private final MaxentTagger maxentTagger;

  // TODO: make a class DataHolder that holds the dict, tags, pairs, etc., for the tagger
  // and pass it around

  protected ReadDataTagged(TaggerConfig config, MaxentTagger maxentTagger,
                           PairsHolder pairs) throws IOException {
    this.maxentTagger = maxentTagger;
    this.pairs = pairs;
    List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(config, config.getFile());
    Map<String, IntCounter<String>> wordTagCounts = Generics.newHashMap();
    for (TaggedFileRecord record : fileRecords) {
      loadFile(record.reader(), wordTagCounts);
    }
    // By counting the words and then filling the Dictionary, we can
    // make it so there are no calls that mutate the Dictionary or its
    // TagCount objects later
    maxentTagger.dict.fillWordTagCounts(wordTagCounts);
  }

  /** Frees the memory that is stored in this object by dropping the word-tag data. */
  void release() {
    v.clear();
  }

  DataWordTag get(int index) {
    return v.get(index);
  }

  private void loadFile(TaggedFileReader reader, Map<String, IntCounter<String>> wordTagCounts) {
    log.info("Loading tagged words from " + reader.filename());

    ArrayList<String> words = new ArrayList<>();
    ArrayList<String> tags = new ArrayList<>();
    int numSentences = 0;
    int numWords = 0;
    int maxLen = Integer.MIN_VALUE;
    int minLen = Integer.MAX_VALUE;

    for (List<TaggedWord> sentence : reader) {
      // Optionally normalize each word through the tagger's word function
      if (maxentTagger.wordFunction != null) {
        List<TaggedWord> newSentence = new ArrayList<>(sentence.size());
        for (TaggedWord word : sentence) {
          TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.apply(word.word()), word.tag());
          newSentence.add(newWord);
        }
        sentence = newSentence;
      }
      // Record each word under its tag, building the tag -> token-set map
      for (TaggedWord tw : sentence) {
        if (tw != null) {
          words.add(tw.word());
          tags.add(tw.tag());
          if (!maxentTagger.tagTokens.containsKey(tw.tag())) {
            maxentTagger.tagTokens.put(tw.tag(), Generics.<String>newHashSet());
          }
          maxentTagger.tagTokens.get(tw.tag()).add(tw.word());
        }
      }
      maxLen = Math.max(sentence.size(), maxLen);
      minLen = Math.min(sentence.size(), minLen);
      words.add(Tagger.EOS_WORD);
      tags.add(Tagger.EOS_TAG);
      numElements = numElements + sentence.size() + 1;
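      /*
       * Worked example (illustrative numbers, not from the original source):
       * suppose a previous sentence had 3 words and this one has 2. That
       * earlier sentence contributed positions 0..3 (3 words + EOS) to the
       * global PairsHolder sequence, so here totalWords + totalSentences
       * = 3 + 1 = 4. Each History built below therefore spans the window
       * [4, 6] (the start of this sentence through its EOS slot), with the
       * current position at 4 + i.
       */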
      // iterate over each position in the sentence, including the trailing EOS token
      for (int i = 0; i < sentence.size() + 1; i++) {
        History h = new History(totalWords + totalSentences,
                                totalWords + totalSentences + sentence.size(),
                                totalWords + totalSentences + i,
                                pairs, maxentTagger.extractors);
        String tag = tags.get(i);
        String word = words.get(i);
        pairs.add(new WordTag(word, tag));
        int y = maxentTagger.addTag(tag);
        DataWordTag dat = new DataWordTag(h, y, tag);
        v.add(dat);

        IntCounter<String> tagCounts = wordTagCounts.get(word);
        if (tagCounts == null) {
          tagCounts = new IntCounter<>();
          wordTagCounts.put(word, tagCounts);
        }
        tagCounts.incrementCount(tag, 1);
      }
      totalSentences++;
      totalWords += sentence.size();
      numSentences++;
      numWords += sentence.size();
      words.clear();
      tags.clear();
      if ((numSentences % 100000) == 0) {
        log.info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
      }
    }

    log.info("Read " + numWords + " words from " + reader.filename() + " [done].");
    log.info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
  }

  /**
   * Returns the number of tokens in the data read, which is the number of words
   * plus one end-of-sentence token per sentence.
   * @return The number of tokens in the data
   */
  public int getSize() {
    return numElements;
  }

}
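/*
 * Illustrative sketch (an assumption for exposition, not part of the original
 * source): with '/' as the delimiter, a tagged input line consumed by this
 * class looks like
 *
 *   The/DT cat/NN sat/VBD on/IN the/DT mat/NN ./.
 *
 * Each item is split at its last delimiter, so a token such as "1/2/CD"
 * yields the word "1/2" with the tag "CD". For a corpus of two sentences of
 * 3 and 2 words, getSize() returns (3 + 1) + (2 + 1) = 7, since every
 * sentence contributes its words plus one EOS token.
 */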