// TaggerExperiments.java example
//
// Explorer
// CoreNLP-master
// TaggerExperiments -- StanfordMaxEnt, A Maximum Entropy Toolkit
// Copyright (c) 2002-2008 Leland Stanford Junior University
//
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    Support/Questions: java-nlp-user@lists.stanford.edu
//    Licensing: java-nlp-support@lists.stanford.edu
//    http://www-nlp.stanford.edu/software/tagger.shtml

package edu.stanford.nlp.tagger.maxent;

import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.maxent.Experiments;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Map;
import java.util.Set;
import java.util.Arrays;


/**
 * This class represents the training samples. It can return statistics of
 * them, for example the frequency of each x or y in the training data.
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class TaggerExperiments extends Experiments  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TaggerExperiments.class);

  /**
   * When true, getFeaturesNew prints the possible tags of every word it examines to
   * stderr. The flag is only consulted when maxentTagger.possibleTagsOnly is set.
   */
  private static final boolean DEBUG = true;
  /** Extractor output meaning "no feature here": the template-adding methods skip such values. */
  private static final String zeroSt = "0";

  /** The features built by getFeaturesNew; created in the constructor with a reference to this object. */
  private final TaggerFeatures feats;
  /** Every distinct feature key collected through add(FeatureKey). */
  private final Set<FeatureKey> sTemplates = Generics.newHashSet();
  /** Histories of the training tokens; the index returned by its add method is what vArray stores as x. */
  private final HistoryTable tHistories = new HistoryTable();

  /** Number of general extractors: maxentTagger.extractors.size() at construction time. */
  private final int numFeatsGeneral;
  /** numFeatsGeneral plus maxentTagger.extractorsRare.size(). */
  private final int numFeatsAll;

  /** The tagger whose extractors, dictionary, tag set and thresholds drive feature collection. */
  private final MaxentTagger maxentTagger;

  /** Filled by hashHistories and queried by getFeaturesNew for x values and file positions of a feature key. */
  private final TemplateHash tFeature;

  /** Allocated as [xSize][ySize] in getFeaturesNew; each cell is incremented once per feature recorded for that (x, y). */
  private byte[][] fnumArr;



  // This constructor is only used by unit tests.
  TaggerExperiments(MaxentTagger maxentTagger) {
    this.maxentTagger = maxentTagger;
    this.tFeature = new TemplateHash(maxentTagger);
    numFeatsGeneral = maxentTagger.extractors.size();
    numFeatsAll = numFeatsGeneral + maxentTagger.extractorsRare.size();
    feats = new TaggerFeatures(this);
  }

  /**
   * Gathers feature statistics from the training data described by the
   * TaggerConfig. This is the start of the training process.
   * <p>
   * Steps, in order: read the tagged data, record one (history index, tag index)
   * row per token in vArray while collecting general and rare feature keys,
   * call ptilde(), publish xSize and ySize to the tagger, hash the histories,
   * and finally build the features via getFeaturesNew().
   *
   * @param config Configuration handed to ReadDataTagged to locate the training data
   * @param maxentTagger The tagger being trained
   * @throws IOException Declared by the original signature for failures while reading the data
   */
  protected TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger) throws IOException {
    this(maxentTagger);

    log.info("TaggerExperiments: adding word/tags");
    PairsHolder pairs = new PairsHolder();
    ReadDataTagged taggedData = new ReadDataTagged(config, maxentTagger, pairs);
    vArray = new int[taggedData.getSize()][2];

    initTemplatesNew();
    log.info("Featurizing tagged data tokens...");
    final int numTokens = taggedData.getSize();
    for (int token = 0; token < numTokens; token++) {
      DataWordTag sample = taggedData.get(token);
      String tag = sample.getY();
      History history = sample.getHistory();
      int historyIndex = tHistories.add(history);
      int tagIndex = sample.getYInd();

      addTemplatesNew(history, tag);
      addRareTemplatesNew(history, tag);

      int[] row = vArray[token];
      row[0] = historyIndex;
      row[1] = tagIndex;
    }
    log.info("Featurized " + taggedData.getSize() + " data tokens [done].");
    taggedData.release();

    ptilde();
    maxentTagger.xSize = xSize;
    maxentTagger.ySize = ySize;
    log.info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);

    hashHistories();

    // The histories and pairs are still needed by getFeaturesNew when tags are
    // restricted to occurring or possible ones; otherwise free them now.
    boolean restrictTags = maxentTagger.occurringTagsOnly || maxentTagger.possibleTagsOnly;
    if (!restrictTags) {
      tHistories.release();
      pairs.clear();
    }

    getFeaturesNew();
  }


  /**
   * Returns the feature container owned by this object.
   *
   * @return The TaggerFeatures instance created in the constructor
   */
  public TaggerFeatures getTaggerFeatures() {
    return this.feats;
  }


  /**
   * Records a FeatureKey in the set of known FeatureKeys.
   *
   * @param s The feature key to record
   * @return true if the key was newly added, false if it was already known
   */
  protected boolean add(FeatureKey s) {
    // Set.add reports exactly this: true only when the element was not yet present.
    return sTemplates.add(s);
  }

  /**
   * Returns the per-(x, y) feature count table. It is the internal array itself,
   * not a copy, and is null until getFeaturesNew has allocated it.
   *
   * @return The fnumArr array
   */
  byte[][] getFnumArr() {
    return this.fnumArr;
  }

  /**
   * Turns the collected feature keys into TaggerFeature objects.
   * <p>
   * For every key in sTemplates the method looks up the history indices (x values)
   * on which the key's template value fires, sums their px counts as evidence, and
   * keeps the key only if {@link #populated(int, int)} accepts that evidence. For a
   * kept key it records the supporting x values (written to a scratch file unless
   * positions for the key are already known), bumps fnumArr for each (x, y), adds
   * the feature to feats and stores its running number in maxentTagger.fAssociations.
   * Finally the scratch file is read back into feats.xIndexed and some statistics
   * about fnumArr are logged.
   * <p>
   * This method uses and deletes a file tempXXXXXX.x in the current directory!
   * The file is closed and deleted even when feature construction fails.
   *
   * @throws RuntimeIOException wrapping any exception raised while building the features
   */
  private void getFeaturesNew() {
    // NOTE(review): fnumArr cells are bytes, so more than 127 increments of one
    // (x, y) cell would silently wrap to a negative value. Whether that can occur
    // depends on the number of extractors -- confirm before adding many more.
    try {
      log.info("TaggerExperiments.getFeaturesNew: initializing fnumArr.");
      fnumArr = new byte[xSize][ySize]; // number of active features per (x, y) pair
      File hFile = File.createTempFile("temp", ".x", new File("./"));
      try (RandomAccessFile hF = new RandomAccessFile(hFile, "rw")) {
        log.info("  length of sTemplates keys: " + sTemplates.size());
        log.info("getFeaturesNew adding features ...");
        int current = 0;   // next free position in the scratch file, counted in ints
        int numFeats = 0;  // running number of populated features
        final boolean VERBOSE = false;
        for (FeatureKey fK : sTemplates) {
          int numF = fK.num;
          Pair<Integer, String> wT = new Pair<>(numF, fK.val);
          int[] xValues = tFeature.getXValues(wT);
          if (xValues == null) {
            log.info("  xValues is null: " + fK);
            continue;
          }

          // First pass: how much evidence supports this key?
          int numEvidence = 0;
          int y = maxentTagger.tags.getIndex(fK.tag);
          for (int xValue : xValues) {
            if (tagAllowedForHistory(xValue, fK.tag, DEBUG)) {
              numEvidence += this.px[xValue];
            }
          }
          if (!populated(numF, numEvidence)) {
            continue;
          }

          int[] positions = tFeature.getPositions(fK);
          if (maxentTagger.occurringTagsOnly || maxentTagger.possibleTagsOnly) { // TODO
            // With restricted tags the support depends on the tag, so stored
            // positions are never reused.
            positions = null;
          }

          if (positions == null) {
            // Second pass: write the supporting x values to the file and create
            // a TaggerFeature covering that range.
            int numElements = 0;
            for (int x : xValues) {
              if (!tagAllowedForHistory(x, fK.tag, false)) {
                continue;
              }
              numElements++;
              hF.writeInt(x);
              fnumArr[x][y]++;
            }
            TaggerFeature tF = new TaggerFeature(current, current + numElements - 1, fK,
                                                 maxentTagger.getTagIndex(fK.tag), this);
            tFeature.addPositions(current, current + numElements - 1, fK);
            current = current + numElements;
            feats.add(tF);
            if (VERBOSE) {
              log.info("  added feature with key " + fK + " has support " + numElements);
            }
          } else {
            // The x values for this template value are already in the file;
            // only count them and point the feature at the stored range.
            for (int x : xValues) {
              fnumArr[x][y]++;
            }
            TaggerFeature tF = new TaggerFeature(positions[0], positions[1], fK,
                                                 maxentTagger.getTagIndex(fK.tag), this);
            feats.add(tF);
            if (VERBOSE) {
              log.info("  added feature with key " + fK + " has support " + xValues.length);
            }
          }

          // Make sure there is an association map for every extractor number up to fK.num.
          while (maxentTagger.fAssociations.size() <= fK.num) {
            maxentTagger.fAssociations.add(Generics.<String, int[]>newHashMap());
          }
          Map<String, int[]> fValueAssociations = maxentTagger.fAssociations.get(fK.num);
          int[] fTagAssociations = fValueAssociations.get(fK.val);
          if (fTagAssociations == null) {
            fTagAssociations = new int[ySize];
            Arrays.fill(fTagAssociations, -1); // -1 marks "no feature for this tag"
            fValueAssociations.put(fK.val, fTagAssociations);
          }
          fTagAssociations[y] = numFeats;

          numFeats++;
        } // foreach FeatureKey fK

        // Read the file back into one int array stored in feats.
        tFeature.release();
        feats.xIndexed = new int[current];
        hF.seek(0);
        for (int i = 0; i < current; i++) {
          feats.xIndexed[i] = hF.readInt();
        }
        log.info("  total feats: " + sTemplates.size() + ", populated: " + numFeats);
      } finally {
        // The file has been closed by try-with-resources at this point.
        if (!hFile.delete()) {
          hFile.deleteOnExit();
        }
      }

      logFnumArrStatistics();
      log.info("end getFeaturesNew.");
    } catch (Exception e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Applies the occurringTagsOnly / possibleTagsOnly restrictions to one
   * (history, tag) pair. When neither option is set every pair is allowed and
   * the history table is not consulted.
   *
   * @param x Index of the history in tHistories
   * @param tag The tag of the feature key under consideration
   * @param printPossibleTags Whether to print the word's possible tags to stderr
   *                          (only has an effect when possibleTagsOnly is set)
   * @return true if the pair passes every active restriction
   */
  private boolean tagAllowedForHistory(int x, String tag, boolean printPossibleTags) {
    if (maxentTagger.occurringTagsOnly) {
      // check whether the current word in x has occurred with the tag
      String word = ExtractorFrames.cWord.extract(tHistories.getHistory(x));
      if (maxentTagger.dict.getCount(word, tag) == 0) {
        return false;
      }
    }
    if (maxentTagger.possibleTagsOnly) {
      String word = ExtractorFrames.cWord.extract(tHistories.getHistory(x));
      String[] tags = maxentTagger.dict.getTags(word);
      Set<String> s = Generics.newHashSet(Arrays.asList(maxentTagger.tags.deterministicallyExpandTags(tags)));
      if (printPossibleTags) {
        System.err.printf("possible tags for %s: %s\n", word, Arrays.toString(s.toArray()));
      }
      if (!s.contains(tag)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Logs summary statistics about fnumArr: the largest cell, the largest number
   * of non-zero y values for a single x, and the counts of non-zero and zero cells.
   * Cell totals are computed in long so that xSize * ySize cannot overflow.
   */
  private void logFnumArrStatistics() {
    int max = 0;
    int maxGt = 0;
    long numZeros = 0;
    for (int x = 0; x < xSize; x++) {
      int numGt = 0;
      for (int y = 0; y < ySize; y++) {
        if (fnumArr[x][y] > 0) {
          numGt++;
          if (max < fnumArr[x][y]) {
            max = fnumArr[x][y];
          }
        } else {
          numZeros++;
        }
      }
      if (maxGt < numGt) {
        maxGt = numGt;
      }
    }

    log.info("  Max features per x,y pair: " + max);
    log.info("  Max non-zero y values for an x: " + maxGt);
    log.info("  Number of non-zero feature x,y pairs: " +
        ((long) xSize * ySize - numZeros));
    log.info("  Number of zero feature x,y pairs: " + numZeros);
  }


  /**
   * Feeds every history into tFeature in two full passes over the x indices:
   * first tFeature.addPrev(template, history) for each applicable template, then
   * tFeature.add(template, history, x). A history whose current word is rare
   * uses all templates (general plus rare); any other history uses only the
   * general ones.
   */
  private void hashHistories() {
    final int generalCount = maxentTagger.extractors.size();
    final int allCount = generalCount + maxentTagger.extractorsRare.size();

    log.info("Hashing histories ...");
    for (int histIdx = 0; histIdx < xSize; histIdx++) {
      History history = tHistories.getHistory(histIdx);
      int templateCount = generalCount;
      if (maxentTagger.isRare(ExtractorFrames.cWord.extract(history))) {
        templateCount = allCount;
      }
      for (int template = 0; template < templateCount; template++) {
        tFeature.addPrev(template, history);
      }
    }
    log.info("Hashed " + xSize + " histories.");

    // Second pass: now for the populated ones.
    log.info("Hashing populated histories ...");
    for (int histIdx = 0; histIdx < xSize; histIdx++) {
      History history = tHistories.getHistory(histIdx);
      int templateCount = generalCount;
      if (maxentTagger.isRare(ExtractorFrames.cWord.extract(history))) {
        templateCount = allCount;
      }
      for (int template = 0; template < templateCount; template++) {
        tFeature.add(template, history, histIdx); // write this to check whether to add
      }
    }
    log.info("Hashed populated histories.");
  }


  /**
   * Instance-level shortcut for {@link #isPopulated(int, int, MaxentTagger)}
   * using this object's tagger.
   *
   * @param fNo The feature (extractor) number
   * @param size The amount of evidence observed for the feature
   * @return Whether the evidence exceeds the threshold that applies to fNo
   */
  protected boolean populated(int fNo, int size) {
    return TaggerExperiments.isPopulated(fNo, size, this.maxentTagger);
  }

  /**
   * Decides whether a feature has enough evidence to be kept. The comparison is
   * strict: the evidence must be greater than the applicable threshold.
   *
   * @param fNo The feature (extractor) number
   * @param size The amount of evidence observed for the feature
   * @param maxentTagger Supplies the thresholds and the number of general extractors
   * @return true if size exceeds the threshold selected by fNo
   */
  protected static boolean isPopulated(int fNo, int size, MaxentTagger maxentTagger) {
    // Feature number 0 is hard-coded as the current word feature, which has a special threshold
    if (fNo == 0) {
      return size > maxentTagger.curWordMinFeatureThresh;
    }
    // Any other general extractor
    if (fNo < maxentTagger.extractors.size()) {
      return size > maxentTagger.minFeatureThresh;
    }
    // Numbers at or beyond the general extractors use the rare-word threshold
    return size > maxentTagger.rareWordMinFeatureThresh;
  }

  /**
   * Hands the tagger's ambiguity classes, very-common-word threshold and tag set
   * to the tagger's dictionary via setAmbClasses. Called once by the training
   * constructor before any token is featurized.
   */
  private void initTemplatesNew() {
    this.maxentTagger.dict.setAmbClasses(
        this.maxentTagger.ambClasses,
        this.maxentTagger.veryCommonWordThresh,
        this.maxentTagger.tags);
  }


  /**
   * Collects feature keys for the general feature templates of one history.
   * Every general extractor is applied to the history; a result equal to "0"
   * means the template does not fire and is skipped. Otherwise the value is
   * paired either with every tag known to the tagger (when alltags is set) or
   * with the given tag only, and each pairing whose extractor precondition
   * accepts the tag is recorded through add.
   *
   * @param h The history to extract template values from
   * @param tag The tag observed with this history in the training data
   */
  private void addTemplatesNew(History h, String tag) {
    for (int featNum = 0; featNum < numFeatsGeneral; featNum++) {
      String value = maxentTagger.extractors.extract(featNum, h);
      if (value.equals(zeroSt)) {
        continue; // do not add the feature
      }

      if (!maxentTagger.alltags) {
        // only the observed tag
        FeatureKey key = new FeatureKey(featNum, value, tag);
        if (maxentTagger.extractors.get(featNum).precondition(tag)) {
          add(key);
        }
        continue;
      }

      // iterate over all tags in the tagger
      int tagCount = maxentTagger.numTags();
      for (int t = 0; t < tagCount; t++) {
        String candidate = maxentTagger.getTag(t);
        FeatureKey key = new FeatureKey(featNum, value, candidate);
        if (maxentTagger.extractors.get(featNum).precondition(candidate)) {
          add(key);
        }
      }
    }
  }


  /**
   * Collects feature keys for the rare-word feature templates of one history.
   * Nothing happens unless the tagger considers the history's current word rare.
   * Rare extractors are numbered after the general ones: rare extractor k yields
   * feature number numFeatsGeneral + k. As for the general templates, a value
   * equal to "0" is skipped, and the value is paired with every tag (alltags) or
   * with the given tag only, subject to the extractor's precondition.
   *
   * @param h The history to extract template values from
   * @param tag The tag observed with this history in the training data
   */
  private void addRareTemplatesNew(History h, String tag) {
    boolean rareWord = maxentTagger.isRare(ExtractorFrames.cWord.extract(h));
    if (!rareWord) {
      return;
    }

    final int rareCount = numFeatsAll - numFeatsGeneral;
    for (int rareIdx = 0; rareIdx < rareCount; rareIdx++) {
      final int featNum = numFeatsGeneral + rareIdx;
      String value = maxentTagger.extractorsRare.extract(rareIdx, h);
      if (value.equals(zeroSt)) {
        continue; // do not add the feature
      }

      if (!maxentTagger.alltags) {
        // only the observed tag
        FeatureKey key = new FeatureKey(featNum, value, tag);
        if (maxentTagger.extractorsRare.get(rareIdx).precondition(tag)) {
          add(key);
        }
        continue;
      }

      int tagCount = maxentTagger.numTags();
      for (int t = 0; t < tagCount; t++) {
        String candidate = maxentTagger.getTag(t);
        FeatureKey key = new FeatureKey(featNum, value, candidate);
        if (maxentTagger.extractorsRare.get(rareIdx).precondition(candidate)) {
          add(key);
        }
      }
    }
  }

  /**
   * Returns the table of training histories. The training constructor releases
   * the table's contents unless occurringTagsOnly or possibleTagsOnly is set.
   *
   * @return The HistoryTable owned by this object
   */
  HistoryTable getHistoryTable() {
    return this.tHistories;
  }


  /*
  public String getY(int index) {
    return maxentTagger.tags.getTag(vArray[index][1]);
  }
  */

  /*
  public static void main(String[] args) {
    int[] hPos = {0, 1, 2, -1, -2};
    boolean[] isTag = {false, false, false, true, true};
    maxentTagger.init();
    TaggerExperiments gophers = new TaggerExperiments("trainhuge.txt", null);
    //gophers.ptilde();
  }
  */

}