// TaggerExperiments -- StanfordMaxEnt, A Maximum Entropy Toolkit
// Copyright (c) 2002-2008 Leland Stanford Junior University
//
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    Support/Questions: java-nlp-user@lists.stanford.edu
//    Licensing: java-nlp-support@lists.stanford.edu
//    http://www-nlp.stanford.edu/software/tagger.shtml

package edu.stanford.nlp.tagger.maxent;

import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.maxent.Experiments;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Map;
import java.util.Set;
import java.util.Arrays;


/**
 * This class represents the training samples. It can return statistics of
 * them, for example the frequency of each x or y in the training data.
 *
 * @author Kristina Toutanova
 * @version 1.0
 */
public class TaggerExperiments extends Experiments {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TaggerExperiments.class);

  // NOTE(review): this is enabled, so getFeaturesNew() prints one stderr line per
  // (feature key, history) pair whenever possibleTagsOnly is set -- confirm that is intended.
  private static final boolean DEBUG = true;

  /** Extractor output that means "no feature value here"; such values are never turned into keys. */
  private static final String zeroSt = "0";

  private final TaggerFeatures feats;
  private final Set<FeatureKey> sTemplates = Generics.newHashSet();
  private final HistoryTable tHistories = new HistoryTable();

  /** Number of general extractors; feature numbers below this belong to {@code maxentTagger.extractors}. */
  private final int numFeatsGeneral;
  /** Number of general plus rare-word extractors. */
  private final int numFeatsAll;

  private final MaxentTagger maxentTagger;
  private final TemplateHash tFeature;

  /** Per (history x, tag y) count of features added by {@link #getFeaturesNew()}; stored as bytes. */
  private byte[][] fnumArr;


  /**
   * Sets up the bookkeeping structures without reading any training data.
   * Called directly by unit tests, and chained to by the training constructor.
   *
   * @param maxentTagger The tagger whose extractors define the feature numbering
   */
  TaggerExperiments(MaxentTagger maxentTagger) {
    this.maxentTagger = maxentTagger;
    this.tFeature = new TemplateHash(maxentTagger);
    numFeatsGeneral = maxentTagger.extractors.size();
    numFeatsAll = numFeatsGeneral + maxentTagger.extractorsRare.size();
    feats = new TaggerFeatures(this);
  }

  /**
   * This method gets feature statistics from a training file found in the TaggerConfig.
   * It is the start of the training process: it reads the tagged data, registers every
   * history and candidate feature key, records the (history, tag) index pair of each
   * token in {@code vArray}, calls {@code ptilde()}, hashes the histories and finally
   * builds the features via {@link #getFeaturesNew()}.
   *
   * @param config The configuration handed to {@code ReadDataTagged}
   * @param maxentTagger The tagger being trained; its {@code xSize} and {@code ySize} are set here
   * @throws IOException Declared by this constructor; presumably raised while reading the training data
   */
  protected TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger) throws IOException {
    this(maxentTagger);

    log.info("TaggerExperiments: adding word/tags");
    PairsHolder pairs = new PairsHolder();
    ReadDataTagged c = new ReadDataTagged(config, maxentTagger, pairs);
    vArray = new int[c.getSize()][2];

    initTemplatesNew();
    log.info("Featurizing tagged data tokens...");
    for (int i = 0, size = c.getSize(); i < size; i++) {
      DataWordTag d = c.get(i);
      String yS = d.getY();
      History h = d.getHistory();
      int indX = tHistories.add(h);
      int indY = d.getYInd();
      addTemplatesNew(h, yS);
      addRareTemplatesNew(h, yS);
      vArray[i][0] = indX;
      vArray[i][1] = indY;
    }
    log.info("Featurized " + c.getSize() + " data tokens [done].");
    c.release();
    ptilde();
    maxentTagger.xSize = xSize;
    maxentTagger.ySize = ySize;
    log.info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);

    hashHistories();

    // if we'll look at occurring tags only, we need the histories and pairs still
    if (!maxentTagger.occurringTagsOnly && !maxentTagger.possibleTagsOnly) {
      tHistories.release();
      pairs.clear();
    }

    getFeaturesNew();
  }


  /** Returns the feature collection that {@link #getFeaturesNew()} fills in. */
  public TaggerFeatures getTaggerFeatures() {
    return feats;
  }


  /** Adds a FeatureKey to the set of known FeatureKeys.
   *
   * @param s The feature key to be added
   * @return Whether the key was already known (false) or added (true)
   */
  protected boolean add(FeatureKey s) {
    // Set.add already reports whether the element was newly inserted.
    return sTemplates.add(s);
  }


  /** Returns the per (history, tag) feature-count table; null until {@link #getFeaturesNew()} has run. */
  byte[][] getFnumArr() {
    return fnumArr;
  }


  /**
   * Builds a {@code TaggerFeature} for every sufficiently supported feature key, fills
   * {@code fnumArr}, {@code feats.xIndexed} and {@code maxentTagger.fAssociations}, and
   * logs summary statistics.
   * <p>
   * This method uses and deletes a file tempXXXXXX.x in the current directory! The file
   * buffers the history indices of all features until their total number is known. It is
   * closed and deleted even when feature construction fails.
   *
   * @throws RuntimeIOException Wrapping any exception raised while building the features
   */
  private void getFeaturesNew() {
    // todo: can fnumArr overflow? Counts are bytes, so more than 127 increments of one cell would wrap.
    try {
      log.info("TaggerExperiments.getFeaturesNew: initializing fnumArr.");
      fnumArr = new byte[xSize][ySize]; // what is the maximum number of active features

      File hFile = File.createTempFile("temp", ".x", new File("./"));
      try (RandomAccessFile hF = new RandomAccessFile(hFile, "rw")) {
        log.info(" length of sTemplates keys: " + sTemplates.size());
        log.info("getFeaturesNew adding features ...");
        int current = 0;   // number of ints written to the temp file so far
        int numFeats = 0;  // number of populated features created so far
        final boolean VERBOSE = false;

        for (FeatureKey fK : sTemplates) {
          int numF = fK.num;
          Pair<Integer, String> wT = new Pair<>(numF, fK.val);
          int[] xValues = tFeature.getXValues(wT);
          if (xValues == null) {
            log.info(" xValues is null: " + fK);
            continue;
          }

          // First pass: sum px over the permitted histories to decide whether the key is populated.
          int numEvidence = 0;
          int y = maxentTagger.tags.getIndex(fK.tag);
          for (int xValue : xValues) {
            if (isTagPermitted(xValue, fK.tag, DEBUG)) {
              numEvidence += this.px[xValue];
            }
          }
          if ( ! populated(numF, numEvidence)) {
            continue;
          }

          int[] positions = tFeature.getPositions(fK);
          if (maxentTagger.occurringTagsOnly || maxentTagger.possibleTagsOnly) {
            // TODO: with tag filtering the supporting histories differ per tag, so always write afresh
            positions = null;
          }

          if (positions == null) {
            // write this in the file and create a TaggerFeature for it
            int numElements = 0;
            for (int x : xValues) {
              if ( ! isTagPermitted(x, fK.tag, false)) {
                continue;
              }
              numElements++;
              hF.writeInt(x);
              fnumArr[x][y]++;
            }
            TaggerFeature tF = new TaggerFeature(current, current + numElements - 1, fK,
                                                 maxentTagger.getTagIndex(fK.tag), this);
            tFeature.addPositions(current, current + numElements - 1, fK);
            current = current + numElements;
            feats.add(tF);
            if (VERBOSE) {
              log.info(" added feature with key " + fK + " has support " + numElements);
            }
          } else {
            // the values were already written for another key; reuse the recorded range
            for (int x : xValues) {
              fnumArr[x][y]++;
            }
            TaggerFeature tF = new TaggerFeature(positions[0], positions[1], fK,
                                                 maxentTagger.getTagIndex(fK.tag), this);
            feats.add(tF);
            if (VERBOSE) {
              log.info(" added feature with key " + fK + " has support " + xValues.length);
            }
          }

          recordAssociation(fK, y, numFeats);
          numFeats++;
        } // foreach FeatureKey fK

        // read out the file and put everything in an array of ints stored in Feats
        tFeature.release();
        feats.xIndexed = new int[current];
        hF.seek(0);
        for (int i = 0; i < current; i++) {
          feats.xIndexed[i] = hF.readInt();
        }
        log.info(" total feats: " + sTemplates.size() + ", populated: " + numFeats);
      } finally {
        // The try-with-resources above has already closed the file at this point.
        if ( ! hFile.delete()) {
          hFile.deleteOnExit();
        }
      }

      logFnumArrStatistics();
      log.info("end getFeaturesNew.");
    } catch (Exception e) {
      throw new RuntimeIOException(e);
    }
  }


  /**
   * Decides whether history {@code x} may support a feature for {@code tag} under the
   * tagger's {@code occurringTagsOnly} / {@code possibleTagsOnly} restrictions.
   * When neither restriction is active this returns true without touching
   * {@code tHistories}, which the training constructor releases in that configuration.
   *
   * @param x Index of the history in {@code tHistories}
   * @param tag The tag of the feature key under consideration
   * @param trace Whether to print the possible tags of the word to stderr
   * @return true if the history should be counted for this tag
   */
  private boolean isTagPermitted(int x, String tag, boolean trace) {
    if ( ! maxentTagger.occurringTagsOnly && ! maxentTagger.possibleTagsOnly) {
      return true;
    }
    String word = ExtractorFrames.cWord.extract(tHistories.getHistory(x));
    // check whether the current word in x has occurred with the tag
    if (maxentTagger.occurringTagsOnly && maxentTagger.dict.getCount(word, tag) == 0) {
      return false;
    }
    if (maxentTagger.possibleTagsOnly) {
      String[] tags = maxentTagger.dict.getTags(word);
      Set<String> s = Generics.newHashSet(Arrays.asList(maxentTagger.tags.deterministicallyExpandTags(tags)));
      if (trace) {
        System.err.printf("possible tags for %s: %s\n", word, Arrays.toString(s.toArray()));
      }
      return s.contains(tag);
    }
    return true;
  }


  /**
   * Records in {@code maxentTagger.fAssociations} that the feature with number
   * {@code fK.num} and value {@code fK.val} maps tag index {@code tagIndex} to feature
   * index {@code featureIndex}. Missing per-extractor maps are created on demand, and a
   * new per-value array starts out with -1 in every slot.
   */
  private void recordAssociation(FeatureKey fK, int tagIndex, int featureIndex) {
    for (int i = maxentTagger.fAssociations.size(); i <= fK.num; ++i) {
      maxentTagger.fAssociations.add(Generics.<String, int[]>newHashMap());
    }
    Map<String, int[]> fValueAssociations = maxentTagger.fAssociations.get(fK.num);
    int[] fTagAssociations = fValueAssociations.get(fK.val);
    if (fTagAssociations == null) {
      fTagAssociations = new int[ySize];
      Arrays.fill(fTagAssociations, -1);
      fValueAssociations.put(fK.val, fTagAssociations);
    }
    fTagAssociations[tagIndex] = featureIndex;
  }


  /** Logs how densely {@code fnumArr} is populated: what is the maximum number of active features per pair. */
  private void logFnumArrStatistics() {
    int max = 0;
    int maxGt = 0;
    long numZeros = 0;  // long, so the pair arithmetic below cannot overflow an int
    for (int x = 0; x < xSize; x++) {
      int numGt = 0;
      for (int y = 0; y < ySize; y++) {
        if (fnumArr[x][y] > 0) {
          numGt++;
          if (max < fnumArr[x][y]) {
            max = fnumArr[x][y];
          }
        } else {
          numZeros++;
        }
      }
      if (maxGt < numGt) {
        maxGt = numGt;
      }
    } // for x

    log.info(" Max features per x,y pair: " + max);
    log.info(" Max non-zero y values for an x: " + maxGt);
    log.info(" Number of non-zero feature x,y pairs: " + ((long) xSize * ySize - numZeros));
    log.info(" Number of zero feature x,y pairs: " + numZeros);
  }


  /**
   * Registers every history with {@code tFeature} in two passes: first
   * {@code addPrev} for all histories, then {@code add} for all histories.
   * Histories whose current word is rare use the rare-word extractors as well.
   */
  private void hashHistories() {
    int fAll = maxentTagger.extractors.size() + maxentTagger.extractorsRare.size();
    int fGeneral = maxentTagger.extractors.size();

    log.info("Hashing histories ...");
    for (int x = 0; x < xSize; x++) {
      History h = tHistories.getHistory(x);
      int fSize = (maxentTagger.isRare(ExtractorFrames.cWord.extract(h)) ? fAll : fGeneral);
      for (int i = 0; i < fSize; i++) {
        tFeature.addPrev(i, h);
      }
    } // for x

    // now for the populated ones
    log.info("Hashed " + xSize + " histories.");
    log.info("Hashing populated histories ...");
    for (int x = 0; x < xSize; x++) {
      History h = tHistories.getHistory(x);
      int fSize = (maxentTagger.isRare(ExtractorFrames.cWord.extract(h)) ? fAll : fGeneral);
      for (int i = 0; i < fSize; i++) {
        tFeature.add(i, h, x); // write this to check whether to add
      }
    } // for x
    log.info("Hashed populated histories.");
  }


  /**
   * Instance convenience for {@link #isPopulated(int, int, MaxentTagger)} using this object's tagger.
   *
   * @param fNo The feature (extractor) number
   * @param size The amount of evidence observed for the feature
   * @return Whether the feature has enough support to be kept
   */
  protected boolean populated(int fNo, int size) {
    return isPopulated(fNo, size, maxentTagger);
  }

  /**
   * Tests whether a feature has strictly more evidence than the threshold that applies to it.
   *
   * @param fNo The feature (extractor) number
   * @param size The amount of evidence observed for the feature
   * @param maxentTagger The tagger supplying the thresholds and the number of general extractors
   * @return true if {@code size} exceeds the applicable threshold
   */
  protected static boolean isPopulated(int fNo, int size, MaxentTagger maxentTagger) {
    // Feature number 0 is hard-coded as the current word feature, which has a special threshold
    if (fNo == 0) {
      return (size > maxentTagger.curWordMinFeatureThresh);
    } else if (fNo < maxentTagger.extractors.size()) {
      return (size > maxentTagger.minFeatureThresh);
    } else {
      return (size > maxentTagger.rareWordMinFeatureThresh);
    }
  }


  /** Asks the tagger's dictionary to set up its ambiguity classes before featurization starts. */
  private void initTemplatesNew() {
    maxentTagger.dict.setAmbClasses(maxentTagger.ambClasses, maxentTagger.veryCommonWordThresh, maxentTagger.tags);
  }


  /**
   * Adds feature keys for the general extractors to the set of feature templates.
   * With {@code maxentTagger.alltags} a key is added for every tag that passes the
   * extractor's precondition; otherwise only for the observed tag.
   *
   * @param h The history to extract feature values from
   * @param tag The tag observed with this history
   */
  private void addTemplatesNew(History h, String tag) {
    // Feature templates general
    for (int i = 0; i < numFeatsGeneral; i++) {
      String s = maxentTagger.extractors.extract(i, h);
      if (s.equals(zeroSt)) {
        continue; // do not add the feature
      }
      if (maxentTagger.alltags) {
        // iterate over tags in dictionary
        int numTags = maxentTagger.numTags();
        for (int j = 0; j < numTags; j++) {
          String tag1 = maxentTagger.getTag(j);
          if (maxentTagger.extractors.get(i).precondition(tag1)) {
            add(new FeatureKey(i, s, tag1));
          }
        }
      } else if (maxentTagger.extractors.get(i).precondition(tag)) {
        // only this tag
        add(new FeatureKey(i, s, tag));
      }
    }
  }


  /**
   * Adds feature keys for the rare-word extractors, but only if the current word of
   * the history is rare. Rare features are numbered from {@code numFeatsGeneral} upwards.
   *
   * @param h The history to extract feature values from
   * @param tag The tag observed with this history
   */
  private void addRareTemplatesNew(History h, String tag) {
    // Feature templates rare
    if ( ! maxentTagger.isRare(ExtractorFrames.cWord.extract(h))) {
      return;
    }
    int start = numFeatsGeneral;
    for (int i = start; i < numFeatsAll; i++) {
      String s = maxentTagger.extractorsRare.extract(i - start, h);
      if (s.equals(zeroSt)) {
        continue; // do not add the feature
      }
      if (maxentTagger.alltags) {
        int numTags = maxentTagger.numTags();
        for (int j = 0; j < numTags; j++) {
          String tag1 = maxentTagger.getTag(j);
          if (maxentTagger.extractorsRare.get(i - start).precondition(tag1)) {
            add(new FeatureKey(i, s, tag1));
          }
        }
      } else if (maxentTagger.extractorsRare.get(i - start).precondition(tag)) {
        // only this tag
        add(new FeatureKey(i, s, tag));
      }
    }
  }


  /** Returns the table of histories; its contents may already have been released after training setup. */
  HistoryTable getHistoryTable() {
    return tHistories;
  }

}