/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.suffix_array; import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import joshua.corpus.Phrase; import joshua.util.IntegerPair; /** * Represents all locations in a corpus where the most frequent * phrases are located. * * @author Lane Schwartz * @author Chris Callison-Burch * @version $LastChangedDate: 2010-02-01 14:37:27 -0600 (Mon, 01 Feb 2010) $ */ public class FrequentMatches { // // /** Logger for this class. */ // private static final Logger logger = // Logger.getLogger(FrequentMatches.class.getName()); // // /** // * Stores the frequency rank for each phrase. // * <p> // * For a given phrase p, this variable stores the value of // * n indicating that p is the <i>n</i>th most frequent phrase in // * the corpus. // * <p> // * The iteration order of this map should start with the // * most frequent phrase and end with the least frequent // * phrase stored in the map. // * <p> // * The key set for this map should be identical to the key // * set in the <code>FrequentPhrases.frequentPhrases</code> // * map. // */ // private final LinkedHashMap<Phrase,Short> ranks; // // /** // * Maximum number of phrases of which this object is aware. // * <p> // * This is <em>not</em> the number of times such phrases // * occur in the corpus. Rather, it is the number of unique // * phrase patterns of which this object is aware. // */ // private final short maxPhrases; // // /** // * List of collocation identifiers that have been added to // * this object. // * <p> // * The values for these identifiers are of the format // * returned by the <code>getKey</code> method. // * <p> // * This variable is temporary, and is set to null // * once histogramSort is performed. // * // * This variable is not serialized, // * and as such is marked transient. // */ // private transient int[] keys; // // /** // * For each of the n most frequent phrases, // * this array holds the starting index into // * the position1 and position2 arrays where // * the positions for that phrase are stored. // * <p> // * The length of this array is equal to <code>maxPhrases</code>. // */ // int[] bucketIndex; // // /** // * List of positions in a corpus where the first phrase in // * a collocation starts. // */ // final int[] position1; // // /** // * List of positions in a corpus where the second phrase // * in a collocation starts. // */ // final int[] position2; // // /** // * The number of collocations that have been added to this // * object. // * <p> // * This variable is not serialized, // * and as such is marked transient. // */ // transient int counter = 0; // // /** // * The minimum allowed span for a nonterminal gap. // */ // final short minNonterminalSpan; // // // /** // * Constructs an empty list of locations where collocations // * of frequent phrases are found in a corpus. // * // * @param ranks Map from phrase to frequency rank of the phrase. // * @param maxPhrases The maximum number of frequent phrases. // * @param capacity The total number of matches expected. // */ // FrequentMatches( // FrequentPhrases frequentPhrases, // int maxPhraseLength, // // int windowSize, // short minNonterminalSpan) { // // logger.fine("Calculating number of frequent collocations"); // int capacity = frequentPhrases.countCollocations(maxPhraseLength, windowSize, minNonterminalSpan); // logger.fine("Total collocations: " + capacity); // // this.ranks = frequentPhrases.getRanks(); // this.maxPhrases = frequentPhrases.getMaxPhrases(); // this.minNonterminalSpan = minNonterminalSpan; // // if (logger.isLoggable(Level.FINE)) logger.fine("Allocating " + ((int)(capacity*4 / 1024.0 / 1024.0)) + "MB for collocation keys"); // keys = new int[capacity]; // if (logger.isLoggable(Level.FINE)) logger.fine("Allocating " + ((int)(capacity*4 / 1024.0 / 1024.0)) + "MB for collocation position1"); // position1 = new int[capacity]; // if (logger.isLoggable(Level.FINE)) logger.fine("Allocating " + ((int)(capacity*4 / 1024.0 / 1024.0)) + "MB for collocation position2"); // position2 = new int[capacity]; // if (logger.isLoggable(Level.FINE)) logger.fine("Done allocating memory for collocations data"); // // } // // // public int getRank(Phrase phrase) { // return ranks.get(phrase); // } // // public boolean contains(Phrase phrase) { // return ranks.containsKey(phrase); // } // // /** // * Gets the starting position in the corpus // * of the <em>n</em>'th instance of the specified phrase. // * // * @param phrase // * @param phraseIndex // * @return // */ // int getStartPosition(Phrase phrase, int phraseIndex, int positionNumber) { // int rank = ranks.get(phrase); // int bucketStart = bucketIndex[rank]; // if (positionNumber==0) { // return position1[bucketStart+phraseIndex]; // } else if (positionNumber==1) { // return position2[bucketStart+phraseIndex]; // } else { // throw new ArrayIndexOutOfBoundsException(""); // } // } // // public int getMatchCount(Phrase phrase) { // if (ranks.containsKey(phrase)) { // // int rank = ranks.get(phrase); // // int start = bucketIndex[rank]; // int end = (rank+1 < maxPhrases) // ? bucketIndex[rank+1] // : position1.length; // // return end - start; // // } else { // return 0; // } // } // // /** // * Adds a collocated pair of phrases to this container, // * along with their respective positions in the corpus. // */ // void add(Phrase phrase1, Phrase phrase2, int position1, int position2) { // // // The second phrase must start after the first phrase ends, // // and there must be a minimum gap // // (minNonterminalSpan) between the phrases // if (position2 >= position1 + phrase1.size() + minNonterminalSpan) { // // int key = getKey(phrase1, phrase2); // // keys[counter] = key; // this.position1[counter] = position1; // this.position2[counter] = position2; // // counter++; // // } // } // // // // // // /** // * Returns an integer identifier for the collocation of // * <code>phrase1</code> with <code>phrase2</code>. // * <p> // * If <code>rank1</code> is the rank of <code>phrase1</code> // * and <code>rank2</code> is the rank of <code>phrase2</code>, // * the identifier returned by this method is defined to be // * <code>rank1*maxPhrases + rank2</code>. // * <p> // * As such, the range of possible values returned by this // * method will be </code>0</code> through // * <code>maxPhrases*maxPhrases-1</code>. // * // * @param phrase1 First phrase in a collocation. // * @param phrase2 Second phrase in a collocation. // * @return a unique integer identifier for the collocation. // */ // private int getKey(Phrase phrase1, Phrase phrase2) { // // short rank1 = ranks.get(phrase1); // short rank2 = ranks.get(phrase2); // // int rank = rank1*maxPhrases + rank2; // // return rank; // } // // public String toString() { // StringBuilder s = new StringBuilder(); // // int max = ranks.keySet().size(); // ArrayList<Phrase> frequentPhrases = new ArrayList<Phrase>(ranks.keySet()); //// System.out.println(ranks.keySet().size()); //// System.out.println(maxPhrases); // // for (short rank1=0; rank1<max; rank1+=1) { // for (short rank2=0; rank2<max; rank2+=1) { // int locationIndex = rank1*maxPhrases + rank2; // // s.append(frequentPhrases.get(rank1).toString()); // s.append(" X "); // s.append(frequentPhrases.get(rank2).toString()); // s.append('\t'); // s.append(locationIndex); // s.append('\n'); // } // } // // return s.toString(); // } // // void histogramSort() { // if (logger.isLoggable(Level.FINE)) logger.fine("Sorting collocations"); // this.bucketIndex = histogramSort(maxPhrases, keys, position1, position2); // // this.keys = null; // } // // /** // * Sorts match data using a specialization of bucket sort. // * // * @param maxPhrases // * @param keys // * @param position1 // * @param position2 // * @return Array containing the starting positions of each bucket // */ // private static int[] histogramSort(int maxPhrases, int[] keys, int[] position1, int[] position2) { // // int maxBuckets = maxPhrases*maxPhrases; // // logger.fine("Calculating histograms"); // int[] histogram = calculateHistogram(maxPhrases, keys, maxBuckets); // // if (logger.isLoggable(Level.FINEST)) logger.finest("Allocating memory for " + maxBuckets + " integers"); // int[] offsets = new int[maxBuckets]; // // logger.fine("Calculating offsets"); // for (int key=0, offset=0; key<maxBuckets; key++) { // // offsets[key] = 0; // // int value = histogram[key]; // histogram[key] = offset; // offset += value; // // } // // //// if (logger.isLoggable(Level.FINE)) logger.fine("Allocating temporary memory for keys: " + ((keys.length)*4/1024/1024) + "MB"); //// int[] tmpKeys = new int[keys.length]; // if (logger.isLoggable(Level.FINE)) logger.fine("Allocating temporary memory for position1: " + ((keys.length)*4/1024/1024) + "MB"); // int[] tmpPosition1 = new int[keys.length]; // if (logger.isLoggable(Level.FINE)) logger.fine("Allocating temporary memory for position2: " + ((keys.length)*4/1024/1024) + "MB"); // int[] tmpPosition2 = new int[keys.length]; // // if (logger.isLoggable(Level.FINE)) logger.fine("Placing data into buckets"); // for (int i=0, n=keys.length; i < n; i++) { // // int key = keys[i]; // int location = histogram[key] + offsets[key]; // offsets[key] += 1; // //// tmpKeys[location] = key; // tmpPosition1[location] = position1[i]; // tmpPosition2[location] = position2[i]; // // } // //// logger.fine("Copying sorted keys to final location"); //// System.arraycopy(tmpKeys, 0, keys, 0, keys.length); // // logger.fine("Copying sorted position1 data to final location"); // System.arraycopy(tmpPosition1, 0, position1, 0, keys.length); // // logger.fine("Copying sorted position1 data to final location"); // System.arraycopy(tmpPosition2, 0, position2, 0, keys.length); // // // Try and help the garbage collector know we're done with these // offsets = null; //// tmpKeys = null; // tmpPosition1 = null; // tmpPosition2 = null; // // return histogram; // // } // // // /** // * Calculate how many times each key occurred. // * // * @param maxPhrases // * @param keys // * @param maxBuckets // * @return Histogram indicating how many times each key occurred // */ // private static int[] calculateHistogram(int maxPhrases, int[] keys, int maxBuckets) { // // int[] histogram = new int[maxBuckets]; // Arrays.fill(histogram, 0); // // for (int key : keys) { // // histogram[key] += 1; // // } // // if (logger.isLoggable(Level.FINE)) { // // int max = -1; // int maxKey = -1; // // int least = Integer.MAX_VALUE; // int leastKey = Integer.MAX_VALUE; // // for (int key=0; key<maxBuckets; key++) { // if (histogram[key] > max) { // max = histogram[key]; // maxKey = key; // } // if (histogram[key] < least) { // least = histogram[key]; // leastKey = key; // } // } // // if (maxKey != -1) { // int a = maxKey / maxPhrases; // int b = maxKey % maxPhrases; // // logger.fine("Most frequent collocation is key " + maxKey + " a=="+a+ " b=="+b+" (" +max+") times"); // } else { // logger.fine("Most frequent collocation not found"); // } // // if (leastKey != Integer.MAX_VALUE) { // int c = leastKey / maxPhrases; // int d = leastKey % maxPhrases; // logger.fine("Least frequent collocation is key " + leastKey + " c=="+c+ " d=="+d+" (" +least+") times"); // } else { // logger.fine("Least frequent collocation not found"); // } // } // // return histogram; // } // // /** // * Not supported; throws an UnsupportedOperationException. // * // * @param in // * @throws UnsupportedOperationException // */ // public void readExternal(ObjectInput in) throws IOException, // ClassNotFoundException { // // throw new UnsupportedOperationException(); // // } // // /** // * Write the contents of this class as binary data to an // * output stream. // * // * @param out // */ // public void writeExternal(ObjectOutput out) throws IOException { // // TODO Auto-generated method stub // // } }