FrequentPhrases.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.suffix_array;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedSet;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.ContiguousPhrase;
import joshua.corpus.Corpus;
import joshua.corpus.Phrase;
import joshua.corpus.mm.MemoryMappedCorpusArray;
import joshua.corpus.suffix_array.mm.MemoryMappedSuffixArray;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Cache;
import joshua.util.Counted;
import joshua.util.io.BinaryIn;

/**
 * Represents the most frequent phrases in a corpus.
 * 
 * @author Chris Callison-Burch
 * @author Lane Schwartz
 */
public class FrequentPhrases {

	/** Logger for this class. */
	private static final Logger logger = 
		Logger.getLogger(FrequentPhrases.class.getName());
	
	/** Suffix array in which frequent phrases are located. */
	final Suffixes suffixes;
	
	/** 
	 * Stores the number of times a phrase occurred in the
	 * corpus.
	 * <p>
	 * The iteration order of this map should start with the
	 * most frequent phrase and end with the least frequent
	 * phrase stored in the map.
	 * <p>
	 * The key set for this map is identical to the key set of
	 * the map returned by <code>getRanks()</code>.
	 */
	LinkedHashMap<Phrase,Integer> frequentPhrases;
	
	/** Maximum number of phrases of which this object is aware. */
	short maxPhrases;
	
	/** Maximum phrase length to consider. */
	int maxPhraseLength;
	
	/** 
	 * Minimum number of words in a corpus 
	 * which a nonterminal may represent. 
	 */
	int minNonterminalSpan;
	
	/**
	 * Maximum span (from first token to last token) in a corpus
	 * which a phrase may represent.
	 */
	int maxPhraseSpan;
	
	/** Stores sorted lists of corpus locations for most frequent phrases. */
	Map<Phrase,InvertedIndex> invertedIndices;
	
	/** 
	 * Stores the corpus locations of collocations 
	 * of frequent phrases with other frequent phrases.
	 */
	List<HierarchicalPhrases> frequentCollocations;
	
	/**
	 * Constructs data regarding the frequencies of the <em>n</em>
	 * most frequent phrases found in the corpus backed by the
	 * provided suffix array.
	 * 
	 * @param suffixes   Suffix array corresponding to a corpus.
	 * @param minFrequency The minimum frequency required to
	 *                   for a phrase to be considered frequent.
	 * @param maxPhrases The number of phrases to consider.
	 * @param maxPhraseLength Maximum phrase length to consider.
	 * @param maxContiguousPhraseLength Maximum phrase length to consider for a contiguous phrase
	 * @param maxPhraseSpan Maximum span (from first token to last token) in a corpus
	 *                      which a phrase may represent.
	 * @param minNonterminalSpan Minimum number of words in a corpus 
	 *                           which a nonterminal may represent.
	 */
	public FrequentPhrases(
			Suffixes suffixes,
			int minFrequency,
			short maxPhrases,
			int maxPhraseLength,
			int maxContiguousPhraseLength,
			int maxPhraseSpan, int minNonterminalSpan) {
		
		this.maxPhrases = maxPhrases;
		this.maxPhraseLength = maxPhraseLength;
		this.minNonterminalSpan = minNonterminalSpan;
		this.maxPhraseSpan = maxPhraseSpan;
		
		this.suffixes = suffixes;
		this.frequentPhrases = getMostFrequentPhrases(suffixes, minFrequency, maxPhrases, maxContiguousPhraseLength);
		this.invertedIndices = calculateInvertedIndices();
		this.frequentCollocations = countCollocations();
	}
	
	/**
	 * Constructs frequent-phrase data by reading a binary file
	 * instead of computing the data from the corpus.
	 * 
	 * @param suffixes Suffix array corresponding to a corpus.
	 * @param binaryFilename Name of the binary file, handed to
	 *                   <code>BinaryIn</code>.
	 * @throws IOException propagated from opening or reading the
	 *                   binary input
	 * @throws ClassNotFoundException propagated from
	 *                   <code>readExternal</code>
	 */
	public FrequentPhrases(Suffixes suffixes, String binaryFilename) throws IOException, ClassNotFoundException {
		this.suffixes = suffixes;
		// NOTE(review): the reader is never explicitly closed in this constructor;
		// confirm whether BinaryIn holds a resource that needs closing.
		BinaryIn<InvertedIndex> in = new BinaryIn<InvertedIndex>(binaryFilename, InvertedIndex.class);
		// Fills in fields such as maxPhrases, maxPhraseLength, frequentPhrases,
		// invertedIndices and frequentCollocations from the stream.
		this.readExternal(in);
	}
//
//	public short getMaxPhrases() {
//		return this.maxPhrases;
//	}
	
	/**
	 * Gets the suffix array in which the frequent phrases are located.
	 * 
	 * @return the suffix array this object was constructed with
	 */
	Suffixes getSuffixes() {
		return this.suffixes;
	}
	
//	/**
//	 * This method performs a one-pass computation of the
//	 * collocation of two frequent subphrases. It is used for
//	 * the precalculation of the translations of hierarchical
//	 * phrases which are problematic to calculate on the fly.
//	 * This procedure is described in "Hierarchical Phrase-Based
//	 * Translation with Suffix Arrays" by Adam Lopez.
//	 *
//	 * @param maxPhraseLength the maximum length of any phrase
//	 *                   in the phrases
//	 * @param windowSize the maximum allowable space between
//	 *                   phrases for them to still be considered
//	 *                   collocated
//	 * @param minNonterminalSpan Minimum span allowed for a nonterminal 
//	 */
//	public FrequentMatches getCollocations(
//			int maxPhraseLength,
//			int windowSize,
//			short minNonterminalSpan
//	) {
//	
////		FrequentMatches collocations = new FrequentMatches(this, maxPhraseLength, windowSize, minNonterminalSpan);
////		
////		countCollocations(maxPhraseLength, windowSize, minNonterminalSpan);
////		
////		collocations.histogramSort();
////		
////		return collocations;
//		
//		throw new RuntimeException("Not currently supported");
//		
//	}


	/**
	 * Gets the precomputed collocations of frequent phrases 
	 * with other frequent phrases.
	 * <p>
	 * Precomputing these collocations follows the procedure
	 * described in "Hierarchical Phrase-Based Translation with
	 * Suffix Arrays" by Adam Lopez; it covers hierarchical
	 * phrases which are problematic to calculate on the fly.
	 * <p>
	 * The list held by this object is returned directly;
	 * no copy is made.
	 * 
	 * @return the corpus locations of collocations of frequent
	 *         phrases with other frequent phrases
	 */
	protected List<HierarchicalPhrases> getFrequentCollocations() {
		return frequentCollocations;
	}
	
	
	/**
	 * Gets the hierarchical phrases that represent 
	 * the collocations of one frequent phrase with 
	 * another frequent phrase.
	 * <p>        
	 * This method performs a one-pass computation of the
	 * collocation of two frequent sub-phrases. It is used for
	 * the precalculation of the translations of hierarchical
	 * phrases which are problematic to calculate on the fly.
	 * <p>
	 * While scanning the corpus, every frequent phrase that starts
	 * at the current position is appended to a window (a queue of
	 * phrases plus a parallel queue of their start positions).
	 * Phrases are dequeued, and their collocations with the phrases
	 * still in the window recorded, when the end-of-sentence marker
	 * is reached, when the oldest phrase started at least
	 * <code>maxPhraseSpan</code> positions ago, or when the last
	 * corpus position is reached.
	 * <p>
	 * This procedure is described in "Hierarchical Phrase-Based
	 * Translation with Suffix Arrays" by Adam Lopez.
	 * 
	 * @return The hierarchical phrases produced from the recorded
	 *         collocations of frequent phrases with other
	 *         frequent phrases.
	 */
	private List<HierarchicalPhrases> countCollocations() {
		
		PhrasePairCollocations collocations = new PhrasePairCollocations(suffixes.getCorpus());

		// Parallel queues: phrasesInWindow.get(k) starts at positions.get(k).
		LinkedList<Phrase> phrasesInWindow = new LinkedList<Phrase>();
		LinkedList<Integer> positions = new LinkedList<Integer>();
		// NOTE(review): the marker is initialised from getSentencePosition(1) here,
		// but later updates use getSentencePosition(n)-1, and calculateInvertedIndices
		// tests currentPosition+1 against its marker instead. Confirm that the first
		// sentence boundary is handled as intended.
		int sentenceNumber = 1;
		int endOfSentence = suffixes.getSentencePosition(sentenceNumber);

		if (logger.isLoggable(Level.FINEST)) logger.finest("END OF SENT: " + endOfSentence);

		Corpus corpus = suffixes.getCorpus();
		int endOfCorpus = corpus.size();
		
		// Start at the beginning of the corpus...
		for (int currentPosition : corpus.corpusPositions()) {
					
			// Start with a phrase length of 1, at the current position...
			for (int i = 1, endOfPhrase = currentPosition + i; 
					// ...ensure the phrase length isn't too long...
					i <= maxPhraseLength  &&  
					// ...and that the phrase doesn't extend past the end of the sentence...
					endOfPhrase <= endOfSentence  &&  
					// ...or past the end of the corpus
					endOfPhrase <= endOfCorpus; 
					// ...then increment the phrase length and end of phrase marker.
					i++, endOfPhrase = currentPosition + i) {

				
				// Get the current phrase
				Phrase phrase = new ContiguousPhrase(currentPosition, endOfPhrase, corpus);

				if (logger.isLoggable(Level.FINEST)) logger.finest("Found phrase (" +currentPosition + ","+endOfPhrase+") "  + phrase);

				// If the phrase is one we care about...
				if (frequentPhrases.containsKey(phrase)) {

					if (logger.isLoggable(Level.FINER)) logger.finer("\"" + phrase + "\" found at currentPosition " + currentPosition);

					// Remember the phrase...
					phrasesInWindow.add(phrase);

					// ...and its starting position
					positions.add(currentPosition);
				}

			} // end iterating over various phrase lengths


			// check whether we're at the end of the sentence and dequeue...
			if (currentPosition == endOfSentence) {

				if (logger.isLoggable(Level.FINEST)) {
					logger.finest("REACHED END OF SENT: " + currentPosition);
					logger.finest("PHRASES:   " + phrasesInWindow);
					logger.finest("POSITIONS: " + positions);
				}

				// empty the whole queue; each call removes exactly one
				// phrase (the oldest), so this loop terminates
				while (! phrasesInWindow.isEmpty()) {

					processPhraseWindow(collocations, phrasesInWindow, positions);

				}
				// clear the queues (the phrase queue is already empty at this point)
				phrasesInWindow.clear();
				positions.clear();

				// update the end of sentence marker
				sentenceNumber++;
				endOfSentence = suffixes.getSentencePosition(sentenceNumber)-1;

				if (logger.isLoggable(Level.FINER)) logger.finer("END OF SENT: " + sentenceNumber + " at position " + endOfSentence);

			} // Done processing end of sentence.


			// check whether the initial elements are
			// outside the window size...
			if (! phrasesInWindow.isEmpty()) {
				int position1 = positions.get(0);
				// dequeue the first element and
				// calculate its collocations...
				// (at the last corpus position the whole window is drained,
				// regardless of how far back the oldest phrase started)
				while (! phrasesInWindow.isEmpty() &&
						((currentPosition+1==endOfCorpus) || 
								(currentPosition-position1 >= maxPhraseSpan))) {

					processPhraseWindow(collocations, phrasesInWindow, positions);
					
					// Re-read the start of the new oldest phrase; when the window
					// is empty the loop condition ends the loop anyway.
					if (phrasesInWindow.size() > 0) {
						position1 = positions.getFirst();
					} else {
						position1 = currentPosition;
					}
				}
			}

		} // end iterating over positions in the corpus

		return collocations.getHierarchicalPhrases();
	}

	/**
	 * Removes the oldest phrase from the window and records its
	 * collocation with each phrase that remains in the window,
	 * provided the pair satisfies the span constraints.
	 * <p>
	 * A pair is recorded only when the gap between the end of the
	 * removed phrase and the start of the other phrase is at least
	 * <code>minNonterminalSpan</code>, and the distance from the
	 * start of the removed phrase to the end of the other phrase
	 * is at most <code>maxPhraseSpan</code>.
	 * 
	 * @param collocations Receives each accepted pair of phrases
	 *                     together with their start positions
	 * @param phrasesInWindow Queue of phrases currently in the
	 *                     window; its first element is removed
	 * @param positions    Start positions parallel to
	 *                     <code>phrasesInWindow</code>; its first
	 *                     element is removed
	 */
	private void processPhraseWindow(PhrasePairCollocations collocations,
			LinkedList<Phrase> phrasesInWindow,
			LinkedList<Integer> positions) {
		
		// Dequeue the oldest phrase together with its start position.
		final Phrase head = phrasesInWindow.removeFirst();
		final int headStart = positions.removeFirst();
		final int headEnd = headStart + head.size();

		// Walk the two parallel queues in lock step.
		final Iterator<Phrase> others = phrasesInWindow.iterator();
		final Iterator<Integer> otherStarts = positions.iterator();
		
		while (others.hasNext() && otherStarts.hasNext()) {
		
			final Phrase other = others.next();
			final int otherStart = otherStarts.next();
			final int otherEnd = otherStart + other.size();
			
			final boolean gapWideEnough = (otherStart - headEnd) >= minNonterminalSpan;
			final boolean spanSmallEnough = (otherEnd - headStart) <= maxPhraseSpan;
			
			if (gapWideEnough && spanSmallEnough) {
				if (logger.isLoggable(Level.FINEST)) {
					logger.finest("    Recording collocation: " + head + "\t" + other + "\t" + headStart + "\t" + otherStart);
				}
				collocations.record(head, other, headStart, otherStart);
				continue;
			}
			
			if (logger.isLoggable(Level.FINEST)) {
				logger.finest("Not recording collocation: "+ head + "\t" + other + "\t" + headStart + "\t" + otherStart);
			}

		}
	}


//	/**
//	 * Returns an integer identifier for the collocation of
//	 * <code>phrase1</code> with <code>phrase2</code>.
//	 * <p>
//	 * If <code>rank1</code> is the rank of <code>phrase1</code>
//	 * and <code>rank2</code> is the rank of <code>phrase2</code>,
//	 * the identifier returned by this method is defined to be
//	 * <code>rank1*maxPhrases + rank2</code>.
//	 * <p>
//	 * As such, the range of possible values returned by this
//	 * method will be </code>0</code> through
//	 * <code>maxPhrases*maxPhrases-1</code>.
//	 *
//	 * @param phrase1 First phrase in a collocation.
//	 * @param phrase2 Second phrase in a collocation.
//	 * @return a unique integer identifier for the collocation.
//	 */
//	private int getKey(LinkedHashMap<Phrase,Short> ranks, Phrase phrase1, Phrase phrase2) {
//
//		short rank1 = ranks.get(phrase1);
//		short rank2 = ranks.get(phrase2);
//
//		int rank = rank1*maxPhrases + rank2;
//
//		return rank;
//	}
	

	//	/**
	//	 * Builds a HashMap of all the occurrences of the phrase,
	//	 * keying them based on the index of the sentence that they
	//	 * occur in. Since we iterate over all occurrences of the
	//	 * phrase, this method is linear with respect to the number
	//	 * of occurrences, and should not be used for very frequent
	//	 * phrases. This is part of the baseline method described
	//	 * in Section 4.1 of Adam Lopez's EMNLP paper.
	//	 */
	//	public HashMap<Integer,HashSet<Integer>> keyPositionsWithSentenceNumber(Phrase phrase) {
	//		// keys are the sentence numbers of partial matches
	//		HashMap<Integer,HashSet<Integer>> positionsKeyedWithSentenceNumber = new HashMap<Integer,HashSet<Integer>>(suffixes.size());
	//		int[] bounds = suffixes.findPhrase(phrase);
	//		if (bounds == null) return positionsKeyedWithSentenceNumber;
	//		
	//		int[] positions = suffixes.getAllPositions(bounds);
	//		for (int i = 0; i < positions.length; i++) {
	//			int sentenceNumber = suffixes.getSentenceIndex(positions[i]);
	//			HashSet<Integer> positionsInSentence = positionsKeyedWithSentenceNumber.get(sentenceNumber);
	//			if (positionsInSentence == null) {
	//				positionsInSentence = new HashSet<Integer>();
	//			}
	//			positionsInSentence.add(positions[i]);
	//			positionsKeyedWithSentenceNumber.put(sentenceNumber, positionsInSentence);
	//		}
	//		return positionsKeyedWithSentenceNumber;
	//	}

	//===============================================================
	// Protected 
	//===============================================================

	//===============================================================
	// Methods
	//===============================================================

	/**
	 * Calculates the frequency ranks of the frequent phrases.
	 * <p>
	 * Ranks follow the iteration order of this object's
	 * <code>frequentPhrases</code> map: the first phrase
	 * returned by that map's iterator receives rank 0, the
	 * next phrase rank 1, and so on. Given the documented
	 * ordering of that map, rank 0 is the most frequent phrase.
	 * 
	 * @return a map from each frequent phrase to its rank, 
	 *         iterating in increasing rank order
	 */
	protected LinkedHashMap<Phrase,Short> getRanks() {
		
		logger.fine("Calculating ranks of frequent phrases");
		
		final int phraseCount = frequentPhrases.size();
		final LinkedHashMap<Phrase,Short> rankOf = new LinkedHashMap<Phrase,Short>(phraseCount);

		short nextRank = 0;
		final Iterator<Phrase> byFrequency = frequentPhrases.keySet().iterator();
		while (byFrequency.hasNext()) {
			rankOf.put(byFrequency.next(), nextRank);
			nextRank++;
		}
		
		logger.fine("Done calculating ranks");
		
		return rankOf;
	}
	

	/**
	 * Calculates the most frequent phrases in the corpus.
	 * <p>
	 * Allows a threshold to be set for the minimum frequency
	 * to remember, as well as the maximum number of phrases.
	 * <p>
	 * Candidate phrases are drawn from the frequency classes
	 * computed by <code>getFrequencyClasses</code>. Whenever the
	 * number of retained phrases exceeds <code>maxPhrases</code>,
	 * one phrase is dropped and its frequency is marked as pruned;
	 * classes with a pruned frequency are skipped from then on,
	 * and any retained phrases with a pruned frequency at the head
	 * of the queue are discarded at the end.
	 * <p>
	 * This method implements the 
	 * <code>print_LDIs_stack</code> function defined in 
	 * section 2.5 of Yamamoto and Church.
	 *
	 * @param suffixes     a suffix array for the corpus
	 * @param minFrequency the minimum frequency required to
	 *                     retain phrases
	 * @param maxPhrases   the maximum number of phrases to
	 *                     return
	 * @param maxPhraseLength the maximum phrase length to
	 *                     consider
	 * 
	 * @return A map from phrase to the number of times 
	 *         that phrase occurred in the corpus. 
	 *         The iteration order of the map will start 
	 *         with the most frequent phrase, and 
	 *         end with the least frequent calculated phrase.
	 *         
	 * @see "Yamamoto and Church (2001), section 2.5"
	 */
	protected static LinkedHashMap<Phrase,Integer> getMostFrequentPhrases(
			Suffixes suffixes,
			int minFrequency,
			int maxPhrases,
			int maxPhraseLength
	) {
		
		// Presumably Counted orders by ascending count, so that poll()
		// yields the least frequent phrase first -- TODO confirm against Counted.
		PriorityQueue<Counted<Phrase>> frequentPhrases = new PriorityQueue<Counted<Phrase>>();
		Set<Integer> prunedFrequencies = new HashSet<Integer>();
		
		Corpus corpus = suffixes.getCorpus();
		
		FrequencyClasses frequencyClasses = getFrequencyClasses(suffixes);
		
		for (FrequencyClass frequencyClass : frequencyClasses.withMinimumFrequency(minFrequency)) {
			
			int frequency = frequencyClass.getFrequency();
			
			if (! prunedFrequencies.contains(frequency)) {
				
				int i = frequencyClass.getIntervalStart();
				int startOfPhrase = suffixes.getCorpusIndex(i);
				int sentenceNumber = suffixes.getSentenceIndex(startOfPhrase);
				int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
				
				// A phrase may not run past the end of its sentence.
				int max = Math.min(maxPhraseLength, endOfSentence-startOfPhrase);
				if (logger.isLoggable(Level.FINER)) logger.finer("Max phrase length is " + max + " for " + frequencyClass.toString());
				
				for (int phraseLength : frequencyClass.validPhraseLengths(max)) {
					
					int endOfPhrase = startOfPhrase + phraseLength;
					
					Phrase phrase = new ContiguousPhrase(
							startOfPhrase, 
							endOfPhrase, 
							corpus);
					
					frequentPhrases.add(new Counted<Phrase>(phrase, frequency));
					if (frequentPhrases.size() > maxPhrases) {
						Counted<Phrase> pruned = frequentPhrases.poll();
						int prunedFrequency = pruned.getCount();
						prunedFrequencies.add(prunedFrequency);
						// Log at the same level the guard checks (previously logged at INFO).
						if (logger.isLoggable(Level.FINER)) logger.finer("Pruned " + pruned.getElement() + " with frequency " + prunedFrequency);
						// Stop considering further lengths for this frequency class.
						break;
					}
					
				}
			} else if (logger.isLoggable(Level.FINER)) {
				logger.finer("Skipping pruned frequency " + frequency);
			}
		}

		// Discard phrases at the head of the queue whose frequency was pruned.
		while (! frequentPhrases.isEmpty() && prunedFrequencies.contains(frequentPhrases.peek().getCount())) {
			Counted<Phrase> pruned = frequentPhrases.poll();
			if (logger.isLoggable(Level.FINER)) logger.finer("Pruned " + pruned.getElement() + " " + pruned.getCount());
		}
		
		// Drain the queue in poll order, then reverse, so that the
		// last element polled comes first in the result.
		List<Counted<Phrase>> mostFrequentFirst = new ArrayList<Counted<Phrase>>(frequentPhrases.size());
		while (! frequentPhrases.isEmpty()) {
			mostFrequentFirst.add(frequentPhrases.poll());
		}
		Collections.reverse(mostFrequentFirst);
		
		LinkedHashMap<Phrase,Integer> results = new LinkedHashMap<Phrase,Integer>();
		for (Counted<Phrase> countedPhrase : mostFrequentFirst) {
			Phrase phrase = countedPhrase.getElement();
			Integer count = countedPhrase.getCount();
			results.put(phrase, count);
		}
		
		return results;
		
	}
	
	/**
	 * Calculates the frequencies for 
	 * all phrase frequency classes in the corpus.
	 * <p>
	 * This method implements the 
	 * <code>print_LDIs_stack</code> function defined in 
	 * section 2.5 of Yamamoto and Church. The inline comments
	 * quote the corresponding lines of that pseudo-code; the
	 * explicit stack pointer <code>sp</code> of the pseudo-code
	 * is replaced here by two <code>Stack</code> objects that
	 * are always pushed and popped together.
	 * <p>
	 * Every suffix array index is recorded as a trivial interval,
	 * and every interval <code>&lt;i,j&gt;</code> popped from the
	 * stacks is recorded with frequency <code>j-i+1</code> when it
	 * is lcp-delimited, that is, when the larger of its two
	 * bounding LCP values is smaller than its shortest interior
	 * LCP value.
	 *
	 * @param suffixes a suffix array for the corpus
	 * @return the frequency classes recorded for the suffix array
	 *         
	 * @see "Yamamoto and Church (2001), section 2.5"
	 */
	protected static FrequencyClasses getFrequencyClasses(Suffixes suffixes) {
		
		// calculate the longest common prefix delimited intervals...
		int[] longestCommonPrefixes = calculateLongestCommonPrefixes(suffixes);

		// Construct an initially empty object to hold class frequency information
		FrequencyClasses frequencyClasses = new FrequencyClasses(longestCommonPrefixes);
		
		// stack_i <-- an integer array for the stack of left edges, i
		Stack<Integer> startIndices = new Stack<Integer>();
		
		// stack_k <-- an integer array for the stack of representatives, k
		Stack<Integer> shortestInteriorLCPIndices = new Stack<Integer>();
		
		// stack_i[0] <-- 0
		startIndices.push(0);

		// stack_k[0] <-- 0
		// (lcp[0] is 0, and no LCP value is negative, so the loop condition
		// below can never pop this bottom entry)
		shortestInteriorLCPIndices.push(0);
		
		// sp <-- 1 (a stack pointer)
		
		// for j <-- 0,1,2, ..., N-1
		for (int j = 0, size=suffixes.size(); j < size; j++) {	
			
			// Output an lcp-delimited interval <j,j> with tf=1
			//        (trivial interval i==j, frequency=1)
			if (logger.isLoggable(Level.FINEST)) logger.finest("Output trivial interval <"+j+","+j+"> with tf=1");
			frequencyClasses.record(j);
			//frequencyClasses.record(j, j, Integer.MAX_VALUE, 1);

			// While lcp[j+1] < lcp[stack_k[sp-1]] do
			// (j+1 is at most size, the last valid index of the LCP array)
			while (longestCommonPrefixes[j+1] < longestCommonPrefixes[shortestInteriorLCPIndices.peek()]) {
							
				// Pop the left edge and its representative together.
				int i = startIndices.pop();
				int k = shortestInteriorLCPIndices.pop();
				
				int longestBoundingLCP = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]);
				int shortestInteriorLCP = longestCommonPrefixes[k];

				// Output an interval <i,j> with tf=j-i+1, if it is lcp-delimited
				//                    (non-trivial interval)
				// sp <-- sp - 1
				if (longestBoundingLCP < shortestInteriorLCP) {
	
					int frequency = j-i+1;
					if (logger.isLoggable(Level.FINEST)) logger.finest("Output interval <"+i+","+j+"> with k="+k+" and tf="+j+"-"+i+"+1="+(j-i+1));
					frequencyClasses.record(i, j, k, frequency);	
				}
				
			}
			
			// stack_i[sp] <-- stack_k[sp-1]
			startIndices.push(shortestInteriorLCPIndices.peek());

			// stack_k[sp] <-- j+1
			shortestInteriorLCPIndices.push(j+1);

			// sp <-- sp + 1

		}
		
		return frequencyClasses;
	}
			


	/**
	 * Hands the precomputed phrase locations to the suffix array 
	 * via <code>cacheMatchingPhrases</code>.
	 * <p>
	 * First every entry of the frequent collocations list is 
	 * cached; then, for each frequent phrase with an inverted 
	 * index, a <code>HierarchicalPhrases</code> object is built 
	 * from the phrase's pattern, corpus locations and sentence 
	 * numbers, and cached as well. At <code>FINE</code> logging 
	 * level each cached location is also written to the log.
	 */
	public void cacheInvertedIndices() {
	
		for (HierarchicalPhrases collocation : frequentCollocations) {
			suffixes.cacheMatchingPhrases(collocation);
		}
		
		for (Map.Entry<Phrase, InvertedIndex> indexed : invertedIndices.entrySet()) {
			
			final Pattern pattern = new Pattern(indexed.getKey());
			final InvertedIndex locations = indexed.getValue();
			
			suffixes.cacheMatchingPhrases(
					new HierarchicalPhrases(pattern, locations.corpusLocations, locations.sentenceNumbers));
			
			if (logger.isLoggable(Level.FINE)) {
				logger.fine("Cached sorted locations for " + pattern);
			}
			
			final boolean isPeriodPattern = pattern.toString().equals("[.]");
			if (isPeriodPattern) {
				logger.fine("Found .");
			}
			
			if (logger.isLoggable(Level.FINE)) {
				// One "pattern<TAB>location" line per corpus location.
				final StringBuilder dump = new StringBuilder();
				final String label = pattern.toString();
				for (Integer location : locations.corpusLocations) {
					dump.append(label).append('\t').append(location).append('\n');
				}
				logger.fine(dump.toString());
			}
			
		}
		
	}

	/**
	 * Builds the longest-common-prefix (LCP) array that 
	 * accompanies a suffix array.
	 * <p>
	 * The returned array has <code>suffixes.size()+1</code> 
	 * elements. For <code>0 &lt; i &lt; suffixes.size()</code>, 
	 * element <code>i</code> is the number of leading words 
	 * shared by the corpus positions stored at suffix array 
	 * indices <code>i-1</code> and <code>i</code>. The first 
	 * and the last element are 0.
	 * <p>
	 * Counting stops as soon as either position would reach 
	 * index <code>suffixes.size()</code>, the words differ, or 
	 * the count has grown beyond 
	 * <code>Suffixes.MAX_COMPARISON_LENGTH</code>; an entry can 
	 * therefore be as large as that constant plus one.
	 * 
	 * @param suffixes Suffix array
	 * @return Longest common prefix array
	 */
	protected static int[] calculateLongestCommonPrefixes(Suffixes suffixes) {

		final int suffixCount = suffixes.size();
		final Corpus corpus = suffixes.getCorpus();

		final int[] lcp = new int[suffixCount + 1];
		
		for (int rank = 1; rank < suffixCount; rank++) {
			
			final int current = suffixes.getCorpusIndex(rank);
			final int previous = suffixes.getCorpusIndex(rank - 1);

			// Extend the shared prefix one word at a time until a 
			// stopping condition is met. The checks run in this exact 
			// order so words are only read at in-range positions.
			int shared = 0;
			while (true) {
				if (current + shared >= suffixCount) {
					break; // 1st position ran off the end
				}
				if (previous + shared >= suffixCount) {
					break; // 2nd position ran off the end
				}
				if (corpus.getWordID(current + shared) != corpus.getWordID(previous + shared)) {
					break; // words differ
				}
				if (shared > Suffixes.MAX_COMPARISON_LENGTH) {
					break; // compared as far as allowed
				}
				shared++;
			}
			
			lcp[rank] = shared;
		}
		
		// By definition, both boundary elements of lcp are 0
		lcp[0] = 0;
		lcp[suffixCount] = 0;
		
		return lcp;

	}
	
//	/**
//	 * This method extracts phrases which reach the specified
//	 * minimum frequency. It uses the equivalency classes for
//	 * substrings in the interval i-j in the suffix array, as
//	 * defined in section 2.3 of the the Yamamoto and Church
//	 * CL article. This is a helper function for the
//	 * getMostFrequentPhrases method.
//	 * 
//	 * @param suffixes Suffix array
//	 * @param longestCommonPrefixes Longest common prefix array
//	 * @param i Index specifying a starting range in the suffix array
//	 * @param j Index specifying an ending range in the suffix array
//	 * @param k Index specifying a representative value of the range,
//	 *          such that i < k <= j, and such that longestCommonPrefixes[k]
//	 *          is the shortest interior longest common prefix of the range 
//	 *          (see section 2.5 of Yamamoto and Church)
//	 * @param phrases
//	 * @param frequencies
//	 * @param minFrequency
//	 * @param maxPhrases
//	 * @param maxPhraseLength
//	 * @param comparator
//	 */
//	protected static void recordPhraseFrequencies(
//			Suffixes            suffixes,
//			int[]               longestCommonPrefixes,
//			int                 i,
//			int                 j,
//			int                 k,
//			List<Phrase>        phrases,
//			List<Integer>       frequencies,
//			int                 minFrequency,
//			int                 maxPhrases,
//			int                 maxPhraseLength,
//			Comparator<Integer> comparator
//	) {
//		
//		if (i==j) {
//			logger.info("Output trivial interval <"+j+","+j+"> with k="+k+" and tf=1");
//		} else {
//
//			int LBL = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]);
//			int SIL = longestCommonPrefixes[k];
//
//			if (LBL < SIL) {
//				logger.info("Output interval <"+i+","+j+"> with k="+k+" and tf="+j+"-"+i+"+1="+(j-i+1));				
//			} else {
//				logger.info("Interval <"+i+","+j+"> is NOT lcp-delimited, because " + LBL + " not < " +SIL);
//			}
//		}
//	}
	
	
	/**
	 * Builds an inverted index for each frequent phrase that 
	 * occurs in the corpus.
	 * <p>
	 * The corpus is scanned once. At every position, each phrase 
	 * of length 1 through <code>maxPhraseLength</code> that stays 
	 * within the current sentence and within the corpus is looked 
	 * up in <code>frequentPhrases</code>; for every hit the start 
	 * position and the sentence number are recorded in that 
	 * phrase's inverted index. Positions are therefore recorded 
	 * in the order in which the corpus positions are iterated.
	 * 
	 * @return a map from frequent phrase to the inverted index 
	 *         holding its corpus positions and sentence numbers; 
	 *         phrases that were never encountered have no entry
	 */
	private Map<Phrase,InvertedIndex> calculateInvertedIndices() {
		Map<Phrase,InvertedIndex> invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.size());
		
		// Evaluate the log levels once, so that no message strings are
		// built inside the scan unless they will actually be logged.
		final boolean loggingFine = logger.isLoggable(Level.FINE);
		final boolean loggingFiner = logger.isLoggable(Level.FINER);
		
		Corpus corpus = suffixes.getCorpus();
		int endOfCorpus = corpus.size();
		if (loggingFine) logger.fine("Corpus has size " + endOfCorpus);
		
		int sentenceNumber = 0;
		// Marker for the end of the current sentence: phrases may end at,
		// but not beyond, this value.
		int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
		
		// Start at the beginning of the corpus...
		for (int currentPosition : corpus.corpusPositions()) {
			
			// Start with a phrase length of 1, at the current position...
			for (int i = 1, endOfPhrase = currentPosition + i; 
					// ...ensure the phrase length isn't too long...
					i <= maxPhraseLength  &&  
					// ...and that the phrase doesn't extend past the end of the sentence...
					endOfPhrase <= endOfSentence  &&  
					// ...or past the end of the corpus
					endOfPhrase <= endOfCorpus; 
					// ...then increment the phrase length and end of phrase marker.
					i++, endOfPhrase = currentPosition + i) {

				// Get the current phrase
				Phrase phrase = new ContiguousPhrase(currentPosition, endOfPhrase, corpus);
				
				if (loggingFine) logger.fine("In sentence " + sentenceNumber + " found phrase (" +currentPosition + ","+endOfPhrase+") "  + phrase);

				// If the phrase is one we care about...
				if (frequentPhrases.containsKey(phrase)) {

					if (loggingFiner) logger.finer("\"" + phrase + "\" found at currentPosition " + currentPosition);

					// Fetch the phrase's index, creating it on first sight.
					InvertedIndex invertedIndex = invertedIndices.get(phrase);
					if (invertedIndex == null) {
						invertedIndex = new InvertedIndex();
						invertedIndices.put(phrase, invertedIndex);
					}
					
					if (loggingFine) logger.fine("Recording position " + currentPosition + " in sentence " + sentenceNumber + " for phrase " + phrase);
					invertedIndex.record(currentPosition, sentenceNumber);

				}
				
			} // end iterating over various phrase lengths

			// Move on to the next sentence once its last position has been handled.
			if (currentPosition+1 == endOfSentence) {
				sentenceNumber += 1;
				endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
			}
		}
		
		return invertedIndices;
	}
	
	/**
	 * Restores the state of this object from binary data.
	 * <p>
	 * See the Javadoc for the java.io.Externalizable interface.
	 * The data is consumed in the following order, which mirrors
	 * {@link #writeExternal(ObjectOutput)}:
	 * <ol>
	 * <li>maximum number of phrases (short)</li>
	 * <li>maximum phrase length (int)</li>
	 * <li>number of frequent phrases (int), then for each phrase
	 *     its corpus count, its token count, and its word IDs</li>
	 * <li>number of inverted indices (int), then for each one
	 *     the token count and word IDs of its phrase, followed by
	 *     the data read by <code>InvertedIndex.readExternal</code></li>
	 * <li>number of collocations (int), then for each one
	 *     the pattern's word IDs, the sentence numbers,
	 *     and the terminal sequence start indices,
	 *     each array preceded by its length</li>
	 * </ol>
	 * Word IDs are interpreted using the vocabulary
	 * of the suffix array backing this object.
	 *
	 * @param in Source from which the serialized state is read
	 * @throws IOException if reading from the input fails
	 * @throws ClassNotFoundException as declared by the
	 *         java.io.Externalizable contract
	 */
	public void readExternal(ObjectInput in) throws IOException,
			ClassNotFoundException {
		
		boolean loggingFiner = logger.isLoggable(Level.FINER);
		
		SymbolTable vocab = suffixes.getVocabulary();
		
		// Read in the maximum number of phrases of which this object is aware.
		this.maxPhrases = in.readShort();
		if (loggingFiner) logger.finer(" Read: maxPhrases="+maxPhrases);
		
		// Read in the maximum phrase length to consider.
		this.maxPhraseLength = in.readInt();
		if (loggingFiner) logger.finer(" Read: maxPhraseLength="+maxPhraseLength);
		
		// Read in the count of frequent phrase types
		int frequentPhrasesSize = in.readInt();
		if (loggingFiner) logger.finer(" Read: frequentPhrases.size()="+frequentPhrasesSize);
		
		// Read in the frequentPhrases map.
		// A LinkedHashMap keeps the phrases in the order
		// in which they appear in the input.
		this.frequentPhrases = new LinkedHashMap<Phrase,Integer>();
		for (int i=0; i<frequentPhrasesSize; i++) {
			
			// Read in number of times the phrase is found in the corpus
			int count = in.readInt();
			if (loggingFiner) logger.finer(" Read: phraseCount="+count);
			
			// Read in the number of tokens in the phrase
			int tokenCount = in.readInt();
			if (loggingFiner) logger.finer(" Read: wordIDs.length="+tokenCount);
			
			// Read in each token in the phrase
			int[] wordIDs = new int[tokenCount];
			for (int j=0; j<tokenCount; j++) {
				int wordID = in.readInt();
				if (loggingFiner) logger.finer(" Read: wordIDs["+j+"]="+wordID);
				wordIDs[j] = wordID;
			}
			
			BasicPhrase phrase = new BasicPhrase(wordIDs, vocab);
			this.frequentPhrases.put(phrase, count);
			
		}
		
		// Read in number of inverted indices
		int invertedIndicesCount = in.readInt();
		if (loggingFiner) logger.finer(" Read: invertedIndices.size()="+invertedIndicesCount);
		
		// Read in inverted indices
		this.invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.size());
		for (int i=0; i<invertedIndicesCount; i++) {
			
			// Read in the number of tokens in the phrase
			int tokenCount = in.readInt();
			if (loggingFiner) logger.finer(" Read: wordIDs.length="+tokenCount);
			
			// Read in each token in the phrase
			int[] wordIDs = new int[tokenCount];
			for (int j=0; j<tokenCount; j++) {
				wordIDs[j] = in.readInt();
				if (loggingFiner) logger.finer(" Read: wordID["+j+"]="+wordIDs[j]);
			}
			
			// Reconstruct phrase
			BasicPhrase phrase = new BasicPhrase(wordIDs, vocab);
			
			// Read in inverted index; the index object
			// consumes its own portion of the input.
			InvertedIndex invertedIndex = new InvertedIndex();
			if (loggingFiner) logger.finer(" Read: about to InvertedIndex");
			invertedIndex.readExternal(in);
			
			this.invertedIndices.put(phrase, invertedIndex);
		}
		
		// Read collocations
		int frequentCollocationsSize = in.readInt();
		this.frequentCollocations = new ArrayList<HierarchicalPhrases>(frequentCollocationsSize);
		for (int i=0; i<frequentCollocationsSize; i++) {
			
			// Read the pattern
			int wordsLength = in.readInt();
			int[] words = new int[wordsLength];
			for (int j=0; j<wordsLength; j++) {
				words[j]=in.readInt();
			}
			Pattern pattern = new Pattern(vocab, words);
			
			// Next, read the sentence numbers,
			// preceded by the number of values to expect
			int[] sentenceNumber = new int[in.readInt()];
			for (int j=0, n=sentenceNumber.length; j<n; j++) {
				sentenceNumber[j] = in.readInt();
			}
			
			// Next, read the start indices of the corpus matches,
			// preceded by the number of values to expect
			int[] terminalSequenceStartIndices = new int[in.readInt()];
			for (int j=0, n=terminalSequenceStartIndices.length; j<n; j++) {
				terminalSequenceStartIndices[j] = in.readInt();
			}

			HierarchicalPhrases phrases = new HierarchicalPhrases(pattern, terminalSequenceStartIndices, sentenceNumber);
			this.frequentCollocations.add(phrases);
			
		}
	}

	/**
	 * Writes the state of this object as binary data.
	 * <p>
	 * The data is written in the following order:
	 * <ol>
	 * <li>maximum number of phrases (short)</li>
	 * <li>maximum phrase length (int)</li>
	 * <li>number of frequent phrases (int), then for each phrase
	 *     its corpus count, its token count, and its word IDs</li>
	 * <li>number of inverted indices (int), then for each one
	 *     the token count and word IDs of its phrase, followed by
	 *     the inverted index itself</li>
	 * <li>number of collocations (int), then for each one
	 *     the pattern's word IDs, the sentence numbers,
	 *     and the terminal sequence start indices,
	 *     each array preceded by its length</li>
	 * </ol>
	 *
	 * @param out Destination to which the state is written
	 * @throws IOException if writing to the output fails
	 */
	public void writeExternal(ObjectOutput out) throws IOException { 
		
		boolean loggingFiner = logger.isLoggable(Level.FINER);
		
		// Write out maximum number of phrases of which this object is aware.
		// Logged at FINER so that the message is actually emitted
		// when the FINER guard above is satisfied.
		out.writeShort(maxPhrases);
		if (loggingFiner) logger.finer("Wrote: maxPhrases="+maxPhrases);
		
		// Write out maximum phrase length to consider.
		out.writeInt(maxPhraseLength);
		if (loggingFiner) logger.finer("Wrote: maxPhraseLength="+maxPhraseLength);
		
		// Write out count of frequent phrase types
		out.writeInt(frequentPhrases.size());
		if (loggingFiner) logger.finer("Wrote: frequentPhrases.size()="+frequentPhrases.size());
		
		// Write out frequentPhrases map
		for (Map.Entry<Phrase, Integer> entry : frequentPhrases.entrySet()) {
			Phrase phrase = entry.getKey();
			int phraseCount = entry.getValue();
			int[] wordIDs = phrase.getWordIDs();
			
			// Write out number of times the phrase is found in the corpus
			out.writeInt(phraseCount);
			if (loggingFiner) logger.finer("Wrote: phraseCount="+phraseCount);
			
			// Write out the number of tokens in the phrase
			out.writeInt(wordIDs.length);
			if (loggingFiner) logger.finer("Wrote: wordIDs.length="+wordIDs.length);
			
			// Write out each token in the phrase
			int index = 0;
			for (int wordID : wordIDs) {
				out.writeInt(wordID);
				if (loggingFiner) logger.finer("Wrote: wordIDs["+index+"]="+wordID);
				index+=1;
			}
		}
		
		// Write out number of inverted indices
		out.writeInt(invertedIndices.size());
		if (loggingFiner) logger.finer("Wrote: invertedIndices.size()="+invertedIndices.size());
		
		// Write out inverted indices
		for (Map.Entry<Phrase, InvertedIndex> entry : invertedIndices.entrySet()) {
			
			Pattern pattern = new Pattern(entry.getKey());
			int[] wordIDs = pattern.getWordIDs();
			
			// Write out number of tokens in the pattern
			out.writeInt(wordIDs.length);
			if (loggingFiner) logger.finer("Wrote: wordIDs.length="+wordIDs.length);
			
			// Write out each token in the phrase
			int index = 0;
			for (int wordID : wordIDs) {
				out.writeInt(wordID);
				if (loggingFiner) logger.finer("Wrote: wordID["+index+"]="+wordID);
				index+=1;
			}
			
			// Write out inverted index for this phrase.
			// NOTE(review): readExternal consumes this data by calling
			// InvertedIndex.readExternal(in) directly, which only matches
			// if out.writeObject emits the raw externalized form --
			// confirm against the ObjectOutput implementation in use.
			InvertedIndex list = entry.getValue();
			if (loggingFiner) logger.finer("Wrote: about to InvertedIndex");
			out.writeObject(list);
		}
		
		// Write collocations
		out.writeInt(frequentCollocations.size());
		for (HierarchicalPhrases phrases : frequentCollocations) {
			
			// Write the pattern
			int[] words = phrases.pattern.getWordIDs();
			out.writeInt(words.length);
			for (int token : words) {
				out.writeInt(token);
			}
			
			// Next, write the sentence numbers,
			// preceded by the number of values
			out.writeInt(phrases.sentenceNumber.length);
			for (int n : phrases.sentenceNumber) {
				out.writeInt(n);
			}
			
			// Next, write the start indices of the corpus matches,
			// preceded by the number of values
			out.writeInt(phrases.terminalSequenceStartIndices.length);
			for (int startIndex : phrases.terminalSequenceStartIndices) {
				out.writeInt(startIndex);
			}
			
		}
		
	}
	

	/**
	 * Gets a textual listing of the frequent phrases.
	 * <p>
	 * Each line consists of a phrase's frequency, a tab,
	 * the phrase itself, and a newline. Frequencies are
	 * right-aligned in a column whose width is the number
	 * of characters needed to print the first frequency
	 * encountered during iteration.
	 *
	 * @return one line per frequent phrase, in map iteration order;
	 *         the empty string if there are no frequent phrases
	 */
	public String toString() {

		StringBuilder listing = new StringBuilder();

		// Determined lazily from the first entry,
		// then reused for every subsequent entry.
		String columnFormat = null;

		for (Map.Entry<Phrase, Integer> phraseAndCount : frequentPhrases.entrySet()) {

			Phrase phrase = phraseAndCount.getKey();
			Integer count = phraseAndCount.getValue();

			if (columnFormat == null) {
				columnFormat = "%1$" + count.toString().length() + "d";
			}

			listing.append(String.format(columnFormat, count))
				.append('\t')
				.append(phrase.toString())
				.append('\n');

		}

		return listing.toString();
	}


	/**
	 * Private helper method for performing fast intersection.
	 * <p>
	 * The median element of the queries is looked up in the data
	 * using binary search. If it is present, it is added to the result.
	 * The position where the median query is (or would be) located
	 * splits the data into two parts: queries smaller than
	 * the median can only match data before that position, and
	 * queries larger than the median can only match data after it.
	 * The method then recurses on each of those two pairings.
	 * <p>
	 * Both lists must be sorted in ascending natural order,
	 * as required by <code>Collections.binarySearch</code>.
	 * If either list is empty, nothing is added to the result.
	 *
	 * @param <E> Type of the elements being intersected
	 * @param sortedData List of elements, sorted in ascending order,
	 *                   in which the queries are looked up
	 * @param sortedQueries List of elements, sorted in ascending order,
	 *                   to look up in the data
	 * @param result Set to which every query
	 *               that is found in the data is added
	 */
	private static <E extends Comparable<E>> void fastIntersect(List<E> sortedData, List<E> sortedQueries, SortedSet<E> result) {

		// Nothing can match if there is nothing to look for,
		// or nothing to look in.
		if (sortedQueries.isEmpty() || sortedData.isEmpty()) {
			return;
		}

		int medianQueryIndex = sortedQueries.size() / 2;
		E medianQuery = sortedQueries.get(medianQueryIndex);

		int searchResult = Collections.binarySearch(sortedData, medianQuery);

		// Exclusive end of the data that is smaller than the median query
		int smallerDataEnd;

		// Inclusive start of the data that is larger than the median query
		int largerDataStart;

		if (searchResult >= 0) {
			result.add(medianQuery);
			smallerDataEnd = searchResult;
			largerDataStart = searchResult + 1;
		} else {
			// When the key is absent, binarySearch returns
			// (-(insertion point) - 1), where the insertion point
			// is the index of the first element greater than the key.
			int insertionPoint = -(searchResult + 1);
			smallerDataEnd = insertionPoint;
			largerDataStart = insertionPoint;
		}

		// Queries before the median can only match data before the split
		if (smallerDataEnd > 0 && medianQueryIndex > 0) {
			fastIntersect(sortedData.subList(0, smallerDataEnd), sortedQueries.subList(0, medianQueryIndex), result);
		}

		// Queries after the median can only match data after the split
		if (largerDataStart < sortedData.size()  &&  medianQueryIndex+1 < sortedQueries.size()) {
			fastIntersect(sortedData.subList(largerDataStart, sortedData.size()), sortedQueries.subList(medianQueryIndex+1, sortedQueries.size()), result);
		}
	}	


	//===============================================================
	// Static
	//===============================================================



	//===============================================================
	// Inner classes
	//===============================================================

	

	//===============================================================
	// Main method
	//===============================================================
	
	/**
	 * Calculates and logs the most frequent phrases in a corpus,
	 * followed by the collocations of those phrases,
	 * ordered from the largest collocation to the smallest.
	 * <p>
	 * When one argument is provided, it is treated as the name
	 * of a corpus file, from which the vocabulary, corpus array,
	 * and suffix array are constructed.
	 * <p>
	 * When three arguments are provided, they are treated as the names
	 * of a binary vocabulary file, a binary corpus file,
	 * and a binary suffix array file, in that order.
	 * <p>
	 * For any other number of arguments,
	 * a usage message is printed and the program exits.
	 *
	 * @param args Command line arguments, as described above
	 * @throws IOException if an input file cannot be read
	 * @throws ClassNotFoundException declared because reading
	 *         the binary vocabulary may throw it
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException {

		Vocabulary symbolTable;
		Corpus corpusArray;
		Suffixes suffixArray;

		if (args.length == 1) {

			String corpusFileName = args[0];

			logger.info("Constructing vocabulary from file " + corpusFileName);
			symbolTable = new Vocabulary();
			int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

			logger.info("Constructing corpus array from file " + corpusFileName);
			corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

			logger.info("Constructing suffix array from file " + corpusFileName);
			suffixArray = new SuffixArray(corpusArray, Cache.DEFAULT_CAPACITY);

		} else if (args.length == 3) {

			String binarySourceVocabFileName = args[0];
			String binaryCorpusFileName = args[1];
			String binarySuffixArrayFileName = args[2];

			if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language vocabulary from binary file " + binarySourceVocabFileName);
			ObjectInput in = BinaryIn.vocabulary(binarySourceVocabFileName);
			symbolTable = new Vocabulary();
			try {
				symbolTable.readExternal(in);
			} finally {
				// Release the vocabulary input once it has been consumed
				in.close();
			}

			logger.info("Constructing corpus array from file " + binaryCorpusFileName);
			if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped source language corpus array.");
			corpusArray = new MemoryMappedCorpusArray(symbolTable, binaryCorpusFileName);

			logger.info("Constructing suffix array from file " + binarySuffixArrayFileName);
			suffixArray = new MemoryMappedSuffixArray(binarySuffixArrayFileName, corpusArray, Cache.DEFAULT_CAPACITY);

		} else {

			String className = FrequentPhrases.class.getName();
			System.err.println("Usage: java " + className + " source.vocab source.corpus source.suffixes");
			System.err.println("   or: java " + className + " source.corpus");
			System.exit(0);

			// Not reached; tells the compiler that
			// this branch does not continue below.
			return;

		}

		int minFrequency = 0;
		short maxPhrases = 100;
		int maxPhraseLength = 10;
		int maxPhraseSpan = 10;
		short minNonterminalSpan = 2;

		logger.info("Calculating " + maxPhrases + " most frequent phrases");
		FrequentPhrases frequentPhrases = new FrequentPhrases(suffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);

		logger.info("Frequent phrases: \n" + frequentPhrases.toString());

		logger.info("Caching inverted indices");
		frequentPhrases.cacheInvertedIndices();
		
		logger.info("Calculating collocations for most frequent phrases");
		List<HierarchicalPhrases> collocations = frequentPhrases.getFrequentCollocations();
		
		// Orders collocations from largest to smallest
		Comparator<HierarchicalPhrases> compare = new Comparator<HierarchicalPhrases>() {
			public int compare(HierarchicalPhrases o1, HierarchicalPhrases o2) {
				Integer i1 = o1.size();
				Integer i2 = o2.size();
				return i2.compareTo(i1);
			}
			
		};
		Collections.sort(collocations,compare);
		for (HierarchicalPhrases locations : collocations) {
			logger.info(locations.toString());
		}

	}
	
}