/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.suffix_array; import joshua.corpus.Corpus; import joshua.corpus.MatchedHierarchicalPhrases; import joshua.decoder.ff.tm.Rule; import joshua.util.FileUtility; import joshua.util.Cache; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectOutput; import java.util.List; import java.util.Random; import java.util.logging.Logger; /** * SuffixArray is the main class for producing suffix arrays from * corpora, and manipulating them once created. Suffix arrays are * a space economical way of storing a corpus and allowing very * quick searching of any substring within the corpus. A suffix * array contains a list of references to every point in a corpus, * and each reference denotes the suffix starting at that point and * continuing to the end of the corpus. The suffix array is sorted * alphabetically, so any substring within the corpus can be found * with a binary search in O(log n) time, where n is the length of * the corpus. * * @author Colin Bannard * @since 10 December 2004 * @author Josh Schroeder * @since 2 Jan 2005 * @author Chris Callison-Burch * @since 9 February 2005 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $ */ public class SuffixArray extends AbstractSuffixArray { /** * A random number generator used in the quick sort * implementation. */ private static final Random RAND = new Random(); /** Logger for this class. */ private static final Logger logger = Logger.getLogger(SuffixArray.class.getName()); //=============================================================== // Member variables //=============================================================== protected int[] suffixes; //=============================================================== // Constructor(s) //=============================================================== public SuffixArray(Corpus corpusArray) { this(corpusArray, DEFAULT_CACHE_CAPACITY); } /** * Constructor takes a CorpusArray and creates a sorted * suffix array from it. */ public SuffixArray(Corpus corpusArray, int maxCacheSize) { super(corpusArray, new Cache<Pattern,MatchedHierarchicalPhrases>(maxCacheSize), new Cache<Pattern,List<Rule>>(maxCacheSize)); // (maxCacheSize > 0) ? // new Cache<Pattern,MatchedHierarchicalPhrases>(maxCacheSize) : // null); suffixes = new int[corpusArray.size()]; // Create an array of suffix IDs for (int i = 0, n=corpusArray.size(); i < n; i++) { suffixes[i] = i; } // Sort the array of suffixes sort(suffixes); } // /** // * Protected constructor takes in the already prepared // * member variables. // * // * @see joshua.corpus.suffix_array.SuffixArrayFactory#createSuffixArray(Corpus,int) // */ // protected SuffixArray(int[] suffixes, Corpus corpusArray) { // this(suffixes, corpusArray, DEFAULT_CACHE_CAPACITY); // } // // /** // * Protected constructor takes in the already prepared // * member variables. // * // * @see joshua.corpus.suffix_array.SuffixArrayFactory#createSuffixArray(Corpus,int) // */ // protected SuffixArray(int[] suffixes, Corpus corpusArray, int maxCacheSize) { // super(corpusArray, // new Cache<Pattern,MatchedHierarchicalPhrases>(maxCacheSize), // new Cache<Pattern,List<Rule>>(maxCacheSize)); //// (maxCacheSize > 0) ? //// new Cache<Pattern,MatchedHierarchicalPhrases>(maxCacheSize) : //// null); // // this.suffixes = suffixes; // // } //=============================================================== // Public //=============================================================== //=========================================================== // Accessor methods (set/get) //=========================================================== /** * @return the position in the corpus corresponding to the * specified index in the suffix array. */ public int getCorpusIndex(int suffixIndex) { return suffixes[suffixIndex]; } /** * Returns the number of suffixes in the suffix array, which * is identical to the length of the corpus. */ public int size() { return suffixes.length; } //=========================================================== // Methods //=========================================================== /** * Sorts the initalized, unsorted suffixes. Uses quick sort * and the compareSuffixes method defined in CorpusArray. */ protected void sort(int[] suffixes) { qsort(suffixes, 0, suffixes.length - 1); } public void writeWordIDsToFile(String filename) throws IOException { FileOutputStream out = FileUtility.writeBytes(new int[]{size()}, filename); FileUtility.writeBytes(suffixes, out); } //=============================================================== // Private //=============================================================== //=============================================================== // Methods //=============================================================== /** Quick sort */ private void qsort(int[] array, int begin, int end) { if (end > begin) { int index; { index = begin + RAND.nextInt(end - begin + 1); int pivot = array[index]; { int tmp = array[index]; array[index] = array[end]; array[end] = tmp; } for (int i = index = begin; i < end; ++ i) { if (corpus.compareSuffixes(array[i], pivot, MAX_COMPARISON_LENGTH) <= 0) { { int tmp = array[index]; array[index] = array[i]; array[i] = tmp; index++; } } } { int tmp = array[index]; array[index] = array[end]; array[end] = tmp; } } qsort(array, begin, index - 1); qsort(array, index + 1, end); } } public void writeExternal(ObjectOutput out) throws IOException { // Write the corpus logger.finer("Writing corpus to object output..."); out.writeObject(corpus); logger.finer("Writing suffix length to object output..."); out.writeInt(suffixes.length); logger.finer("Writing suffixes to object output..."); for (int word : suffixes) { out.writeInt(word); } logger.finer("Completed externalization"); } //=============================================================== // Main //=============================================================== }