/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.suffix_array; import java.util.Arrays; import java.util.Iterator; /** * Represents a term frequency class in the context of * Yamamoto & Church (2001). * * @author Lane Schwartz */ public class FrequencyClass { /** * Inclusive start index of an interval. * * This index is an index into a suffix array. */ private final int i; /** * Inclusive end index of an interval. * * This index is an index into a suffix array. */ private final int j; /** * Representative index into a suffix array. * <p> * For an interval that is lcp-delimited, * the interval is uniquely determined by a representative index k * such that i < k <= j, and lcp[k] is the shortest interior lcp * for the interval. */ private final int k; /** * Term frequency of each terms that is a member * of the class defined by the interval <i,j>. */ private final int frequency; private final int[] longestCommonPrefixes; /** * Constructs a non-trivial frequency record. * * @param i Inclusive start index of an interval. * This index is an index into a suffix array. * @param j Inclusive end index of an interval. * This index is an index into a suffix array. * @param k Representative index into a suffix array. * For an interval that is lcp-delimited, * the interval is uniquely determined by * a representative index k such that i < k <= j, * and lcp[k] is the shortest interior lcp * for the interval. * @param frequency Term frequency of each terms that is a member * of the class defined by the interval <i,j>. * @param longestCommonPrefixes Longest common prefix array */ public FrequencyClass(int i, int j, int k, int frequency, int[] longestCommonPrefixes) { this.i = i; this.j = j; this.k = k; this.frequency = frequency; this.longestCommonPrefixes = longestCommonPrefixes; } /** * Constructs a trivial frequency record. * * @param j Inclusive start and end index of an interval. * This index is an index into a suffix array. * @param longestCommonPrefixes Longest common prefix array */ public FrequencyClass(int j, int[] longestCommonPrefixes) { this.i = j; this.j = j; this.k = -1; this.frequency = 1; this.longestCommonPrefixes = longestCommonPrefixes; } public boolean hasTrivialInterval() { return i==j; } /** * Gets the inclusive start index of the interval. * * @return The inclusive start index of the interval */ public int getIntervalStart() { return this.i; } /** * Gets the inclusive end index of the interval. * * @return The inclusive end index of the interval */ public int getIntervalEnd() { return this.j; } /** * Gets the representative index into a suffix array. * For an interval that is lcp-delimited, * the interval is uniquely determined by * a representative index k such that i < k <= j, * and lcp[k] is the shortest interior lcp * for the interval. * * @return the representative index into a suffix array. * For an interval that is lcp-delimited, * the interval is uniquely determined by * a representative index k such that i < k <= j, * and lcp[k] is the shortest interior lcp * for the interval. */ public int getRepresentativeIndex() { return this.k; } /** * Gets the term frequency of each terms that is a member * of the class defined by the interval <i,j>. * * @return The term frequency of each terms that is a member * of the class defined by the interval <i,j>. */ public int getFrequency() { return this.frequency; } public Iterable<Integer> validPhraseLengths(final int maxPhraseLength) { final int longestBoundingLCP = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]); final int shortestInteriorLCP = (hasTrivialInterval()) ? Integer.MAX_VALUE : longestCommonPrefixes[k]; return new Iterable<Integer>() { public Iterator<Integer> iterator() { return new Iterator<Integer>() { int max = Math.min(maxPhraseLength, shortestInteriorLCP); int m = longestBoundingLCP+1; public boolean hasNext() { return m <= max; } public Integer next() { int next = m; m += 1; return next; } public void remove() { throw new UnsupportedOperationException(); } }; } }; } public boolean equals(Object o) { if (o instanceof FrequencyClass) { FrequencyClass other = (FrequencyClass) o; if (this.i==other.i && this.j==other.j && this.k==other.k && this.frequency==other.frequency && Arrays.equals( this.longestCommonPrefixes, other.longestCommonPrefixes)) { return true; } else { return false; } } else { return false; } } public int hashCode() { return frequency + 37*i + 71*j + 83*k + Arrays.hashCode(longestCommonPrefixes); } public String toString() { if (hasTrivialInterval()) { return "trivial <"+i+","+j+">, tf="+frequency; } else { return "nontrivial <"+i+","+j+"> rep="+k+", tf="+frequency; } } }