/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Represents a list of term frequency classes in the context of
* Yamamoto & Church (2001).
*
* @author Lane Schwartz
*/
public class FrequencyClasses implements Iterable<FrequencyClass> {
final ArrayList<Integer> data;
final int[] longestCommonPrefixes;
int numClasses;
int numTrivialClasses;
/**
* Constructs an initially empty list of frequency class data.
*/
public FrequencyClasses(int[] longestCommonPrefixes) {
this.data = new ArrayList<Integer>();
this.longestCommonPrefixes = longestCommonPrefixes;
this.numClasses = 0;
this.numTrivialClasses = 0;
}
/**
* Record a term frequency class.
*
* @param i Inclusive start index of an interval.
* This index is an index into a suffix array.
* @param j Inclusive end index of an interval.
* This index is an index into a suffix array.
* @param k Representative index into a suffix array.
* For an interval that is lcp-delimited,
* the interval is uniquely determined by
* a representative index k such that i < k <= j,
* and lcp[k] is the shortest interior lcp
* for the interval.
* @param frequency Term frequency of each terms that is a member
* of the class defined by the interval <i,j>.
*/
public void record(int i, int j, int k, int frequency) {
data.add(i);
data.add(j);
data.add(k);
data.add(frequency);
numClasses++;
}
/**
* Record a trivial term frequency class.
*
* @param j Inclusive start and end index of an interval.
* This index is an index into a suffix array.
*/
public void record(int j) {
data.add(j);
data.add(j);
numTrivialClasses++;
}
/**
* Gets the number of frequency classes
* stored in this object.
*
* @return The number of frequency classes
* stored in this object.
*/
public int size() {
return numClasses + numTrivialClasses;
}
/**
* Gets an iterator capable of traversing
* all term frequency classes recorded by this object.
*
* @return An iterator capable of traversing
* all term frequency classes recorded by this object
*/
public Iterator<FrequencyClass> iterator() {
return this.withMinimumFrequency(0).iterator();
}
public Iterable<FrequencyClass> withMinimumFrequency(final int minFrequency) {
return new Iterable<FrequencyClass>() {
public Iterator<FrequencyClass> iterator() {
return new Iterator<FrequencyClass>() {
int index=0;
FrequencyClass next = null;
public boolean hasNext() {
boolean hasNext = false;
while (index<data.size()) {
int i = data.get(index++);
int j = data.get(index++);
if (i==j) {
if (minFrequency <= 1) {
next = new FrequencyClass(j, longestCommonPrefixes);
hasNext = true;
break;
}
} else {
int k = data.get(index++);
int frequency = data.get(index++);
if (frequency >= minFrequency) {
next = new FrequencyClass(i, j, k, frequency, longestCommonPrefixes);
hasNext = true;
break;
}
}
}
return hasNext;
}
public FrequencyClass next() {
if (next==null) {
if (hasNext()) {
return next;
} else {
throw new NoSuchElementException();
}
} else {
return next;
}
}
public void remove() {
throw new UnsupportedOperationException();
}
};
}
};
}
}