package edu.umd.rhsmith.diads.tools.tfidf;
import java.util.LinkedList;
import java.util.List;
public class NGramGenerator extends DefaultTermFilter {
private final int maxN;
public NGramGenerator(int n) throws IllegalArgumentException {
if (n <= 0) {
throw new IllegalArgumentException();
}
this.maxN = n;
}
@Override
public List<String> filterTerms(List<String> terms) {
if (this.getPreFilter() != null) {
terms = this.getPreFilter().filterTerms(terms);
}
List<String> nGrams = generateNGrams(terms, maxN, this.getCleaner());
return nGrams;
}
public static List<String> generateNGrams(List<String> terms, int maxN,
TermCleaner cleaner) {
String[] t = new String[terms.size()];
terms.toArray(t);
List<String> nGrams = new LinkedList<String>();
for (int s = 0; s < t.length; s++) {
StringBuilder nGram = new StringBuilder();
for (int x = s; x < s + maxN && x < t.length; x++) {
String term = t[x];
if (cleaner != null) {
term = cleaner.clean(term);
if (term == null) {
continue;
}
}
nGram.append(term);
nGrams.add(nGram.toString());
nGram.append(' ');
}
}
return nGrams;
}
public static List<String> generateNGrams(List<String> terms, int maxN) {
return generateNGrams(terms, maxN, null);
}
}