package org.wikibrain.core.nlp;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Generates word n-grams (contiguous runs of {@code n} consecutive words)
 * from a list of words or tokens.
 *
 * Adapted from http://stackoverflow.com/questions/3656762/n-gram-generation-from-a-sentence
 * @author Shilad Sen
 */
public class NGramCreator {

    /**
     * Returns every n-gram of exactly {@code n} words.
     *
     * @param words the words of the sentence, in order
     * @param n     the number of words in each n-gram
     * @return the n-grams in order of their starting position; empty if
     *         {@code n} is less than 1 or greater than {@code words.size()}
     */
    public List<String> getNGrams(List<String> words, int n) {
        return getNGrams(words, n, n);
    }

    /**
     * Returns every n-gram whose size lies in {@code [minN, maxN]}.
     * Each n-gram is its words joined by a single space.
     *
     * @param words the words of the sentence, in order
     * @param minN  the smallest n-gram size; values below 1 are treated as 1
     * @param maxN  the largest n-gram size; values above {@code words.size()}
     *              are treated as {@code words.size()}
     * @return the n-grams grouped by increasing size and, within a size,
     *         ordered by starting position
     */
    public List<String> getNGrams(List<String> words, int minN, int maxN) {
        List<String> ngrams = new ArrayList<String>();
        // Sizes below 1 would only yield empty strings, and sizes above the
        // word count yield nothing. Clamping also keeps the loop counter from
        // overflowing when maxN is Integer.MAX_VALUE.
        int smallest = Math.max(1, minN);
        int largest = Math.min(maxN, words.size());
        for (int n = smallest; n <= largest; n++) {
            int lastStart = words.size() - n;
            for (int i = 0; i <= lastStart; i++) {
                ngrams.add(concat(words, i, i + n));
            }
        }
        return ngrams;
    }

    /**
     * Returns one {@link Token} for every run of consecutive tokens whose
     * length lies in {@code [minN, maxN]}. Each result is built with
     * {@code new Token(List<Token>)} from a {@link List#subList} view of
     * {@code words}.
     *
     * @param words the tokens of the sentence, in order
     * @param minN  the smallest n-gram size; values below 1 are treated as 1
     * @param maxN  the largest n-gram size; values above {@code words.size()}
     *              are treated as {@code words.size()}
     * @return the n-gram tokens grouped by increasing size and, within a
     *         size, ordered by starting position
     */
    public List<Token> getNGramTokens(List<Token> words, int minN, int maxN) {
        List<Token> ngrams = new ArrayList<Token>();
        // Same clamping as getNGrams: avoids empty or invalid sub-lists for
        // n < 1 and counter overflow for a very large maxN.
        int smallest = Math.max(1, minN);
        int largest = Math.min(maxN, words.size());
        for (int n = smallest; n <= largest; n++) {
            int lastStart = words.size() - n;
            for (int i = 0; i <= lastStart; i++) {
                // NOTE(review): subList is a view backed by `words`; this
                // assumes Token does not need an independent copy - confirm.
                ngrams.add(new Token(words.subList(i, i + n)));
            }
        }
        return ngrams;
    }

    /**
     * Joins {@code words[start..end)} with single spaces.
     *
     * @param words the source words
     * @param start index of the first word to include (inclusive)
     * @param end   index just past the last word to include (exclusive)
     * @return the joined words, or the empty string if {@code end <= start}
     */
    public static String concat(List<String> words, int start, int end) {
        StringBuilder sb = new StringBuilder();
        for (int i = start; i < end; i++) {
            if (i > start) {
                sb.append(' ');
            }
            sb.append(words.get(i));
        }
        return sb.toString();
    }
}