package com.miguelfonseca.completely.text.analyze.tokenize;

import com.miguelfonseca.completely.text.analyze.Analyzer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;

import static com.miguelfonseca.completely.common.Precondition.checkArgument;
import static com.miguelfonseca.completely.common.Precondition.checkPointer;

/**
 * Break text into q-grams (also known as n-grams).
 *
 * <p>Every input text is split into all of its contiguous substrings of exactly
 * {@code size} {@code char}s, in left-to-right order. Positions are counted in
 * {@code char} units (as used by {@link String#substring(int, int)}), so a
 * supplementary character encoded as a surrogate pair may be split across tokens.
 */
public class QGramTokenizer extends Analyzer
{
    /** Length, in {@code char}s, of every token produced. */
    private final int size;

    /**
     * Constructs a new {@link QGramTokenizer}.
     *
     * @param size length of each q-gram; {@code 0} is accepted, in which case
     *             every text yields {@code text.length() + 1} empty tokens
     * @throws IllegalArgumentException if {@code size} is negative
     */
    public QGramTokenizer(int size)
    {
        checkArgument(size >= 0);
        this.size = size;
    }

    /**
     * Splits each text of {@code input} into its q-grams.
     *
     * <p>Tokens are emitted in input order, and within one text from left to
     * right. A text shorter than {@code size} contributes no tokens.
     *
     * @param input texts to tokenize; neither the collection nor any of its
     *              elements may be {@code null} (enforced through
     *              {@code checkPointer}; the exception type is defined by that
     *              helper, presumably {@link NullPointerException} - confirm)
     * @return a new list holding the q-grams of all texts
     */
    @Override
    public Collection<String> apply(Collection<String> input)
    {
        checkPointer(input != null);

        // Append-only accumulation: ArrayList avoids a node allocation per token.
        List<String> tokens = new ArrayList<>();
        for (String text : input)
        {
            checkPointer(text != null);

            // Last index at which a full q-gram still fits; negative when the
            // text is shorter than size, so the loop body is skipped entirely.
            int lastStart = text.length() - size;
            for (int start = 0; start <= lastStart; ++start)
            {
                tokens.add(text.substring(start, start + size));
            }
        }

        return tokens;
    }
}