package com.miguelfonseca.completely.text.analyze.tokenize;
import com.miguelfonseca.completely.text.analyze.Analyzer;
import java.text.BreakIterator;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import static com.miguelfonseca.completely.common.Precondition.checkPointer;
/**
 * Break text into words.
 *
 * <p>Each input string is split at the word boundaries reported by a
 * {@link BreakIterator} word instance. Only the segments whose first code
 * point is a letter or a digit are kept, so whitespace and punctuation
 * segments are discarded.
 *
 * <p>The underlying {@link BreakIterator} is stateful and is re-targeted on
 * every call to {@link #apply(Collection)}, so a single instance of this class
 * should not be used by several threads at the same time.
 */
public class WordTokenizer extends Analyzer
{
    /** Word boundary detector, reused across calls to {@link #apply(Collection)}. */
    private final BreakIterator boundary;

    /**
     * Constructs a new {@link WordTokenizer}.
     *
     * <p>The word boundary rules are those of
     * {@link BreakIterator#getWordInstance()}, i.e. of the default locale.
     */
    public WordTokenizer()
    {
        this.boundary = BreakIterator.getWordInstance();
    }

    /**
     * Splits every input string into its words.
     *
     * @param input the texts to tokenize; neither the collection nor any of its
     *              elements may be {@code null} (validated through
     *              {@code checkPointer}, which decides how a violation is reported)
     * @return the words of all input texts, in order of appearance
     */
    @Override
    public Collection<String> apply(Collection<String> input)
    {
        checkPointer(input != null);
        List<String> result = new LinkedList<>();
        for (String text : input)
        {
            checkPointer(text != null);
            boundary.setText(text);
            int start = boundary.first();
            for (
                int end = boundary.next();
                end != BreakIterator.DONE;
                start = end, end = boundary.next()
            )
            {
                // Inspect the whole code point rather than the first char: a
                // segment starting with a supplementary character begins with a
                // high surrogate, which is never classified as letter or digit.
                if (Character.isLetterOrDigit(text.codePointAt(start)))
                {
                    result.add(text.substring(start, end));
                }
            }
        }
        return result;
    }
}