package com.kennycason.kumo.nlp.tokenizer;

import org.languagetool.language.English;
import org.languagetool.tokenizers.Tokenizer;

import java.util.ArrayList;
import java.util.List;

/**
 * {@link WordTokenizer} that delegates to the word tokenizer supplied by
 * LanguageTool's {@link English} language definition.
 *
 * <p>Every raw token is cut at its first {@code '/'}; tokens that contain no
 * {@code '/'} are returned unchanged.
 */
public class EnglishWordTokenizer implements WordTokenizer {

    /** Shared language definition; created once and reused by every instance. */
    private static final English ENGLISH = new English();

    /** Separator after which the remainder of a raw token is discarded. */
    private static final char SUFFIX_SEPARATOR = '/';

    public EnglishWordTokenizer() {}

    /**
     * Splits {@code sentence} into tokens.
     *
     * @param sentence the text to tokenize
     * @return a new mutable list with one entry per raw token, each truncated
     *         at its first {@code '/'} when one is present
     */
    @Override
    public List<String> tokenize(final String sentence) {
        final Tokenizer tokenizer = ENGLISH.getWordTokenizer();
        final List<String> rawTokens = tokenizer.tokenize(sentence);

        final List<String> tokens = new ArrayList<>(rawTokens.size());
        for (final String rawToken : rawTokens) {
            tokens.add(stripSuffix(rawToken));
        }
        return tokens;
    }

    /**
     * Returns the part of {@code rawToken} that precedes its first {@code '/'}.
     *
     * <p>A token without a {@code '/'} is returned as-is. Previously such a token
     * caused {@code substring(0, -1)} to throw
     * {@link StringIndexOutOfBoundsException}. A token that starts with
     * {@code '/'} still yields the empty string.
     * NOTE(review): presumably the part after '/' is an annotation appended by
     * some tokenizers — confirm whether the English tokenizer ever emits one.
     */
    private static String stripSuffix(final String rawToken) {
        final int separatorIndex = rawToken.indexOf(SUFFIX_SEPARATOR);
        // indexOf returns -1 when the separator is absent; keep the whole token then.
        return separatorIndex < 0 ? rawToken : rawToken.substring(0, separatorIndex);
    }
}