package wordcloud.nlp.tokenizer;
import org.languagetool.language.Chinese;
import org.languagetool.tokenizers.Tokenizer;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link WordTokenizer} that delegates to the word tokenizer obtained from
 * LanguageTool's {@link Chinese} language and removes the trailing
 * {@code /tag} (part-of-speech) suffix from every token it produces.
 */
public class ChineseWordTokenizer implements WordTokenizer {

    /** Shared language instance from which the underlying tokenizer is obtained. */
    private static final Chinese CHINESE = new Chinese();

    /** Character separating the word from its part-of-speech tag in a raw token. */
    private static final char TAG_SEPARATOR = '/';

    public ChineseWordTokenizer() {}

    /**
     * Splits the sentence into words using the LanguageTool Chinese tokenizer
     * and strips the part-of-speech tag from each resulting token.
     *
     * @param sentence the text to tokenize
     * @return a new mutable list of tokens with their tags removed, in the
     *         order returned by the underlying tokenizer
     */
    @Override
    public List<String> tokenize(String sentence) {
        final Tokenizer tokenizer = CHINESE.getWordTokenizer();
        final List<String> rawTokens = tokenizer.tokenize(sentence);
        final List<String> tokens = new ArrayList<>(rawTokens.size());
        for (String rawToken : rawTokens) {
            tokens.add(stripTag(rawToken));
        }
        return tokens;
    }

    /**
     * Returns the part of {@code rawToken} before the first tag separator.
     * A token without a separator is returned unchanged; previously such a
     * token caused a {@link StringIndexOutOfBoundsException} because
     * {@code indexOf} yields -1.
     */
    private static String stripTag(String rawToken) {
        final int separatorIndex = rawToken.indexOf(TAG_SEPARATOR);
        if (separatorIndex < 0) {
            // No tag attached: keep the token as-is instead of failing the whole sentence.
            return rawToken;
        }
        return rawToken.substring(0, separatorIndex);
    }
}