package com.kennycason.kumo.nlp.tokenizer;

import org.languagetool.language.Chinese;
import org.languagetool.tokenizers.Tokenizer;

import java.util.ArrayList;
import java.util.List;

/**
 * {@link WordTokenizer} that splits a sentence using the word tokenizer
 * obtained from LanguageTool's {@link Chinese} language and removes the
 * trailing {@code /tag} suffix from each token.
 */
public class ChineseWordTokenizer implements WordTokenizer {

    /** Shared language instance from which the word tokenizer is requested on every call. */
    private static final Chinese CHINESE = new Chinese();

    /** Creates a tokenizer; no per-instance state is held. */
    public ChineseWordTokenizer() {}

    /**
     * Splits {@code sentence} into words with their part-of-speech tags removed.
     *
     * <p>Raw tokens look like {@code 政府/n, 依照/p, 法律/n, 行/ng, 使/v, 执法/vn};
     * only the text before the first {@code '/'} is kept. Tokens without a
     * {@code '/'} are passed through unchanged.
     *
     * @param sentence the text to tokenize
     * @return a new mutable list with one entry per raw token, in tokenizer order
     */
    @Override
    public List<String> tokenize(final String sentence) {
        final Tokenizer wordTokenizer = CHINESE.getWordTokenizer();
        final List<String> taggedWords = wordTokenizer.tokenize(sentence);

        final List<String> words = new ArrayList<>();
        for (final String taggedWord : taggedWords) {
            words.add(stripPartOfSpeechTag(taggedWord));
        }
        return words;
    }

    /**
     * Returns the portion of {@code taggedWord} that precedes its first {@code '/'},
     * or {@code taggedWord} itself when it contains no {@code '/'}.
     *
     * <p>A token that begins with {@code '/'} therefore yields the empty string.
     */
    private static String stripPartOfSpeechTag(final String taggedWord) {
        final int slashIndex = taggedWord.indexOf('/');
        return slashIndex < 0 ? taggedWord : taggedWord.substring(0, slashIndex);
    }
}