package com.yc.nlp.seg;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Entry point for word segmentation of mixed Chinese / non-Chinese text.
 *
 * <p>On construction a {@link Seg} instance is created and its model is loaded from
 * {@code com/yc/nlp/seg/seg.marshal}. {@link #seg(String)} then cuts the input into
 * runs of Chinese characters (code points U+4E00 to U+9FA5), which are handed to the
 * segmenter, and everything else, which is simply split on whitespace.
 */
public class InitSeg {
    private static final Logger logger = LoggerFactory.getLogger(InitSeg.class);

    /** Matches one maximal run of Chinese characters (U+4E00 to U+9FA5). */
    private static final Pattern CHINESE_RUN = Pattern.compile("[\u4E00-\u9FA5]+");

    /** Matches one run of whitespace; used to tokenize non-Chinese text. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private final Seg segger;

    /**
     * Creates the underlying segmenter and loads its model from
     * {@code com/yc/nlp/seg/seg.marshal}.
     */
    public InitSeg() {
        logger.debug("initialize initseg begin...");
        segger = new Seg();
        segger.load("com/yc/nlp/seg/seg.marshal");
        logger.debug("initialize initseg end...");
    }

    /**
     * Segments a piece of text that may mix Chinese and non-Chinese content.
     *
     * <p>Each maximal run of Chinese characters is passed to {@link #singleSeg(String)};
     * the text between such runs is trimmed and split on whitespace, and every
     * non-empty token is emitted as a word. Words appear in input order.
     *
     * @param sent the text to segment; {@code null} is treated as empty input
     * @return the list of words, never {@code null}
     */
    public List<String> seg(String sent) {
        List<String> words = new ArrayList<String>();
        if (sent == null) {
            return words;
        }
        // Pattern.split would throw the Chinese runs away (Java drops delimiters), so
        // walk the matches explicitly and handle the gaps between them ourselves.
        Matcher chinese = CHINESE_RUN.matcher(sent);
        int cursor = 0;
        while (chinese.find()) {
            addPlainWords(sent.substring(cursor, chinese.start()), words);
            words.addAll(singleSeg(chinese.group()));
            cursor = chinese.end();
        }
        addPlainWords(sent.substring(cursor), words);
        return words;
    }

    /**
     * Splits non-Chinese text on whitespace and appends every non-empty token to
     * {@code out}.
     */
    private static void addPlainWords(String text, List<String> out) {
        String trimmed = text.trim();
        if (trimmed.length() == 0) {
            return;
        }
        for (String token : WHITESPACE.split(trimmed)) {
            String word = token.trim();
            if (word.length() > 0) {
                out.add(word);
            }
        }
    }

    /**
     * Segments a single sentence by delegating directly to the underlying
     * {@link Seg} instance.
     *
     * @param sent the sentence to segment
     * @return the words returned by the segmenter
     */
    public List<String> singleSeg(String sent) {
        return segger.seg(sent);
    }

    /** Intentionally empty; kept as a placeholder entry point. */
    public static void main(String[] args) {
    }
}