package com.yc.nlp.seg; import java.io.BufferedReader; import java.io.File; import java.util.ArrayList; import java.util.List; import com.yc.nlp.pojo.Result; import com.yc.nlp.pojo.WordTag; import com.yc.nlp.util.MemFile; import com.yc.nlp.util.TnT; public class Seg { private TnT segger; public Seg() { segger = new TnT(); } public void save(String fname) { this.segger.save(fname); } public void load(String fname) { this.segger.load(fname); } /** * 训练文件 * @param fileName */ public void train(String fileName) { List<List<WordTag>> wordTags = new ArrayList<List<WordTag>>(); BufferedReader br = MemFile.readFile(fileName, this); if (br != null) { wordTags = MemFile.segFile(br, wordTags); } // 加载自定义的训练文件 File extendFiles = new File("extend"); if (extendFiles.isDirectory() && extendFiles.listFiles().length > 0) { for (File file : extendFiles.listFiles()) { br = MemFile.readFile(file.getName(), this); if (br != null) { wordTags = MemFile.segFile(br, wordTags); } } } this.segger.train(wordTags); } /** * 分词 * @param sentence * @return */ public List<String> seg(String sentence) { List<String> ret = new ArrayList<String>(); try { List<String> data = new ArrayList<String>(); char[] chars = sentence.toCharArray(); for (Character ch : chars) { data.add(ch.toString()); } List<Result> results = this.segger.tag(data); StringBuilder sb = new StringBuilder(); for (Result result : results) { if (result.getCh().equals("s")) { ret.add(result.getWord()); } else if (result.getCh().equals("e")) { sb.append(result.getWord()); ret.add(sb.toString()); sb.delete(0, sb.length()); } else { sb.append(result.getWord()); } } } catch (Exception e) { e.printStackTrace(); } return ret; } public static void main(String[] args) { Seg seg = new Seg(); seg.train("data.txt");// 主要是用来放置一些简单快速的中文分词和词性标注的程序 seg.save("seg1.marshal"); System.out.println(seg.seg("这个东西真心很赞")); } }