package edu.fudan.nlp.cn.tag;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

import edu.fudan.ml.classifier.struct.inf.ConstraintViterbi;
import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.types.Dictionary;
import edu.fudan.ml.types.Instance;
import edu.fudan.nlp.cn.Sentenizer;
import edu.fudan.nlp.cn.tag.format.FormatCWS;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.nlp.pipe.SeriesPipes;
import edu.fudan.nlp.pipe.seq.DictLabel;
import edu.fudan.nlp.pipe.seq.String2Sequence;
import edu.fudan.util.MyCollection;
import edu.fudan.util.exception.LoadModelException;
import gnu.trove.set.hash.THashSet;

/**
 * Chinese word segmenter (中文分词器).
 *
 * Segments Chinese text into words using a trained sequence-labeling model,
 * optionally constrained by an external dictionary (via {@link ConstraintViterbi}).
 *
 * NOTE(review): {@code prePipe}, {@code featurePipe}, {@code labels},
 * {@code delim}, {@code _tag} and {@code tagFile} are inherited from
 * {@link AbstractTagger}, which is not visible in this file.
 *
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.0
 */
public class CWSTagger extends AbstractTagger {

	// Per-instance (not static): different CWSTagger instances may use different dictionaries.
	private DictLabel dictPipe = null;
	// Saved copy of the original feature pipe so the dictionary can be removed again.
	private Pipe oldfeaturePipe = null;

	/**
	 * Whether consecutive English letters are pre-grouped into single tokens.
	 */
	private boolean isEnFilter = true;

	/**
	 * Enables or disables English-word preprocessing (treating a run of
	 * English letters as one token). Rebuilds the preprocessing pipe.
	 *
	 * @param b true to group consecutive English letters into one token
	 */
	public void setEnFilter(boolean b) {
		isEnFilter = b;
		prePipe = new String2Sequence(isEnFilter);
	}

	/**
	 * Constructs a tagger that decodes with plain {@link LinearViterbi}.
	 *
	 * @param str model file name
	 * @throws LoadModelException if the model cannot be loaded
	 */
	public CWSTagger(String str) throws LoadModelException {
		super(str);
		prePipe = new String2Sequence(isEnFilter);
	}

	/**
	 * Installs a dictionary: prepends a dictionary-labeling pipe to the
	 * feature pipe and switches the decoder to {@link ConstraintViterbi}.
	 *
	 * @param dict external dictionary resource
	 */
	private void initDict(Dictionary dict) {
		dictPipe = new DictLabel(dict, labels);
		oldfeaturePipe = featurePipe;
		featurePipe = new SeriesPipes(new Pipe[] { dictPipe, featurePipe });
		LinearViterbi dv = new ConstraintViterbi(
				(LinearViterbi) getClassifier().getInferencer());
		getClassifier().setInferencer(dv);
	}

	/**
	 * Constructs a tagger that decodes with {@link ConstraintViterbi},
	 * constrained by an external dictionary.
	 *
	 * @param str model file name
	 * @param dict external dictionary resource
	 * @throws Exception if the model cannot be loaded
	 */
	public CWSTagger(String str, Dictionary dict) throws Exception {
		this(str);
		initDict(dict);
	}

	/**
	 * Sets the dictionary, replacing any previously installed one.
	 *
	 * @param dict the dictionary
	 */
	public void setDictionary(Dictionary dict) {
		removeDictionary();
		initDict(dict);
	}

	/**
	 * Sets the dictionary from a set of words. A null or empty set is ignored.
	 *
	 * @param newset words to add as a segmentation dictionary
	 */
	public void setDictionary(THashSet<String> newset) {
		// Guard against null as well as empty input; the original only checked size.
		if (newset == null || newset.size() == 0)
			return;
		ArrayList<String> al = new ArrayList<String>();
		MyCollection.TSet2List(newset, al);
		Dictionary dict = new Dictionary();
		dict.addSegDict(al);
		setDictionary(dict);
	}

	/**
	 * Removes the dictionary: restores the original feature pipe and
	 * switches the decoder back to plain {@link LinearViterbi}.
	 */
	public void removeDictionary() {
		if (oldfeaturePipe != null) {
			featurePipe = oldfeaturePipe;
		}
		LinearViterbi dv = new LinearViterbi(
				(LinearViterbi) getClassifier().getInferencer());
		getClassifier().setInferencer(dv);
		dictPipe = null;
		oldfeaturePipe = null;
	}

	/**
	 * Segments a string, splitting it into sentences first.
	 *
	 * @param src input text; returned unchanged if null or empty
	 * @return segmented text, sentences joined by the inherited delimiter
	 */
	@Override
	public String tag(String src) {
		if (src == null || src.length() == 0)
			return src;
		String[] sents = Sentenizer.split(src);
		// StringBuilder instead of String += in a loop (was accidentally O(n^2)).
		StringBuilder tag = new StringBuilder();
		try {
			for (int i = 0; i < sents.length; i++) {
				Instance inst = new Instance(sents[i]);
				String[] preds = _tag(inst);
				tag.append(FormatCWS.toString(inst, preds, delim));
				if (i < sents.length - 1)
					tag.append(delim);
			}
		} catch (Exception e) {
			// Best-effort: keep whatever was segmented so far, matching original behavior.
			e.printStackTrace();
		}
		return tag.toString();
	}

	/**
	 * Splits the input into sentences and segments each one.
	 *
	 * @param src input text
	 * @return one word array per sentence, or null for null/empty input
	 */
	public String[][] tag2DoubleArray(String src) {
		if (src == null || src.length() == 0)
			return null;
		String[] sents = Sentenizer.split(src);
		String[][] words = new String[sents.length][];
		for (int i = 0; i < sents.length; i++) {
			words[i] = tag2Array(sents[i]);
		}
		return words;
	}

	/**
	 * Segments a string without sentence splitting.
	 *
	 * @param src input text
	 * @return list of words, or null for null/empty input or on tagging failure
	 */
	public ArrayList<String> tag2List(String src) {
		if (src == null || src.length() == 0)
			return null;
		ArrayList<String> res = null;
		try {
			Instance inst = new Instance(src);
			String[] preds = _tag(inst);
			res = FormatCWS.toList(inst, preds);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return res;
	}

	/**
	 * Segments a string without sentence splitting.
	 *
	 * @param src input text
	 * @return array of words, or null for null/empty input or on tagging failure
	 */
	public String[] tag2Array(String src) {
		ArrayList<String> words = tag2List(src);
		// tag2List may return null (null/empty input, or an internal tagging
		// exception); the original code NPE'd here in that case.
		if (words == null)
			return null;
		return words.toArray(new String[words.size()]);
	}

	/**
	 * Command-line entry point.
	 *
	 * Usage:
	 *   -f model_file input_file output_file  (segment a file)
	 *   -s model_file string                  (segment a string)
	 *
	 * @param args command-line arguments
	 * @throws Exception on model-loading or I/O failure
	 */
	public static void main(String[] args) throws Exception {
		Options opt = new Options();
		opt.addOption("h", false, "Print help for this application");
		opt.addOption("f", false, "segment file. Default string mode.");
		opt.addOption("s", false, "segment string");
		BasicParser parser = new BasicParser();
		CommandLine cl = parser.parse(opt, args);
		if (args.length == 0 || cl.hasOption('h')) {
			HelpFormatter f = new HelpFormatter();
			f.printHelp(
					"SEG:\n"
							+ "java edu.fudan.nlp.tag.CWSTagger -f model_file input_file output_file;\n"
							+ "java edu.fudan.nlp.tag.CWSTagger -s model_file string_to_segement",
					opt);
			return;
		}
		String[] arg = cl.getArgs();
		String modelFile;
		String input;
		String output = null;
		if (cl.hasOption("f") && arg.length == 3) {
			modelFile = arg[0];
			input = arg[1];
			output = arg[2];
		} else if (arg.length == 2) {
			modelFile = arg[0];
			input = arg[1];
		} else {
			System.err.println("parameters format error!");
			System.err.println("Print option \"-h\" for help.");
			return;
		}
		CWSTagger seg = new CWSTagger(modelFile);
		if (cl.hasOption("f")) {
			String s = seg.tagFile(input);
			OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(
					output), "utf8");
			// try/finally so the writer is closed even if write() throws
			// (file is pre-Java-7 style, so no try-with-resources).
			try {
				w.write(s);
			} finally {
				w.close();
			}
		} else {
			String s = seg.tag(input);
			System.out.println(s);
		}
	}
}