package edu.fudan.nlp.cn.tag;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Set;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

import edu.fudan.ml.classifier.struct.inf.ConstraintViterbi;
import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.types.Dictionary;
import edu.fudan.ml.types.Instance;
import edu.fudan.nlp.cn.Chars;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.nlp.pipe.SeriesPipes;
import edu.fudan.nlp.pipe.seq.DictPOSLabel;
import edu.fudan.util.exception.LoadModelException;

/**
 * Part-of-speech tagger.
 * Segments the input first, then assigns POS tags.
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class POSTagger extends AbstractTagger {

    private DictPOSLabel dictPipe = null;
    private Pipe oldfeaturePipe = null;

    /**
     * Word segmentation model
     */
    public CWSTagger cws;

    /**
     * Constructor.
     * @param cwsmodel word segmentation model file
     * @param str POS model file
     * @throws Exception
     */
    public POSTagger(String cwsmodel, String str) throws Exception {
        super(str);
        cws = new CWSTagger(cwsmodel);
    }

    // the dictionary is also used as the segmentation dictionary
    public POSTagger(String cwsmodel, String str, Dictionary dict) throws Exception {
        super(str);
        cws = new CWSTagger(cwsmodel);
        setDictionary(dict, true);
    }

    /**
     * Constructor.
     * @param str POS model file
     * @throws LoadModelException
     */
    public POSTagger(String str) throws LoadModelException {
        super(str);
        System.out.println("只能处理分好词的句子");
    }

    /**
     * Does not load a segmentation model; can only process pre-segmented sentences.
     * @param str POS model file
     * @param dict POS dictionary
     * @throws LoadModelException
     */
    public POSTagger(String str, Dictionary dict) throws LoadModelException {
        super(str);
        setDictionary(dict, false);
    }

    /**
     * Constructor.
     * @param cws word segmentation model
     * @param str POS model file
     * @throws LoadModelException
     */
    public POSTagger(CWSTagger cws, String str) throws LoadModelException {
        super(str);
        if (cws == null)
            throw new LoadModelException("分词模型不能为空");
        this.cws = cws;
    }

    /**
     * @param cws word segmentation model
     * @param str POS model file
     * @param dict POS dictionary
     * @param isSetSegDict whether dict is also used by the cws segmenter (segmentation and
     *        POS tagging may use different dictionaries): true replaces the segmenter's
     *        current dictionary, false keeps the dictionary already set on the segmenter.
     * @throws Exception
     */
    public POSTagger(CWSTagger cws, String str, Dictionary dict, boolean isSetSegDict) throws Exception {
        super(str);
        if (cws == null)
            throw new Exception("分词模型不能为空");
        this.cws = cws;
        setDictionary(dict, isSetSegDict);
    }

    /**
     * Set the dictionary; the flag controls whether the segmentation dictionary is set as well.
     * @param dict dictionary
     * @param isSetSegDict whether to also set the segmentation dictionary
     */
    public void setDictionary(Dictionary dict, boolean isSetSegDict) {
        removeDictionary(isSetSegDict);
        if (cws != null && isSetSegDict)
            cws.setDictionary(dict);
        dictPipe = new DictPOSLabel(dict, labels);
        oldfeaturePipe = featurePipe;
        featurePipe = new SeriesPipes(new Pipe[] { dictPipe, featurePipe });
        LinearViterbi dv = new ConstraintViterbi(
                (LinearViterbi) getClassifier().getInferencer(), labels.size());
        getClassifier().setInferencer(dv);
    }

    /**
     * Remove the dictionary; the flag controls whether the segmentation dictionary is removed as well.
     */
    public void removeDictionary(boolean isRemoveSegDict) {
        if (cws != null && isRemoveSegDict)
            cws.removeDictionary();
        if (oldfeaturePipe != null) {
            featurePipe = oldfeaturePipe;
        }
        LinearViterbi dv = new LinearViterbi(
                (LinearViterbi) getClassifier().getInferencer());
        getClassifier().setInferencer(dv);
        dictPipe = null;
        oldfeaturePipe = null;
    }
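
    // Illustrative sketch (not part of the original class): how a caller might attach and
    // detach a user dictionary. Nothing here is invoked by the tagger itself, and the
    // surrounding tag* calls are elided.
    private static void dictionaryUsageSketch(POSTagger tagger, Dictionary dict) {
        // Constrain decoding with the dictionary; true also installs it on the segmenter.
        tagger.setDictionary(dict, true);
        // ... call tag()/tagSeged() while the ConstraintViterbi decoder is active ...
        // Restore the plain LinearViterbi decoder and the original feature pipe.
        tagger.removeDictionary(true);
    }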

    /**
     * Tag a raw string.
     * @param src input string
     * @return a two-row array: row 0 holds the words, row 1 the corresponding POS tags
     */
    public String[][] tag2Array(String src) {
        if (src == null || src.length() == 0)
            return null;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[] words = cws.tag2Array(src);
        if (words.length == 0)
            return null;
        String[] target = null;
        Instance inst = new Instance(words);
        doProcess(inst);
        int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
        target = labels.lookupString(pred);
        String[][] tags = new String[2][];
        tags[0] = words;
        tags[1] = target;
        return tags;
    }

    public String[][][] tag2DoubleArray(String src) {
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[][] words = cws.tag2DoubleArray(src);
        String[][][] tags = new String[words.length][2][];
        for (int i = 0; i < words.length; i++) {
            tags[i][0] = words[i];
            tags[i][1] = tagSeged(words[i]);
        }
        return tags;
    }

    @Override
    public String tag(String src) {
        if (src == null || src.length() == 0)
            return src;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[] words = cws.tag2Array(src);
        if (words.length == 0)
            return src;
        Instance inst = new Instance(words);
        doProcess(inst);
        int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
        String[] target = labels.lookupString(pred);
        String res = format(words, target);
        return res;
    }

    /**
     * Convert word and POS arrays into a delimiter-separated "word/POS" sequence.
     * @param words word array
     * @param target POS array
     * @return delimiter-separated "word/POS" sequence
     */
    public String format(String[] words, String[] target) {
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < words.length; j++) {
            sb.append(words[j]);
            if (Chars.isWhiteSpace(words[j])) // do not emit a POS tag for whitespace
                continue;
            sb.append("/");
            sb.append(target[j]);
            if (j < words.length - 1)
                sb.append(delim);
        }
        String res = sb.toString();
        return res;
    }

    /**
     * Tag a pre-segmented sentence.
     * @param src word array
     * @return POS tag array
     */
    public String[] tagSeged(String[] src) {
        if (src == null || src.length == 0)
            return null;
        String[] target = null;
        try {
            Instance inst = new Instance(src);
            doProcess(inst);
            int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
            target = labels.lookupString(pred);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return target;
    }

    /**
     * Tag a pre-segmented sentence.
     * @param src word array
     * @return space-separated POS tag sequence
     */
    public String tagSeged2String(String[] src) {
        StringBuilder sb = new StringBuilder();
        String[] target = tagSeged(src);
        if (target == null)
            return null;
        for (int j = 0; j < target.length; j++) {
            sb.append(target[j]);
            if (j < target.length - 1)
                sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * Tag a pre-segmented sentence.
     * @param src word array
     * @return delimiter-separated "word/POS" sequence
     */
    public String tagSeged2StringALL(String[] src) {
        String[] target = tagSeged(src);
        if (target == null)
            return null;
        String res = format(src, target);
        return res;
    }

    /**
     * Get the set of supported POS tags.
     * @return set of POS tag labels
     */
    public Set<String> getSupportedTags() {
        return labels.toSet();
    }
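
    // Illustrative sketch (not part of the original class): how the tagging entry points
    // relate to each other. The sample sentence and word array are arbitrary examples,
    // not taken from the library's own documentation.
    private static void taggingUsageSketch(POSTagger tagger) {
        // Raw text: segmented internally by the CWSTagger, then tagged.
        String formatted = tagger.tag("这是一个测试句子");        // "word/POS word/POS ..."
        String[][] pairs = tagger.tag2Array("这是一个测试句子");  // pairs[0]: words, pairs[1]: tags
        // Pre-segmented input: bypasses the segmenter and only runs the POS model.
        String[] tags = tagger.tagSeged(new String[] { "这", "是", "一个", "测试", "句子" });
        System.out.println(formatted + "\n" + pairs.length + " rows, " + tags.length + " tags");
    }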

    public static void main(String[] args) throws Exception {
        Options opt = new Options();
        opt.addOption("h", false, "Print help for this application");
        opt.addOption("f", false, "segment file. Default string mode.");
        opt.addOption("s", false, "segment string");
        BasicParser parser = new BasicParser();
        CommandLine cl = parser.parse(opt, args);
        if (args.length == 0 || cl.hasOption('h')) {
            HelpFormatter f = new HelpFormatter();
            f.printHelp(
                    "Tagger:\n"
                            + "java edu.fudan.nlp.cn.tag.POSTagger -f cws_model_file pos_model_file input_file output_file;\n"
                            + "java edu.fudan.nlp.cn.tag.POSTagger -s cws_model_file pos_model_file string_to_segment",
                    opt);
            return;
        }
        String[] arg = cl.getArgs();
        String cws_model_file, pos_model_file;
        String input;
        String output = null;
        if (cl.hasOption("f") && arg.length == 4) {
            cws_model_file = arg[0];
            pos_model_file = arg[1];
            input = arg[2];
            output = arg[3];
        } else if (arg.length == 3) {
            cws_model_file = arg[0];
            pos_model_file = arg[1];
            input = arg[2];
        } else {
            System.err.println("parameters format error!");
            System.err.println("Print option \"-h\" for help.");
            return;
        }
        POSTagger pos = new POSTagger(cws_model_file, pos_model_file);
        if (cl.hasOption("f")) {
            String s = pos.tagFile(input);
            OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(output), "utf8");
            w.write(s);
            w.close();
        } else {
            String s = pos.tag(input);
            System.out.println(s);
        }
    }

    /**
     * Set the type of POS tag set.
     * @param lang cn: Chinese; en: English
     */
    public void SetTagType(String lang) {
        if (lang.equals("en"))
            this.labels = factory.buildLabelAlphabet("label-en");
        else if (lang.equals("cn"))
            this.labels = factory.DefaultLabelAlphabet();
    }
}
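
/*
 * Command-line usage, mirroring the help text printed by main(). The model and file names
 * below are placeholders, not files shipped with the library.
 *
 *   # tag a file and write the result to output_file (-f: file mode)
 *   java edu.fudan.nlp.cn.tag.POSTagger -f seg.m pos.m input_file output_file
 *
 *   # tag a single string and print the result to stdout (-s: string mode)
 *   java edu.fudan.nlp.cn.tag.POSTagger -s seg.m pos.m "待标注的文本"
 */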