package edu.fudan.nlp.cn.tag;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Set;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import edu.fudan.ml.classifier.struct.inf.ConstraintViterbi;
import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.types.Dictionary;
import edu.fudan.ml.types.Instance;
import edu.fudan.nlp.cn.Chars;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.nlp.pipe.SeriesPipes;
import edu.fudan.nlp.pipe.seq.DictPOSLabel;
import edu.fudan.util.exception.LoadModelException;
/**
 * Part-of-speech tagger.
 *
 * <p>When constructed with a word-segmentation model it first segments raw
 * text with {@link CWSTagger} and then assigns a POS tag to every word.
 * When constructed without one, it can only process sentences that are
 * already segmented (use the {@code tagSeged*} methods).
 *
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class POSTagger extends AbstractTagger {

    /** Dictionary-constraint labeling pipe; non-null only while a dictionary is set. */
    private DictPOSLabel dictPipe = null;

    /** Backup of the plain feature pipe so {@link #removeDictionary(boolean)} can restore it. */
    private Pipe oldfeaturePipe = null;

    /**
     * Word-segmentation model; null when this tagger can only handle
     * pre-segmented input.
     */
    public CWSTagger cws;

    /**
     * Constructor.
     * @param cwsmodel path of the word-segmentation model file
     * @param str path of the POS model file
     * @throws Exception if either model fails to load
     */
    public POSTagger(String cwsmodel, String str) throws Exception {
        super(str);
        cws = new CWSTagger(cwsmodel);
    }

    /**
     * Constructor. The given dictionary is also installed as the
     * segmentation dictionary.
     * @param cwsmodel path of the word-segmentation model file
     * @param str path of the POS model file
     * @param dict dictionary shared by segmentation and POS tagging
     * @throws Exception if either model fails to load
     */
    public POSTagger(String cwsmodel, String str, Dictionary dict) throws Exception {
        super(str);
        cws = new CWSTagger(cwsmodel);
        setDictionary(dict, true);
    }

    /**
     * Constructor without a segmentation model; the tagger can only process
     * sentences that are already segmented.
     * @param str path of the POS model file
     * @throws LoadModelException if the model fails to load
     */
    public POSTagger(String str) throws LoadModelException {
        super(str);
        System.out.println("只能处理分好词的句子");
    }

    /**
     * Constructor without a segmentation model; the tagger can only process
     * sentences that are already segmented. Installs a POS dictionary.
     * @param str path of the POS model file
     * @param dict POS dictionary
     * @throws LoadModelException if the model fails to load
     */
    public POSTagger(String str, Dictionary dict) throws LoadModelException {
        super(str);
        setDictionary(dict, false);
    }

    /**
     * Constructor reusing an existing segmentation model.
     * @param cws word-segmentation model, must not be null
     * @param str path of the POS model file
     * @throws LoadModelException if the POS model fails to load or {@code cws} is null
     */
    public POSTagger(CWSTagger cws, String str) throws LoadModelException {
        super(str);
        if (cws == null)
            throw new LoadModelException("分词模型不能为空");
        this.cws = cws;
    }

    /**
     * Constructor reusing an existing segmentation model and installing a dictionary.
     * @param cws word-segmentation model, must not be null
     * @param str path of the POS model file
     * @param dict POS dictionary
     * @param isSetSegDict whether {@code dict} should also replace the segmentation
     *        dictionary (segmentation and POS tagging may use different dictionaries);
     *        true replaces the previous segmentation dictionary, false keeps it
     * @throws Exception if the POS model fails to load or {@code cws} is null
     */
    public POSTagger(CWSTagger cws, String str, Dictionary dict, boolean isSetSegDict) throws Exception {
        super(str);
        if (cws == null)
            throw new Exception("分词模型不能为空");
        this.cws = cws;
        setDictionary(dict, isSetSegDict);
    }

    /**
     * Installs a POS dictionary; the flag decides whether the segmentation
     * dictionary is replaced as well. Wraps the current inferencer in a
     * {@link ConstraintViterbi} so decoding honors the dictionary constraints.
     * @param dict POS dictionary
     * @param isSetSegDict whether to also set the segmentation dictionary
     */
    public void setDictionary(Dictionary dict, boolean isSetSegDict) {
        // Drop any previously installed dictionary first so pipes don't stack.
        removeDictionary(isSetSegDict);
        if (cws != null && isSetSegDict)
            cws.setDictionary(dict);
        dictPipe = new DictPOSLabel(dict, labels);
        oldfeaturePipe = featurePipe;
        featurePipe = new SeriesPipes(new Pipe[] { dictPipe, featurePipe });
        LinearViterbi dv = new ConstraintViterbi(
                (LinearViterbi) getClassifier().getInferencer(), labels.size());
        getClassifier().setInferencer(dv);
    }

    /**
     * Removes the POS dictionary; the flag decides whether the segmentation
     * dictionary is removed as well. Restores the plain feature pipe and
     * replaces the inferencer with an unconstrained {@link LinearViterbi}.
     * @param isRemoveSegDict whether to also remove the segmentation dictionary
     */
    public void removeDictionary(boolean isRemoveSegDict) {
        if (cws != null && isRemoveSegDict)
            cws.removeDictionary();
        if (oldfeaturePipe != null) {
            featurePipe = oldfeaturePipe;
        }
        LinearViterbi dv = new LinearViterbi(
                (LinearViterbi) getClassifier().getInferencer());
        getClassifier().setInferencer(dv);
        dictPipe = null;
        oldfeaturePipe = null;
    }

    /**
     * Segments and tags a raw string.
     * @param src input string
     * @return a 2-row array: row 0 the words, row 1 the corresponding POS tags;
     *         null if the input is empty, no segmentation model is available,
     *         or segmentation yields no words
     */
    public String[][] tag2Array(String src) {
        if (src == null || src.length() == 0)
            return null;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[] words = cws.tag2Array(src);
        if (words.length == 0)
            return null;
        Instance inst = new Instance(words);
        doProcess(inst);
        int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
        String[] target = labels.lookupString(pred);
        String[][] tags = new String[2][];
        tags[0] = words;
        tags[1] = target;
        return tags;
    }

    /**
     * Segments and tags a raw string sentence by sentence.
     * @param src input string
     * @return per sentence a 2-row array of words and POS tags; null if the
     *         input is empty or no segmentation model is available
     */
    public String[][][] tag2DoubleArray(String src) {
        // Guard added for consistency with tag2Array/tag: avoids passing
        // null/empty input into the segmenter.
        if (src == null || src.length() == 0)
            return null;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[][] words = cws.tag2DoubleArray(src);
        String[][][] tags = new String[words.length][2][];
        for (int i = 0; i < words.length; i++) {
            tags[i][0] = words[i];
            tags[i][1] = tagSeged(words[i]);
        }
        return tags;
    }

    /**
     * Segments and tags a raw string.
     * @param src input string
     * @return "word/tag" pairs separated by {@code delim}; the input itself if
     *         it is null/empty or segmentation yields no words; null if no
     *         segmentation model is available
     */
    @Override
    public String tag(String src) {
        if (src == null || src.length() == 0)
            return src;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[] words = cws.tag2Array(src);
        if (words.length == 0)
            return src;
        Instance inst = new Instance(words);
        doProcess(inst);
        int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
        String[] target = labels.lookupString(pred);
        return format(words, target);
    }

    /**
     * Formats parallel word/tag arrays as a delimiter-separated "word/tag" sequence.
     * @param words word array
     * @param target POS tag array, parallel to {@code words}
     * @return the formatted "word/tag" sequence
     */
    public String format(String[] words, String[] target) {
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < words.length; j++) {
            sb.append(words[j]);
            if (Chars.isWhiteSpace(words[j])) // whitespace tokens get no tag
                continue;
            sb.append("/");
            sb.append(target[j]);
            if (j < words.length - 1)
                sb.append(delim);
        }
        return sb.toString();
    }

    /**
     * Tags an already segmented sentence.
     * @param src word array
     * @return POS tag array parallel to {@code src}; null on empty input or
     *         classification failure
     */
    public String[] tagSeged(String[] src) {
        if (src == null || src.length == 0)
            return null;
        String[] target = null;
        try {
            Instance inst = new Instance(src);
            doProcess(inst);
            int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
            target = labels.lookupString(pred);
        } catch (Exception e) {
            // Best-effort: report and fall through to the null return.
            e.printStackTrace();
        }
        return target;
    }

    /**
     * Tags an already segmented sentence.
     * @param src word array
     * @return space-separated POS tag sequence; null if tagging failed
     */
    public String tagSeged2String(String[] src) {
        String[] target = tagSeged(src);
        if (target == null)
            return null;
        // Build the string only after tagging succeeded.
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < target.length; j++) {
            sb.append(target[j]);
            if (j < target.length - 1)
                sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * Tags an already segmented sentence.
     * @param src word array
     * @return delimiter-separated "word/tag" sequence; null if tagging failed
     */
    public String tagSeged2StringALL(String[] src) {
        String[] target = tagSeged(src);
        if (target == null)
            return null;
        return format(src, target);
    }

    /**
     * Returns the set of supported POS tags.
     * @return POS tag set
     */
    public Set<String> getSupportedTags() {
        return labels.toSet();
    }

    /**
     * Command-line entry point.
     * <pre>
     * -f cws_model_file pos_model_file input_file output_file  (file mode)
     * -s cws_model_file pos_model_file string_to_tag            (string mode)
     * </pre>
     */
    public static void main(String[] args) throws Exception {
        Options opt = new Options();
        opt.addOption("h", false, "Print help for this application");
        opt.addOption("f", false, "segment file. Default string mode.");
        opt.addOption("s", false, "segment string");
        BasicParser parser = new BasicParser();
        CommandLine cl = parser.parse(opt, args);
        if (args.length == 0 || cl.hasOption('h')) {
            HelpFormatter f = new HelpFormatter();
            f.printHelp(
                    "Tagger:\n"
                            + "java edu.fudan.nlp.tag.POSTagger -f cws_model_file pos_model_file input_file output_file;\n"
                            + "java edu.fudan.nlp.tag.POSTagger -s cws_model_file pos_model_file string_to_segement",
                    opt);
            return;
        }
        String[] arg = cl.getArgs();
        String cws_model_file, pos_model_file;
        String input;
        String output = null;
        if (cl.hasOption("f") && arg.length == 4) {
            cws_model_file = arg[0];
            pos_model_file = arg[1];
            input = arg[2];
            output = arg[3];
        } else if (arg.length == 3) {
            cws_model_file = arg[0];
            pos_model_file = arg[1];
            input = arg[2];
        } else {
            System.err.println("paramenters format error!");
            System.err.println("Print option \"-h\" for help.");
            return;
        }
        POSTagger pos = new POSTagger(cws_model_file, pos_model_file);
        if (cl.hasOption("f")) {
            String s = pos.tagFile(input);
            OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(
                    output), "utf8");
            try {
                w.write(s);
            } finally {
                // Close even if write() throws, so the file handle is released.
                w.close();
            }
        } else {
            String s = pos.tag(input);
            System.out.println(s);
        }
    }

    /**
     * Selects the POS tag set by language.
     * @param lang "cn" for Chinese, "en" for English; any other value
     *        (including null) leaves the tag set unchanged
     */
    public void SetTagType(String lang) {
        // Constant-first equals avoids an NPE when lang is null.
        if ("en".equals(lang))
            this.labels = factory.buildLabelAlphabet("label-en");
        else if ("cn".equals(lang))
            this.labels = factory.DefaultLabelAlphabet();
    }
}