package edu.fudan.nlp.cn.tag;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Set;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

import edu.fudan.ml.classifier.struct.inf.ConstraintViterbi;
import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.types.Dictionary;
import edu.fudan.ml.types.Instance;
import edu.fudan.nlp.cn.Chars;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.nlp.pipe.SeriesPipes;
import edu.fudan.nlp.pipe.seq.DictPOSLabel;
import edu.fudan.util.exception.LoadModelException;

/**
 * Part-of-speech tagger.
 * Segments the input first, then assigns POS tags.
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class POSTagger extends AbstractTagger {

    private DictPOSLabel dictPipe = null;
    private Pipe oldfeaturePipe = null;

    /**
     * Word segmentation model
     */
    public CWSTagger cws;

    /**
     * Constructor.
     * @param cwsmodel word segmentation model file
     * @param str POS model file
     * @throws Exception
     */
    public POSTagger(String cwsmodel, String str) throws Exception {
        super(str);
        cws = new CWSTagger(cwsmodel);
    }

    // the dictionary is also used as the segmentation dictionary
    public POSTagger(String cwsmodel, String str, Dictionary dict) throws Exception {
        super(str);
        cws = new CWSTagger(cwsmodel);
        setDictionary(dict, true);
    }

    /**
     * Constructor.
     * @param str POS model file
     * @throws LoadModelException
     */
    public POSTagger(String str) throws LoadModelException {
        super(str);
        System.out.println("只能处理分好词的句子");
    }

    /**
     * Does not load a segmentation model; can only process pre-segmented sentences.
     * @param str POS model file
     * @param dict POS dictionary
     * @throws LoadModelException
     */
    public POSTagger(String str, Dictionary dict) throws LoadModelException {
        super(str);
        setDictionary(dict, false);
    }

    /**
     * Constructor.
     * @param cws word segmentation model
     * @param str POS model file
     * @throws LoadModelException
     */
    public POSTagger(CWSTagger cws, String str) throws LoadModelException {
        super(str);
        if (cws == null)
            throw new LoadModelException("分词模型不能为空");
        this.cws = cws;
    }

    /**
     * @param cws word segmentation model
     * @param str POS model file
     * @param dict POS dictionary
     * @param isSetSegDict whether dict is also used by the cws segmenter (segmentation and
     *        POS tagging may use different dictionaries): true replaces the segmenter's
     *        current dictionary, false keeps the dictionary already set on the segmenter.
     * @throws Exception
     */
    public POSTagger(CWSTagger cws, String str, Dictionary dict, boolean isSetSegDict) throws Exception {
        super(str);
        if (cws == null)
            throw new Exception("分词模型不能为空");
        this.cws = cws;
        setDictionary(dict, isSetSegDict);
    }

    /**
     * Set the dictionary; the flag controls whether the segmentation dictionary is set as well.
     * @param dict dictionary
     * @param isSetSegDict whether to also set the segmentation dictionary
     */
    public void setDictionary(Dictionary dict, boolean isSetSegDict) {
        removeDictionary(isSetSegDict);
        if (cws != null && isSetSegDict)
            cws.setDictionary(dict);
        dictPipe = new DictPOSLabel(dict, labels);
        oldfeaturePipe = featurePipe;
        featurePipe = new SeriesPipes(new Pipe[] { dictPipe, featurePipe });
        LinearViterbi dv = new ConstraintViterbi(
                (LinearViterbi) getClassifier().getInferencer(), labels.size());
        getClassifier().setInferencer(dv);
    }

    /**
     * Remove the dictionary; the flag controls whether the segmentation dictionary is removed as well.
     */
    public void removeDictionary(boolean isRemoveSegDict) {
        if (cws != null && isRemoveSegDict)
            cws.removeDictionary();
        if (oldfeaturePipe != null) {
            featurePipe = oldfeaturePipe;
        }
        LinearViterbi dv = new LinearViterbi(
                (LinearViterbi) getClassifier().getInferencer());
        getClassifier().setInferencer(dv);
        dictPipe = null;
        oldfeaturePipe = null;
    }
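
    // Illustrative sketch (not part of the original class): how a caller might attach and
    // detach a user dictionary. Nothing here is invoked by the tagger itself, and the
    // surrounding tag* calls are elided.
    private static void dictionaryUsageSketch(POSTagger tagger, Dictionary dict) {
        // Constrain decoding with the dictionary; true also installs it on the segmenter.
        tagger.setDictionary(dict, true);
        // ... call tag()/tagSeged() while the ConstraintViterbi decoder is active ...
        // Restore the plain LinearViterbi decoder and the original feature pipe.
        tagger.removeDictionary(true);
    }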

    /**
     * Tag a raw string.
     * @param src input string
     * @return a two-row array: row 0 holds the words, row 1 the corresponding POS tags
     */
    public String[][] tag2Array(String src) {
        if (src == null || src.length() == 0)
            return null;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[] words = cws.tag2Array(src);
        if (words.length == 0)
            return null;
        String[] target = null;
        Instance inst = new Instance(words);
        doProcess(inst);
        int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
        target = labels.lookupString(pred);
        String[][] tags = new String[2][];
        tags[0] = words;
        tags[1] = target;
        return tags;
    }

    public String[][][] tag2DoubleArray(String src) {
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[][] words = cws.tag2DoubleArray(src);
        String[][][] tags = new String[words.length][2][];
        for (int i = 0; i < words.length; i++) {
            tags[i][0] = words[i];
            tags[i][1] = tagSeged(words[i]);
        }
        return tags;
    }

    @Override
    public String tag(String src) {
        if (src == null || src.length() == 0)
            return src;
        if (cws == null) {
            System.out.println("只能处理分好词的句子");
            return null;
        }
        String[] words = cws.tag2Array(src);
        if (words.length == 0)
            return src;
        Instance inst = new Instance(words);
        doProcess(inst);
        int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
        String[] target = labels.lookupString(pred);
        String res = format(words, target);
        return res;
    }

    /**
     * Convert word and POS arrays into a delimiter-separated "word/POS" sequence.
     * @param words word array
     * @param target POS array
     * @return delimiter-separated "word/POS" sequence
     */
    public String format(String[] words, String[] target) {
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < words.length; j++) {
            sb.append(words[j]);
            if (Chars.isWhiteSpace(words[j])) // do not emit a POS tag for whitespace
                continue;
            sb.append("/");
            sb.append(target[j]);
            if (j < words.length - 1)
                sb.append(delim);
        }
        String res = sb.toString();
        return res;
    }

    /**
     * Tag a pre-segmented sentence.
     * @param src word array
     * @return POS tag array
     */
    public String[] tagSeged(String[] src) {
        if (src == null || src.length == 0)
            return null;
        String[] target = null;
        try {
            Instance inst = new Instance(src);
            doProcess(inst);
            int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
            target = labels.lookupString(pred);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return target;
    }

    /**
     * Tag a pre-segmented sentence.
     * @param src word array
     * @return space-separated POS tag sequence
     */
    public String tagSeged2String(String[] src) {
        StringBuilder sb = new StringBuilder();
        String[] target = tagSeged(src);
        if (target == null)
            return null;
        for (int j = 0; j < target.length; j++) {
            sb.append(target[j]);
            if (j < target.length - 1)
                sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * Tag a pre-segmented sentence.
     * @param src word array
     * @return delimiter-separated "word/POS" sequence
     */
    public String tagSeged2StringALL(String[] src) {
        String[] target = tagSeged(src);
        if (target == null)
            return null;
        String res = format(src, target);
        return res;
    }

    /**
     * Get the set of supported POS tags.
     * @return set of POS tag labels
     */
    public Set<String> getSupportedTags() {
        return labels.toSet();
    }
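
    // Illustrative sketch (not part of the original class): how the tagging entry points
    // relate to each other. The sample sentence and word array are arbitrary examples,
    // not taken from the library's own documentation.
    private static void taggingUsageSketch(POSTagger tagger) {
        // Raw text: segmented internally by the CWSTagger, then tagged.
        String formatted = tagger.tag("这是一个测试句子");        // "word/POS word/POS ..."
        String[][] pairs = tagger.tag2Array("这是一个测试句子");  // pairs[0]: words, pairs[1]: tags
        // Pre-segmented input: bypasses the segmenter and only runs the POS model.
        String[] tags = tagger.tagSeged(new String[] { "这", "是", "一个", "测试", "句子" });
        System.out.println(formatted + "\n" + pairs.length + " rows, " + tags.length + " tags");
    }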

    public static void main(String[] args) throws Exception {
        Options opt = new Options();
        opt.addOption("h", false, "Print help for this application");
        opt.addOption("f", false, "segment file. Default string mode.");
        opt.addOption("s", false, "segment string");
        BasicParser parser = new BasicParser();
        CommandLine cl = parser.parse(opt, args);
        if (args.length == 0 || cl.hasOption('h')) {
            HelpFormatter f = new HelpFormatter();
            f.printHelp(
                    "Tagger:\n"
                            + "java edu.fudan.nlp.cn.tag.POSTagger -f cws_model_file pos_model_file input_file output_file;\n"
                            + "java edu.fudan.nlp.cn.tag.POSTagger -s cws_model_file pos_model_file string_to_segment",
                    opt);
            return;
        }
        String[] arg = cl.getArgs();
        String cws_model_file, pos_model_file;
        String input;
        String output = null;
        if (cl.hasOption("f") && arg.length == 4) {
            cws_model_file = arg[0];
            pos_model_file = arg[1];
            input = arg[2];
            output = arg[3];
        } else if (arg.length == 3) {
            cws_model_file = arg[0];
            pos_model_file = arg[1];
            input = arg[2];
        } else {
            System.err.println("parameters format error!");
            System.err.println("Print option \"-h\" for help.");
            return;
        }
        POSTagger pos = new POSTagger(cws_model_file, pos_model_file);
        if (cl.hasOption("f")) {
            String s = pos.tagFile(input);
            OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(output), "utf8");
            w.write(s);
            w.close();
        } else {
            String s = pos.tag(input);
            System.out.println(s);
        }
    }

    /**
     * Set the type of POS tag set.
     * @param lang cn: Chinese; en: English
     */
    public void SetTagType(String lang) {
        if (lang.equals("en"))
            this.labels = factory.buildLabelAlphabet("label-en");
        else if (lang.equals("cn"))
            this.labels = factory.DefaultLabelAlphabet();
    }
}
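
/*
 * Command-line usage, mirroring the help text printed by main(). The model and file names
 * below are placeholders, not files shipped with the library.
 *
 *   # tag a file and write the result to output_file (-f: file mode)
 *   java edu.fudan.nlp.cn.tag.POSTagger -f seg.m pos.m input_file output_file
 *
 *   # tag a single string and print the result to stdout (-s: string mode)
 *   java edu.fudan.nlp.cn.tag.POSTagger -s seg.m pos.m "待标注的文本"
 */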