package edu.fudan.nlp.cn.tag;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

import edu.fudan.ml.classifier.struct.inf.ConstraintViterbi;
import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.types.Dictionary;
import edu.fudan.ml.types.Instance;
import edu.fudan.nlp.cn.Sentenizer;
import edu.fudan.nlp.cn.tag.format.FormatCWS;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.nlp.pipe.SeriesPipes;
import edu.fudan.nlp.pipe.seq.DictLabel;
import edu.fudan.nlp.pipe.seq.String2Sequence;
import edu.fudan.util.MyCollection;
import edu.fudan.util.exception.LoadModelException;
import gnu.trove.set.hash.THashSet;

/**
 * Chinese word segmenter (中文分词器).
 *
 * Segments Chinese text into words using a trained sequence-labeling model,
 * optionally constrained by an external dictionary (via {@link ConstraintViterbi}).
 *
 * NOTE(review): {@code prePipe}, {@code featurePipe}, {@code labels},
 * {@code delim}, {@code _tag} and {@code tagFile} are inherited from
 * {@link AbstractTagger}, which is not visible in this file.
 *
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.0
 */
public class CWSTagger extends AbstractTagger {

	// Per-instance (not static): different CWSTagger instances may use different dictionaries.
	private DictLabel dictPipe = null;
	// Saved copy of the original feature pipe so the dictionary can be removed again.
	private Pipe oldfeaturePipe = null;

	/**
	 * Whether consecutive English letters are pre-grouped into single tokens.
	 */
	private boolean isEnFilter = true;

	/**
	 * Enables or disables English-word preprocessing (treating a run of
	 * English letters as one token). Rebuilds the preprocessing pipe.
	 *
	 * @param b true to group consecutive English letters into one token
	 */
	public void setEnFilter(boolean b) {
		isEnFilter = b;
		prePipe = new String2Sequence(isEnFilter);
	}

	/**
	 * Constructs a tagger that decodes with plain {@link LinearViterbi}.
	 *
	 * @param str model file name
	 * @throws LoadModelException if the model cannot be loaded
	 */
	public CWSTagger(String str) throws LoadModelException {
		super(str);
		prePipe = new String2Sequence(isEnFilter);
	}

	/**
	 * Installs a dictionary: prepends a dictionary-labeling pipe to the
	 * feature pipe and switches the decoder to {@link ConstraintViterbi}.
	 *
	 * @param dict external dictionary resource
	 */
	private void initDict(Dictionary dict) {
		dictPipe = new DictLabel(dict, labels);
		oldfeaturePipe = featurePipe;
		featurePipe = new SeriesPipes(new Pipe[] { dictPipe, featurePipe });
		LinearViterbi dv = new ConstraintViterbi(
				(LinearViterbi) getClassifier().getInferencer());
		getClassifier().setInferencer(dv);
	}

	/**
	 * Constructs a tagger that decodes with {@link ConstraintViterbi},
	 * constrained by an external dictionary.
	 *
	 * @param str model file name
	 * @param dict external dictionary resource
	 * @throws Exception if the model cannot be loaded
	 */
	public CWSTagger(String str, Dictionary dict) throws Exception {
		this(str);
		initDict(dict);
	}

	/**
	 * Sets the dictionary, replacing any previously installed one.
	 *
	 * @param dict the dictionary
	 */
	public void setDictionary(Dictionary dict) {
		removeDictionary();
		initDict(dict);
	}

	/**
	 * Sets the dictionary from a set of words. A null or empty set is ignored.
	 *
	 * @param newset words to add as a segmentation dictionary
	 */
	public void setDictionary(THashSet<String> newset) {
		// Guard against null as well as empty input; the original only checked size.
		if (newset == null || newset.size() == 0)
			return;
		ArrayList<String> al = new ArrayList<String>();
		MyCollection.TSet2List(newset, al);
		Dictionary dict = new Dictionary();
		dict.addSegDict(al);
		setDictionary(dict);
	}

	/**
	 * Removes the dictionary: restores the original feature pipe and
	 * switches the decoder back to plain {@link LinearViterbi}.
	 */
	public void removeDictionary() {
		if (oldfeaturePipe != null) {
			featurePipe = oldfeaturePipe;
		}
		LinearViterbi dv = new LinearViterbi(
				(LinearViterbi) getClassifier().getInferencer());
		getClassifier().setInferencer(dv);
		dictPipe = null;
		oldfeaturePipe = null;
	}

	/**
	 * Segments a string, splitting it into sentences first.
	 *
	 * @param src input text; returned unchanged if null or empty
	 * @return segmented text, sentences joined by the inherited delimiter
	 */
	@Override
	public String tag(String src) {
		if (src == null || src.length() == 0)
			return src;
		String[] sents = Sentenizer.split(src);
		// StringBuilder instead of String += in a loop (was accidentally O(n^2)).
		StringBuilder tag = new StringBuilder();
		try {
			for (int i = 0; i < sents.length; i++) {
				Instance inst = new Instance(sents[i]);
				String[] preds = _tag(inst);
				tag.append(FormatCWS.toString(inst, preds, delim));
				if (i < sents.length - 1)
					tag.append(delim);
			}
		} catch (Exception e) {
			// Best-effort: keep whatever was segmented so far, matching original behavior.
			e.printStackTrace();
		}
		return tag.toString();
	}

	/**
	 * Splits the input into sentences and segments each one.
	 *
	 * @param src input text
	 * @return one word array per sentence, or null for null/empty input
	 */
	public String[][] tag2DoubleArray(String src) {
		if (src == null || src.length() == 0)
			return null;
		String[] sents = Sentenizer.split(src);
		String[][] words = new String[sents.length][];
		for (int i = 0; i < sents.length; i++) {
			words[i] = tag2Array(sents[i]);
		}
		return words;
	}

	/**
	 * Segments a string without sentence splitting.
	 *
	 * @param src input text
	 * @return list of words, or null for null/empty input or on tagging failure
	 */
	public ArrayList<String> tag2List(String src) {
		if (src == null || src.length() == 0)
			return null;
		ArrayList<String> res = null;
		try {
			Instance inst = new Instance(src);
			String[] preds = _tag(inst);
			res = FormatCWS.toList(inst, preds);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return res;
	}

	/**
	 * Segments a string without sentence splitting.
	 *
	 * @param src input text
	 * @return array of words, or null for null/empty input or on tagging failure
	 */
	public String[] tag2Array(String src) {
		ArrayList<String> words = tag2List(src);
		// tag2List may return null (null/empty input, or an internal tagging
		// exception); the original code NPE'd here in that case.
		if (words == null)
			return null;
		return words.toArray(new String[words.size()]);
	}

	/**
	 * Command-line entry point.
	 *
	 * Usage:
	 *   -f model_file input_file output_file  (segment a file)
	 *   -s model_file string                  (segment a string)
	 *
	 * @param args command-line arguments
	 * @throws Exception on model-loading or I/O failure
	 */
	public static void main(String[] args) throws Exception {
		Options opt = new Options();
		opt.addOption("h", false, "Print help for this application");
		opt.addOption("f", false, "segment file. Default string mode.");
		opt.addOption("s", false, "segment string");
		BasicParser parser = new BasicParser();
		CommandLine cl = parser.parse(opt, args);
		if (args.length == 0 || cl.hasOption('h')) {
			HelpFormatter f = new HelpFormatter();
			f.printHelp(
					"SEG:\n"
							+ "java edu.fudan.nlp.tag.CWSTagger -f model_file input_file output_file;\n"
							+ "java edu.fudan.nlp.tag.CWSTagger -s model_file string_to_segement",
					opt);
			return;
		}
		String[] arg = cl.getArgs();
		String modelFile;
		String input;
		String output = null;
		if (cl.hasOption("f") && arg.length == 3) {
			modelFile = arg[0];
			input = arg[1];
			output = arg[2];
		} else if (arg.length == 2) {
			modelFile = arg[0];
			input = arg[1];
		} else {
			System.err.println("parameters format error!");
			System.err.println("Print option \"-h\" for help.");
			return;
		}
		CWSTagger seg = new CWSTagger(modelFile);
		if (cl.hasOption("f")) {
			String s = seg.tagFile(input);
			OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(
					output), "utf8");
			// try/finally so the writer is closed even if write() throws
			// (file is pre-Java-7 style, so no try-with-resources).
			try {
				w.write(s);
			} finally {
				w.close();
			}
		} else {
			String s = seg.tag(input);
			System.out.println(s);
		}
	}
}