package edu.fudan.nlp.tag;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import edu.fudan.data.reader.SequenceReader;
import edu.fudan.ml.classifier.linear.Linear;
import edu.fudan.ml.classifier.linear.OnlineTrainer;
import edu.fudan.ml.classifier.linear.inf.Inferencer;
import edu.fudan.ml.classifier.linear.update.Update;
import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.classifier.struct.inf.HigherOrderViterbi;
import edu.fudan.ml.classifier.struct.update.LinearViterbiPAUpdate;
import edu.fudan.ml.classifier.struct.update.HigherOrderViterbiPAUpdate;
import edu.fudan.ml.loss.Loss;
import edu.fudan.ml.loss.struct.HammingLoss;
import edu.fudan.ml.types.Instance;
import edu.fudan.ml.types.InstanceSet;
import edu.fudan.ml.types.alphabet.AlphabetFactory;
import edu.fudan.ml.types.alphabet.IFeatureAlphabet;
import edu.fudan.ml.types.alphabet.LabelAlphabet;
import edu.fudan.nlp.cn.tag.format.SimpleFormatter;
import edu.fudan.nlp.pipe.Pipe;
import edu.fudan.nlp.pipe.SeriesPipes;
import edu.fudan.nlp.pipe.Target2Label;
import edu.fudan.nlp.pipe.seq.AddCharRange;
import edu.fudan.nlp.pipe.seq.Sequence2FeatureSequence;
import edu.fudan.nlp.pipe.seq.SplitDataAndTarget;
import edu.fudan.nlp.pipe.seq.templet.TempletGroup;
import edu.fudan.util.MyStrings;
/**
 * Training and testing program for the sequence labeler
 * (序列标注器训练和测试程序).
 *
 * @author xpqiu
 *
 */
public class Tagger {
	/** Trained linear model; null until training or {@link #loadFrom(String)}. */
	protected Linear cl;
	/** Path of the training data file. */
	protected String train;
	/** Path of the test data file, or null when no test is requested. */
	protected String testfile = null;
	/** Path of the prediction output file, or null to skip writing results. */
	protected String output = null;
	/** Path of the feature template file. */
	protected String templateFile;
	/** true: first-order (linear-chain) Viterbi; false: higher-order Viterbi. */
	public static boolean standard = true;
	/** Path of the model file (input for testing/retraining, output for training). */
	protected String model;
	/** Number of online-training iterations. */
	protected int iterNum;
	/** Aggressiveness parameter C of the PA algorithm. */
	protected float c;
	/** Whether to use Hamming loss as the margin threshold (set by "-margin"). */
	protected boolean useLoss = true;
	/** Token delimiter pattern for input files. */
	protected String delimiter = "\\s+|\\t+";
	/** Whether to save interim model files during training (set by "-interim"). */
	protected boolean interim = false;
	/** Factory shared by the label and feature alphabets. */
	protected AlphabetFactory factory;
	/** Pipe that converts a token sequence into a feature sequence. */
	protected Pipe featurePipe;
	/** Feature templates. */
	protected TempletGroup templets;
	/** Path of the new model file produced by incremental retraining. */
	protected String newmodel;
	/** Whether the test file contains gold labels (set by "-haslabel"). */
	protected boolean hasLabel;

	public Tagger() {
	}

	/**
	 * Sets the template, training-data and model file paths.
	 *
	 * @param templateFile feature template file
	 * @param train        training data file
	 * @param model        model file
	 */
	public void setFile(String templateFile, String train, String model) {
		this.templateFile = templateFile;
		this.train = train;
		this.model = model;
	}

	/**
	 * Command-line entry point for sequence-labeler training and testing.
	 * <p>
	 * Train: java -classpath fudannlp.jar edu.fudan.nlp.tag.Tagger -train template train model
	 * <br>
	 * Test: java -classpath fudannlp.jar edu.fudan.nlp.tag.Tagger [-haslabel] model test [result]
	 *
	 * @param args command-line arguments; run with "-h" for details
	 * @throws Exception on I/O failure or training/testing error
	 */
	public static void main(String[] args) throws Exception {
		Options opt = new Options();
		opt.addOption("h", false, "Print help for this application");
		opt.addOption("iter", true, "iterative num, default 50");
		opt.addOption("c", true, "parameters C in PA algorithm, default 0.8");
		opt.addOption("train", false,
				"switch to training mode (Default: test model)");
		opt.addOption("retrain", false,
				"switch to retraining mode (Default: test model)");
		opt.addOption("margin", false, "use hamming loss as margin threshold");
		opt.addOption("interim", false, "save interim model file");
		opt.addOption("haslabel", false, "test file includes labels or not");
		BasicParser parser = new BasicParser();
		CommandLine cl;
		try {
			cl = parser.parse(opt, args);
		} catch (Exception e) {
			System.err.println("Parameters format error");
			return;
		}
		if (args.length == 0 || cl.hasOption('h')) {
			HelpFormatter f = new HelpFormatter();
			f.printHelp(
					"Tagger:\n"
							+ "tagger [option] -train templet_file train_file model_file [test_file];\n"
							+ "tagger [option] -retrain train_file model_file newmodel_file [test_file];\n"
							+ "tagger [option] -label model_file test_file output_file\n",
					opt);
			return;
		}
		Tagger tagger = new Tagger();
		tagger.iterNum = Integer.parseInt(cl.getOptionValue("iter", "50"));
		tagger.c = Float.parseFloat(cl.getOptionValue("c", "0.8"));
		tagger.useLoss = cl.hasOption("margin");
		tagger.interim = cl.hasOption("interim");
		tagger.hasLabel = cl.hasOption("haslabel");
		String[] arg = cl.getArgs();
		if (cl.hasOption("train") && arg.length >= 3 && arg.length <= 5) {
			// -train templet_file train_file model_file [test_file [result_file]]
			tagger.templateFile = arg[0];
			tagger.train = arg[1];
			tagger.model = arg[2];
			if (arg.length > 3)
				tagger.testfile = arg[3];
			System.out.println("Training model ...");
			tagger.train();
			if (arg.length == 5) {
				System.gc();
				tagger.output = arg[4];
				tagger.test();
			}
		} else if (cl.hasOption("retrain") && arg.length >= 3 && arg.length <= 5) {
			// -retrain train_file model_file newmodel_file [test_file [result_file]]
			tagger.train = arg[0];
			tagger.model = arg[1];
			tagger.newmodel = arg[2];
			if (arg.length > 3)
				tagger.testfile = arg[3];
			System.out.println("Re-Training model ...");
			tagger.train(true);
			if (arg.length == 5) {
				System.gc();
				tagger.output = arg[4];
				tagger.test();
			}
		} else if (arg.length == 3) {
			// model_file test_file output_file
			tagger.model = arg[0];
			tagger.testfile = arg[1];
			tagger.output = arg[2];
			tagger.test();
		} else if (arg.length == 2) {
			// model_file test_file
			tagger.model = arg[0];
			tagger.testfile = arg[1];
			tagger.test();
		} else {
			System.err.println("parameters format error!");
			System.err.println("Print option \"-h\" for help.");
			return;
		}
		System.gc();
	}

	/**
	 * Builds the processing pipe that turns raw token sequences into
	 * (label-id, feature-sequence) instances.
	 *
	 * @param flag true when templates/alphabets are already available (testing
	 *             or incremental retraining); false to load the templates from
	 *             {@link #templateFile} first
	 * @return a pipe that maps gold targets to label ids and then extracts features
	 * @throws Exception if the template file cannot be loaded
	 */
	public Pipe createProcessor(boolean flag) throws Exception {
		if (!flag) {
			templets = new TempletGroup();
			templets.load(templateFile);
		}
		// Reuse the alphabets of an already-loaded model so that feature and
		// label ids stay consistent with the model's weights.
		if (cl != null)
			factory = cl.getAlphabetFactory();
		else
			factory = AlphabetFactory.buildFactory();
		// Labels are mapped to 0, 1, 2, ...
		LabelAlphabet labels = factory.DefaultLabelAlphabet();
		// Features are extracted from the samples through the pipe.
		IFeatureAlphabet features = factory.DefaultFeatureAlphabet();
		featurePipe = new Sequence2FeatureSequence(templets, features, labels);
		Pipe pipe = new SeriesPipes(new Pipe[] { new Target2Label(labels), featurePipe });
		return pipe;
	}

	/**
	 * Trains a model from scratch; equivalent to {@code train(false)}.
	 *
	 * @throws Exception on I/O failure or training error
	 */
	public void train() throws Exception {
		train(false);
	}

	/**
	 * Trains the model, optionally continuing from an existing model file.
	 *
	 * @param b true for incremental retraining: the model is first loaded from
	 *          {@link #model} and the result is saved to {@link #newmodel};
	 *          false to train from scratch and save to {@link #model}
	 * @throws Exception on I/O failure or training error
	 */
	public void train(boolean b) throws Exception {
		System.out.print("Loading training data ...");
		long beginTime = System.currentTimeMillis();
		if (b)
			loadFrom(model);
		Pipe pipe = createProcessor(b);
		InstanceSet trainSet = new InstanceSet(pipe, factory);
		LabelAlphabet labels = factory.DefaultLabelAlphabet();
		IFeatureAlphabet features = factory.DefaultFeatureAlphabet();
		if (b) {
			// Re-open the alphabets so features/labels unseen by the old model
			// can still be added from the new training data.
			features.setStopIncrement(false);
			labels.setStopIncrement(false);
		}
		// Load the training set.
		trainSet.loadThruStagePipes(new SequenceReader(train, true));
		long endTime = System.currentTimeMillis();
		System.out.println(" done!");
		System.out.println("Time escape: " + (endTime - beginTime) / 1000 + "s");
		System.out.println();
		System.out.println("Training Number: " + trainSet.size());
		System.out.println("Label Number: " + labels.size()); // number of labels
		System.out.println("Feature Number: " + features.size()); // number of features
		System.out.println();
		// Freeze the alphabets: no new features or labels beyond this point.
		features.setStopIncrement(true);
		labels.setStopIncrement(true);
		InstanceSet testSet = null;
		if (testfile != null) {
			// The held-out set evaluated during training is assumed to be labeled.
			testSet = new InstanceSet(pipe);
			testSet.loadThruStagePipes(new SequenceReader(testfile, true, "utf8"));
			System.out.println("Test Number: " + testSet.size()); // number of samples
		}
		// Parameter update rule and Viterbi decoder.
		Update update;
		Inferencer inference;
		HammingLoss loss = new HammingLoss();
		if (standard) {
			inference = new LinearViterbi(templets, labels.size());
			update = new LinearViterbiPAUpdate((LinearViterbi) inference, loss);
		} else {
			inference = new HigherOrderViterbi(templets, labels.size());
			update = new HigherOrderViterbiPAUpdate(templets, labels.size(), true);
		}
		OnlineTrainer trainer;
		if (b) {
			// Continue training the loaded model's weights.
			trainer = new OnlineTrainer(cl, update, loss, features.size(), iterNum, c);
		} else {
			trainer = new OnlineTrainer(inference, update, loss,
					features.size(), iterNum, c);
		}
		trainer.innerOptimized = false;
		trainer.finalOptimized = false;
		cl = trainer.train(trainSet, testSet);
		if (b)
			saveTo(newmodel);
		else
			saveTo(model);
	}

	/**
	 * Tags {@link #testfile} with the current model, reporting timing and —
	 * when {@link #hasLabel} is set — token/sentence accuracy; predictions are
	 * written to {@link #output} when that path is non-null.
	 *
	 * @throws Exception on I/O failure or if the model cannot be loaded
	 */
	private void test() throws Exception {
		if (cl == null)
			loadFrom(model);
		long starttime = System.currentTimeMillis();
		// Extract features from the samples through the pipe.
		Pipe pipe = createProcessor(true);
		// Load the test set.
		InstanceSet testSet = new InstanceSet(pipe);
		testSet.loadThruStagePipes(new SequenceReader(testfile, hasLabel, "utf8"));
		System.out.println("Test Number: " + testSet.size()); // number of samples
		long featuretime = System.currentTimeMillis();
		float error = 0;
		int senError = 0;
		int len = 0;
		// Debug switch for evaluating mixed Chinese/English corpora; tokens
		// whose source contains "ENG" are scored separately when enabled.
		boolean hasENG = false;
		int ENG_all = 0, ENG_right = 0;
		Loss loss = new HammingLoss();
		String[][] labelsSet = new String[testSet.size()][];
		String[][] targetSet = new String[testSet.size()][];
		LabelAlphabet la = cl.getAlphabetFactory().DefaultLabelAlphabet();
		for (int i = 0; i < testSet.size(); i++) {
			Instance carrier = testSet.get(i);
			int[] pred = (int[]) cl.classify(carrier).getLabel(0);
			if (hasLabel) {
				len += pred.length;
				float e = loss.calc(carrier.getTarget(), pred);
				error += e;
				if (e != 0)
					senError++;
				if (hasENG) {
					String[][] origin = (String[][]) carrier.getSource();
					int[] target = (int[]) carrier.getTarget();
					for (int j = 0; j < target.length; j++) {
						if (origin[j][0].contains("ENG")) {
							ENG_all++;
							if (target[j] == pred[j])
								ENG_right++;
						}
					}
				}
			}
			labelsSet[i] = la.lookupString(pred);
			if (hasLabel)
				targetSet[i] = la.lookupString((int[]) carrier.getTarget());
		}
		long endtime = System.currentTimeMillis();
		System.out.println("totaltime\t" + (endtime - starttime) / 1000.0);
		System.out.println("feature\t" + (featuretime - starttime) / 1000.0);
		System.out.println("predict\t" + (endtime - featuretime) / 1000.0);
		if (hasLabel) {
			System.out.println("Test Accuracy:\t" + (1 - error / len));
			System.out.println("Sentence Accuracy:\t" + ((double) (testSet.size() - senError) / testSet.size()));
			// Guard against division by zero when no ENG token was seen.
			if (hasENG && ENG_all > 0)
				System.out.println("ENG Accuracy:\t" + ((double) ENG_right / ENG_all));
		}
		if (output != null) {
			BufferedWriter prn = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(output), "utf8"));
			try {
				String s;
				if (hasLabel)
					s = SimpleFormatter.format(testSet, labelsSet, targetSet);
				else
					s = SimpleFormatter.format(testSet, labelsSet);
				prn.write(s.trim());
			} finally {
				// Close even when formatting/writing fails, so the file handle
				// is not leaked.
				prn.close();
			}
		}
		System.out.println("Done");
	}

	/**
	 * Serializes the templates and the trained model to a gzipped file.
	 *
	 * @param modelfile destination path
	 * @throws IOException on write failure
	 */
	protected void saveTo(String modelfile) throws IOException {
		ObjectOutputStream out = new ObjectOutputStream(
				new BufferedOutputStream(new GZIPOutputStream(
						new FileOutputStream(modelfile))));
		try {
			out.writeObject(templets);
			out.writeObject(cl);
		} finally {
			// Ensure the stream is closed even if serialization fails.
			out.close();
		}
	}

	/**
	 * Loads the templates and the model from a gzipped file produced by
	 * {@link #saveTo(String)}.
	 *
	 * @param modelfile source path
	 * @throws IOException            on read failure
	 * @throws ClassNotFoundException if the serialized classes are missing
	 */
	protected void loadFrom(String modelfile) throws IOException,
			ClassNotFoundException {
		ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(
				new GZIPInputStream(new FileInputStream(modelfile))));
		try {
			templets = (TempletGroup) in.readObject();
			cl = (Linear) in.readObject();
		} finally {
			// Ensure the stream is closed even if deserialization fails.
			in.close();
		}
	}
}