// POSTaggerX.java example
// (from the fudannlp-master project)
package edu.fudan.nlp.cn.tag;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

import edu.fudan.ml.classifier.struct.inf.LinearViterbi;
import edu.fudan.ml.types.Instance;
import edu.fudan.ml.types.alphabet.IFeatureAlphabet;
import edu.fudan.ml.types.alphabet.LabelAlphabet;
import edu.fudan.nlp.cn.Sentenizer;
import edu.fudan.nlp.cn.tag.format.Seq2ArrayWithTag;
import edu.fudan.nlp.cn.tag.format.Seq2StrWithTag;
import edu.fudan.nlp.pipe.seq.templet.TempletGroup;

/**
 * Part-of-speech tagger that uses cross labels.
 * <p>
 * Sentence splitting is delegated to {@link Sentenizer}; feature processing,
 * the classifier, the label alphabet and the sentence delimiter are inherited
 * from {@link AbstractTagger}.
 *
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.0
 */
public class POSTaggerX extends AbstractTagger {

	/**
	 * Creates a tagger by handing {@code str} to the {@link AbstractTagger}
	 * constructor. {@link #main(String[])} passes the model file path here.
	 *
	 * @param str value forwarded unchanged to the superclass constructor
	 * @throws Exception whatever the superclass constructor throws
	 */
	public POSTaggerX(String str) throws Exception {
		super(str);

		// Disabled experiment: wrapping the inferencer in a DynamicViterbi.
//		DynamicViterbi dv = new DynamicViterbi(
//				(LinearViterbi) cl.getInferencer(), 
//				cl.getAlphabetFactory().buildLabelAlphabet("labels"), 
//				cl.getAlphabetFactory().buildFeatureAlphabet("features"),
//				false);
//		dv.setDynamicTemplets(DynamicTagger.getDynamicTemplet("example-data/structure/template_dynamic"));
//		cl.setInferencer(dv);
	}

	/**
	 * Tags {@code src} and returns the result as two parallel arrays.
	 * <p>
	 * The text is split into sentences, each sentence is classified, and the
	 * two lists produced by {@link Seq2ArrayWithTag#format} for every sentence
	 * are appended to the overall result.
	 * <p>
	 * If any step throws, the stack trace is printed and the entries collected
	 * so far are returned (best-effort behaviour kept from the original code).
	 *
	 * @param src text to tag
	 * @return a two-row array: row 0 holds the first list returned by the
	 *         formatter (collected as words), row 1 the second (collected as
	 *         POS tags)
	 */
	// The formatter result is consumed as a raw List[]; its elements are
	// copied into String[] below, exactly as the original casts assumed.
	@SuppressWarnings({ "unchecked", "rawtypes" })
	public String[][] tag2Array(String src) {
		List<String> words = new ArrayList<String>();
		List<String> pos = new ArrayList<String>();
		String[] sentences = Sentenizer.split(src);
		try {
			for (int i = 0; i < sentences.length; i++) {
				Instance inst = new Instance(sentences[i]);
				doProcess(inst);
				int[] pred = (int[]) getClassifier().classify(inst).getLabel(0);
				String[] target = labels.lookupString(pred);
				List[] res = Seq2ArrayWithTag.format(inst, target);
				words.addAll(res[0]);
				pos.addAll(res[1]);
			}
		} catch (Exception e) {
			// Best effort: report the failure and return what was tagged so far.
			e.printStackTrace();
		}
		String[][] tag = new String[2][];
		tag[0] = words.toArray(new String[words.size()]);
		tag[1] = pos.toArray(new String[pos.size()]);
		return tag;
	}

	/**
	 * Tags {@code src} and returns the result as a single string.
	 * <p>
	 * Each sentence is tagged with {@code _tag} and rendered with
	 * {@link Seq2StrWithTag#format}; rendered sentences are joined with the
	 * inherited {@code delim} (no trailing delimiter).
	 * <p>
	 * If any step throws, the stack trace is printed and the text built so far
	 * is returned.
	 *
	 * @param src text to tag
	 * @return the tagged sentences joined by {@code delim}
	 */
	@Override
	public String tag(String src) {
		String[] sents = Sentenizer.split(src);
		StringBuilder tagged = new StringBuilder();
		try {
			for (int i = 0; i < sents.length; i++) {
				Instance inst = new Instance(sents[i]);
				String[] preds = _tag(inst);
				tagged.append(Seq2StrWithTag.format(inst, preds));
				if (i < sents.length - 1)
					tagged.append(delim);
			}
		} catch (Exception e) {
			// Best effort: report the failure and return what was tagged so far.
			e.printStackTrace();
		}
		return tagged.toString();
	}

	/**
	 * Command-line entry point.
	 * <ul>
	 * <li>{@code -f model_file input_file output_file}: tags a file and writes
	 * the result to {@code output_file} using the "utf8" charset;</li>
	 * <li>{@code [-s] model_file string}: tags the string and prints the result
	 * to standard output;</li>
	 * <li>{@code -h} or no arguments: prints usage.</li>
	 * </ul>
	 * Any other combination prints an error message to standard error.
	 *
	 * @param args command-line arguments
	 * @throws Exception on option parsing, model loading or I/O failure
	 */
	public static void main(String[] args) throws Exception {
		Options opt = new Options();

		opt.addOption("h", false, "Print help for this application");
		opt.addOption("f", false, "segment file. Default string mode.");
		opt.addOption("s", false, "segment string");
		BasicParser parser = new BasicParser();
		CommandLine cl = parser.parse(opt, args);

		if (args.length == 0 || cl.hasOption('h')) {
			HelpFormatter f = new HelpFormatter();
			f.printHelp(
					"Tagger:\n"
							+ "java edu.fudan.nlp.cn.tag.POSTaggerX -f model_file input_file output_file;\n"
							+ "java edu.fudan.nlp.cn.tag.POSTaggerX -s model_file string_to_segement",
					opt);
			return;
		}
		String[] arg = cl.getArgs();
		boolean fileMode = cl.hasOption("f");
		String modelFile;
		String input;
		String output = null;
		if (fileMode && arg.length == 3) {
			modelFile = arg[0];
			input = arg[1];
			output = arg[2];
		} else if (!fileMode && arg.length == 2) {
			// String mode only: "-f" with two arguments has no output file and
			// must be rejected instead of failing later on a null path.
			modelFile = arg[0];
			input = arg[1];
		} else {
			System.err.println("paramenters format error!");
			System.err.println("Print option \"-h\" for help.");
			return;
		}
		POSTaggerX pos = new POSTaggerX(modelFile);
		if (fileMode) {
			String s = pos.tagFile(input);
			OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream(
					output), "utf8");
			try {
				w.write(s);
			} finally {
				// Always release the file handle, even if the write fails.
				w.close();
			}
		} else {
			String s = pos.tag(input);
			System.out.println(s);
		}
	}

}