package edu.fudan.nlp.cn.tag; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.List; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import edu.fudan.ml.classifier.struct.inf.LinearViterbi; import edu.fudan.ml.types.Instance; import edu.fudan.ml.types.alphabet.IFeatureAlphabet; import edu.fudan.ml.types.alphabet.LabelAlphabet; import edu.fudan.nlp.cn.Sentenizer; import edu.fudan.nlp.cn.tag.format.Seq2ArrayWithTag; import edu.fudan.nlp.cn.tag.format.Seq2StrWithTag; import edu.fudan.nlp.pipe.seq.templet.TempletGroup; /** * 用交叉标签的词性标注器 * @author xpqiu * @version 1.0 * @since FudanNLP 1.0 */ public class POSTaggerX extends AbstractTagger { public POSTaggerX(String str) throws Exception { super(str); // DynamicViterbi dv = new DynamicViterbi( // (LinearViterbi) cl.getInferencer(), // cl.getAlphabetFactory().buildLabelAlphabet("labels"), // cl.getAlphabetFactory().buildFeatureAlphabet("features"), // false); // dv.setDynamicTemplets(DynamicTagger.getDynamicTemplet("example-data/structure/template_dynamic")); // cl.setInferencer(dv); } public String[][] tag2Array(String src) { ArrayList words = new ArrayList<String>(); ArrayList pos = new ArrayList<String>(); String[] s = Sentenizer.split(src); try { for (int i = 0; i < s.length; i++) { Instance inst = new Instance(s[i]); doProcess(inst); int[] pred = (int[]) getClassifier().classify(inst).getLabel(0); String[] target = labels.lookupString(pred); List[] res = Seq2ArrayWithTag.format(inst, target); words.addAll(res[0]); pos.addAll(res[1]); } } catch (Exception e) { e.printStackTrace(); } String[][] tag = new String[2][]; tag[0] = (String[]) words.toArray(new String[words.size()]); tag[1] = (String[]) pos.toArray(new String[pos.size()]); return tag; } @Override public String tag(String src) { String[] sents = Sentenizer.split(src); String tag = ""; try { for (int i = 0; i < sents.length; i++) { Instance inst = new Instance(sents[i]); String[] preds = _tag(inst); String s = Seq2StrWithTag.format(inst, preds); tag += s; if (i < sents.length - 1) tag += delim; } } catch (Exception e) { e.printStackTrace(); } return tag; } public static void main(String[] args) throws Exception { Options opt = new Options(); opt.addOption("h", false, "Print help for this application"); opt.addOption("f", false, "segment file. Default string mode."); opt.addOption("s", false, "segment string"); BasicParser parser = new BasicParser(); CommandLine cl = parser.parse(opt, args); if (args.length == 0 || cl.hasOption('h')) { HelpFormatter f = new HelpFormatter(); f.printHelp( "Tagger:\n" + "java edu.fudan.nlp.tag.POSTagger -f model_file input_file output_file;\n" + "java edu.fudan.nlp.tag.POSTagger -s model_file string_to_segement", opt); return; } String[] arg = cl.getArgs(); String modelFile; String input; String output = null; if (cl.hasOption("f") && arg.length == 3) { modelFile = arg[0]; input = arg[1]; output = arg[2]; } else if (arg.length == 2) { modelFile = arg[0]; input = arg[1]; } else { System.err.println("paramenters format error!"); System.err.println("Print option \"-h\" for help."); return; } POSTaggerX pos = new POSTaggerX(modelFile); if (cl.hasOption("f")) { String s = pos.tagFile(input); OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream( output), "utf8"); w.write(s); w.close(); } else { String s = pos.tag(input); System.out.println(s); } } }