package is2.mtag; import is2.data.*; import is2.io.CONLLReader09; import is2.io.CONLLWriter09; import is2.parser.Parser; import is2.tools.IPipe; import is2.tools.Tool; import is2.tools.Train; import is2.util.DB; import is2.util.OptionsSuper; import java.io.*; import java.util.Map.Entry; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import java.util.zip.ZipOutputStream; public class Tagger implements Tool, Train { ExtractorM pipe; ParametersFloat params; /** * Initialize * * @param options */ public Tagger(Options options) { // load the model try { readModel(options); } catch (Exception e) { e.printStackTrace(); } } /** * @param string * @throws IOException */ public Tagger(String modelFileName) { this(new Options(new String[]{"-model", modelFileName})); } public Tagger() { } public static void main(String[] args) throws FileNotFoundException, Exception { Options options = new Options(args); Tagger tagger = new Tagger(); if (options.train) { Long2Int li = new Long2Int(options.hsize); tagger.pipe = new ExtractorM(options, li); InstancesTagger is = (InstancesTagger) tagger.pipe.createInstances(options.trainfile); ParametersFloat params = new ParametersFloat(li.size()); tagger.train(options, tagger.pipe, params, is); tagger.writeModel(options, tagger.pipe, params); } if (options.test) { tagger.readModel(options); tagger.out(options, tagger.pipe, tagger.params); } if (options.eval) { Parser.out.println("\nEvaluate:"); Evaluator.evaluate(options.goldfile, options.outfile, options.formatTask); } } /* * (non-Javadoc) @see is2.mtag2.Learn#writeModel(is2.mtag2.Options, * is2.mtag2.Pipe, is2.data.ParametersFloat) */ @Override public void writeModel(OptionsSuper options, IPipe pipe, ParametersFloat params) { try { ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(options.modelName))); zos.putNextEntry(new ZipEntry("data")); DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); MFO.writeData(dos); MFO.clearData(); DB.println("number of parameters " + params.parameters.length); dos.flush(); params.write(dos); pipe.write(dos); dos.flush(); dos.close(); } catch (Exception e) { e.printStackTrace(); } } /* * (non-Javadoc) @see is2.mtag2.Learn#readModel(is2.mtag2.Options) */ @Override public void readModel(OptionsSuper options) { try { pipe = new ExtractorM(options); params = new ParametersFloat(0); // load the model ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName))); zis.getNextEntry(); DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); pipe.mf.read(dis); pipe.initValues(); pipe.initFeatures(); params.read(dis); pipe.li = new Long2Int(params.parameters.length); pipe.cl = new Cluster(dis); pipe.readMap(dis); dis.close(); this.pipe.types = new String[pipe.mf.getFeatureCounter().get(ExtractorM.FEAT)]; for (Entry<String, Integer> e : is2.mtag.MFO.getFeatureSet().get(ExtractorM.FEAT).entrySet()) { this.pipe.types[e.getValue()] = e.getKey(); } DB.println("Loading data finished. "); DB.println("number of parameter " + params.parameters.length); DB.println("number of classes " + this.pipe.types.length); } catch (Exception e) { e.printStackTrace(); } } /* * (non-Javadoc) @see is2.mtag2.Learn#train(is2.mtag2.Options, * is2.mtag2.Pipe, is2.data.ParametersFloat, is2.data.InstancesTagger) */ @Override public void train(OptionsSuper options, IPipe pipe, ParametersFloat params, Instances is) { int i; int del = 0; String[] wds = is2.mtag.MFO.reverse(is2.mtag.MFO.getFeatureSet().get(ExtractorM.WORD)); int numInstances = is.size(); float upd = (options.numIters * numInstances + 1); for (i = 0; i < options.numIters; i++) { long start = System.currentTimeMillis(); long last = System.currentTimeMillis(); FV pred = new FV(), gold = new FV(); int correct = 0, count = 0; for (int n = 0; n < numInstances; n++) { upd--; if ((n + 1) % 500 == 0) { del = PipeGen.outValueErr(n + 1, (count - correct), (float) correct / (float) count, del, last, upd); } int length = is.length(n); int feats[] = new int[length]; long[] vs = new long[ExtractorM._FC]; for (int w1 = 0; w1 < length; w1++) { count++; if (this.pipe.form2morph.get(is.forms[n][w1]) != null) { correct++; continue; } int bestType = this.pipe.fillFeatureVectorsOne(params, w1, wds[is.forms[n][w1]], is, n, is.gfeats[n], vs); feats[w1] = bestType; if (bestType == is.gfeats[n][w1]) { correct++; continue; } pred.clear(); int p = bestType << ExtractorM.s_type; for (int k = 0; k < vs.length; k++) { if (vs[k] == Integer.MIN_VALUE) { break; } if (vs[k] >= 0) { pred.add(this.pipe.li.l2i(vs[k] + p)); } } gold.clear(); p = is.gfeats[n][w1] << ExtractorM.s_type; for (int k = 0; k < vs.length; k++) { if (vs[k] == Integer.MIN_VALUE) { break; } if (vs[k] >= 0) { gold.add(this.pipe.li.l2i(vs[k] + p)); } } params.update(pred, gold, (float) upd, 1.0f); } } long end = System.currentTimeMillis(); String info = "time " + (end - start); PipeGen.outValueErr(numInstances, (count - correct), (float) correct / (float) count, last, 0, info); Parser.out.println(); } params.average(i * is.size()); } @Override public void out(OptionsSuper options, IPipe pipe, ParametersFloat params) { try { long start = System.currentTimeMillis(); CONLLReader09 depReader = new CONLLReader09(options.testfile, options.formatTask); CONLLWriter09 depWriter = new CONLLWriter09(options.outfile, options.formatTask); depReader.normalizeOn = false; Parser.out.print("Processing Sentence: "); pipe.initValues(); int cnt = 0; int del = 0; while (true) { InstancesTagger is = new InstancesTagger(); is.init(1, this.pipe.mf); cnt++; SentenceData09 instance = depReader.getNext(is); if (instance == null || instance.forms == null) { break; } is.fillChars(instance, 0, ExtractorM._CEND); instance = exec(instance, this.pipe, params, (InstancesTagger) is); SentenceData09 i09 = new SentenceData09(instance); i09.createSemantic(instance); if (options.overwritegold) { i09.ofeats = i09.pfeats; } depWriter.write(i09); if (cnt % 100 == 0) { del = PipeGen.outValue(cnt, del); } } depWriter.finishWriting(); del = PipeGen.outValue(cnt, del); long end = System.currentTimeMillis(); Parser.out.println(PipeGen.getSecondsPerInstnace(cnt, (end - start))); Parser.out.println(PipeGen.getUsedTime((end - start))); } catch (Exception e) { e.printStackTrace(); } } private SentenceData09 exec(SentenceData09 instance, ExtractorM pipe, ParametersFloat params, InstancesTagger is) { int length = instance.ppos.length; short[] feats = new short[instance.gpos.length]; long vs[] = new long[ExtractorM._FC]; String[] forms = instance.forms; instance.pfeats = new String[instance.gpos.length]; for (int j = 0; j < length; j++) { if (pipe.form2morph.get(is.forms[0][j]) != null) { feats[j] = (short) pipe.form2morph.get(is.forms[0][j]).intValue(); instance.pfeats[j] = this.pipe.types[feats[j]]; } else { int bestType = pipe.fillFeatureVectorsOne(params, j, forms[j], is, 0, feats, vs); feats[j] = (short) bestType; instance.pfeats[j] = this.pipe.types[bestType]; } } for (int j = 0; j < length; j++) { if (pipe.form2morph.get(is.forms[0][j]) != null) { feats[j] = (short) pipe.form2morph.get(is.forms[0][j]).intValue(); instance.pfeats[j] = this.pipe.types[feats[j]]; } else { int bestType = pipe.fillFeatureVectorsOne(params, j, forms[j], is, 0, feats, vs); feats[j] = (short) bestType; instance.pfeats[j] = this.pipe.types[bestType]; } } return instance; } /* * (non-Javadoc) @see is2.tools.Tool#apply(is2.data.SentenceData09) */ @Override public SentenceData09 apply(SentenceData09 instance) { try { //perform (instance, pipe, params); InstancesTagger is = new InstancesTagger(); is.init(1, pipe.mf); is.createInstance09(instance.forms.length); String[] forms = instance.forms; int length = forms.length; // is.setForm(0, 0, CONLLReader09.ROOT); for (int i = 0; i < length; i++) { is.setForm(0, i, forms[i]); } is.setLemma(0, 0, CONLLReader09.ROOT_LEMMA); for (int i = 1; i < length; i++) { is.setLemma(0, i, instance.plemmas[i]); } is.fillChars(instance, 0, ExtractorM._CEND); exec(instance, pipe, params, is); } catch (Exception e) { e.printStackTrace(); } return instance; } }