package is2.tag;

import is2.data.*;
import is2.io.CONLLReader09;
import is2.io.CONLLWriter09;
import is2.parser.Parser;
import is2.tools.IPipe;
import is2.tools.Tool;
import is2.tools.Train;
import is2.util.DB;
import is2.util.Evaluator;
import is2.util.OptionsSuper;
import java.io.*;
import java.util.ArrayList;
import java.util.Map.Entry;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

/**
 * Part-of-speech tagger that can be trained, stored to / loaded from a zipped
 * model file, and applied to CoNLL-09 sentences or plain word arrays.
 */
public class Tagger implements Tool, Train {

    /** Feature extractor; also holds the tag inventory in {@code pipe.types}. */
    public ExtractorT2 pipe;
    /** Weight vector of the model. */
    public ParametersFloat params;
    /** Maps long feature codes to indices into the weight vector. */
    public Long2IntInterface li;
    /** Feature/value registry shared with the extractor. */
    public MFO mf;
    /** Options used by the most recent {@link #readModel(OptionsSuper)} call. */
    private OptionsSuper _options;

    /**
     * Creates a tagger and loads the model named in the options.
     * Failures are printed to standard error; the instance is then left
     * without a usable model.
     *
     * @param options options carrying the model file name
     */
    public Tagger(Options options) {
        // load the model
        try {
            readModel(options);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates an empty tagger; a model must be trained or read before tagging.
     */
    public Tagger() {
    }

    /**
     * Creates a tagger and loads the given model file.
     *
     * @param modelFileName the file name of the model
     */
    public Tagger(String modelFileName) {
        this(new Options(new String[]{"-model", modelFileName}));
    }

    /**
     * Command line entry point. Depending on the parsed options this trains
     * and writes a model, tags the test file, and/or evaluates the output
     * against a gold file. The elapsed time is printed at the end.
     *
     * @param args command line arguments handed to {@link Options}
     * @throws FileNotFoundException declared for backward compatibility
     * @throws Exception if training, tagging or evaluation fails
     */
    public static void main(String[] args) throws FileNotFoundException, Exception {
        long start = System.currentTimeMillis();
        Options options = new Options(args);
        Tagger tagger = new Tagger();

        if (options.train) {
            tagger.li = new Long2Int(options.hsize);
            tagger.pipe = new ExtractorT2(options, tagger.mf = new MFO());

            InstancesTagger is = (InstancesTagger) tagger.pipe.createInstances(options.trainfile);
            tagger.params = new ParametersFloat(tagger.li.size());
            tagger.train(options, tagger.pipe, tagger.params, is);
            tagger.writeModel(options, tagger.pipe, tagger.params);
        }

        if (options.test) {
            tagger.readModel(options);
            tagger.out(options, tagger.pipe, tagger.params);
        }

        Parser.out.println();

        if (options.eval) {
            Parser.out.println("\nEVALUATION PERFORMANCE:");
            Evaluator.evaluateTagger(options.goldfile, options.outfile, options.format);
        }

        long end = System.currentTimeMillis();
        // Integer division by 100 followed by /10 yields seconds with one decimal.
        Parser.out.println("used time " + ((float) ((end - start) / 100) / 10));
    }

    /**
     * Loads the model from the zip file {@code options.modelName}: the feature
     * registry, the weight vector and the extractor state, in that order.
     * Afterwards {@code pipe.types} is rebuilt so that a tag index maps back
     * to its tag string. Errors are printed and not propagated.
     *
     * @param options options carrying the model file name
     */
    @Override
    public void readModel(OptionsSuper options) {
        try {
            pipe = new ExtractorT2(options, mf = new MFO());
            _options = options;

            // load the model
            ZipInputStream zis = new ZipInputStream(
                    new BufferedInputStream(new FileInputStream(options.modelName)));
            DataInputStream dis = new DataInputStream(new BufferedInputStream(zis));
            try {
                zis.getNextEntry();

                pipe.mf.read(dis);
                pipe.initValues();
                pipe.initFeatures();

                params = new ParametersFloat(0);
                params.read(dis);
                li = new Long2Int(params.parameters.length);
                pipe.read(dis);
            } finally {
                // Close the stream even when reading fails half-way.
                dis.close();
            }

            // Invert the POS value map: index -> tag string.
            pipe.types = new String[pipe.mf.getFeatureCounter().get(ExtractorT2.POS)];
            for (Entry<String, Integer> e : pipe.mf.getFeatureSet().get(ExtractorT2.POS).entrySet()) {
                pipe.types[e.getValue()] = e.getKey();
            }

            DB.println("Loading data finished. ");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Trains the weight vector. For {@code options.numIters} iterations every
     * word (index 1 and up) of every instance is scored against all tags;
     * when the best scoring tag differs from the gold tag the parameters are
     * updated with the predicted and the gold feature vectors. The weights
     * are averaged at the end.
     *
     * @param options training options ({@code numIters}, {@code noLemmas})
     * @param pipe    unused; the extractor in {@code this.pipe} is used instead
     * @param params  the parameters to be trained
     * @param is2     the training instances; must be an {@link InstancesTagger}
     */
    @Override
    public void train(OptionsSuper options, IPipe pipe, ParametersFloat params, Instances is2) {
        InstancesTagger is = (InstancesTagger) is2;
        String wds[] = MFO.reverse(this.pipe.mf.getFeatureSet().get(ExtractorT2.WORD));

        int del = 0;
        F2SF f = new F2SF(params.parameters);
        long vs[] = new long[ExtractorT2._MAX];
        int types = this.pipe.types.length;

        // Counts down once per processed instance and is handed to the update.
        double upd = options.numIters * is.size() + 1;

        for (int i = 0; i < options.numIters; i++) {
            long start = System.currentTimeMillis();
            int numInstances = is.size();
            long last = System.currentTimeMillis();

            FV pred = new FV(), gold = new FV();
            int correct = 0, count = 0;
            Parser.out.print("Iteration " + i + ": ");

            for (int n = 0; n < numInstances; n++) {
                if ((n + 1) % 500 == 0) {
                    del = PipeGen.outValueErr(n + 1, (count - correct),
                            (float) correct / (float) count, del, last, upd);
                }

                int length = is.length(n);
                upd--;

                for (int w = 1; w < length; w++) {
                    double best = -1000;
                    short bestType = -1;

                    int[] lemmas;
                    if (options.noLemmas) {
                        lemmas = new int[is.length(n)];
                    } else {
                        lemmas = is.plemmas[n];
                    }

                    this.pipe.addFeatures(is, n, wds[is.forms[n][w]], w, is.gpos[n], is.forms[n], lemmas, vs);

                    for (short t = 0; t < types; t++) {
                        // the hypothesis of a part-of-speech tag
                        long p = t << ExtractorT2.s_type;
                        f.clear();

                        // add the features to the vector; vs is terminated by Integer.MIN_VALUE
                        for (int k1 = 0; vs[k1] != Integer.MIN_VALUE; k1++) {
                            if (vs[k1] > 0) {
                                f.add(this.li.l2i(vs[k1] | p));
                            }
                        }
                        if (f.score > best) {
                            bestType = t;
                            best = f.score;
                        }
                    }

                    count++;
                    if (bestType == is.gpos[n][w]) {
                        correct++;
                        continue;
                    }

                    // Wrong prediction: build both vectors and update the weights.
                    fillTypedVector(pred, vs, bestType << ExtractorT2.s_type);
                    fillTypedVector(gold, vs, is.gpos[n][w] << ExtractorT2.s_type);
                    params.update(pred, gold, (float) upd, 1.0F);
                }
            }

            long end = System.currentTimeMillis();
            String info = "time " + (end - start);
            PipeGen.outValueErr(numInstances, (count - correct),
                    (float) correct / (float) count, last, upd, info);
            Parser.out.println();
            del = 0;
        }

        params.average(options.numIters * is.size());
    }

    /**
     * Clears {@code target} and adds the index of every positive feature code
     * in {@code vs} (up to the Integer.MIN_VALUE terminator) combined with the
     * given tag bits.
     */
    private void fillTypedVector(FV target, long[] vs, long typeBits) {
        target.clear();
        for (int k = 0; vs[k] != Integer.MIN_VALUE; k++) {
            if (vs[k] > 0) {
                target.add(this.li.l2i(vs[k] | typeBits));
            }
        }
    }

    /**
     * Tags every sentence of {@code options.testfile} and writes the result
     * to {@code options.outfile}, printing progress and timing information.
     * Errors are printed and not propagated.
     *
     * @param options options carrying the test and output file names
     * @param pipe    pipe whose values are initialised before tagging
     * @param params  unused; the parameters in {@code this.params} are used
     */
    @Override
    public void out(OptionsSuper options, IPipe pipe, ParametersFloat params) {
        try {
            long start = System.currentTimeMillis();

            // NOTE(review): the original author marked this reader construction as
            // temporary ("change this back"); the previous form appears to have been
            // new CONLLReader09(options.testfile, CONLLReader09.NO_NORMALIZE) - confirm.
            CONLLReader09 depReader = new CONLLReader09(options.testfile);
            CONLLWriter09 depWriter = new CONLLWriter09(options.outfile);

            int cnt = 0;
            try {
                Parser.out.print("Processing Sentence: ");
                pipe.initValues();

                int del = 0;
                while (true) {
                    InstancesTagger is = new InstancesTagger();
                    is.init(1, mf);
                    SentenceData09 instance = depReader.getNext(is);
                    if (instance == null || instance.forms == null) {
                        break;
                    }

                    is.fillChars(instance, 0, ExtractorT2._CEND);
                    cnt++;

                    tag(is, instance);

                    SentenceData09 i09 = new SentenceData09(instance);
                    i09.createSemantic(instance);
                    depWriter.write(i09);

                    if (cnt % 100 == 0) {
                        del = PipeGen.outValue(cnt, del);
                    }
                }
                del = PipeGen.outValue(cnt, del);
            } finally {
                // Finish the writer even if tagging fails part-way, so the sentences
                // already written are not lost (presumably this flushes/closes the file).
                depWriter.finishWriting();
            }

            long end = System.currentTimeMillis();
            Parser.out.println(PipeGen.getSecondsPerInstnace(cnt, (end - start)));
            Parser.out.println(PipeGen.getUsedTime(end - start));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Tags a single sentence in place.
     *
     * @param instance the sentence; its {@code ppos} array receives the tags
     * @return the same instance that was passed in
     * @throws IOException declared for callers; see the reader's insert method
     */
    public SentenceData09 tag(SentenceData09 instance) throws IOException {
        InstancesTagger is = new InstancesTagger();
        is.init(1, pipe.mf);
        new CONLLReader09().insert(is, instance);
        is.fillChars(instance, 0, ExtractorT2._CEND);
        tag(is, instance);
        return instance;
    }

    /**
     * Predicts a tag for every word of the sentence (index 0 is the root) and
     * stores the tag strings in {@code instance.ppos}. The sentence is
     * processed twice; during the second pass {@code pos} already contains
     * the tags predicted in the first pass for all positions.
     */
    private void tag(InstancesTagger is, SentenceData09 instance) {
        int length = instance.ppos.length;

        // Sized from gpos as before; fall back to ppos when no gold tags are
        // attached (e.g. instances assembled by tag(String[], String[])).
        int bufferLength = instance.gpos != null ? instance.gpos.length : length;
        short[] pos = new short[bufferLength];
        float sc[] = new float[length];

        instance.ppos[0] = is2.io.CONLLReader09.ROOT_POS;
        pos[0] = (short) pipe.mf.getValue(ExtractorT2.POS, is2.io.CONLLReader09.ROOT_POS);

        for (int pass = 0; pass < 2; pass++) {
            for (int j = 1; j < length; j++) {
                short bestType = (short) pipe.fillFeatureVectorsOne(
                        instance.forms[j], params, j, is, 0, pos, this.li, sc);
                pos[j] = bestType;
                instance.ppos[j] = pipe.types[bestType];
            }
        }
    }

    /**
     * Tag a single word and return a n-best list of Part-of-Speech tags.
     *
     * @param is            set of sentences
     * @param instanceIndex index to the sentence in question
     * @param word          index of the word to be tagged
     * @param wordForm      surface form of the word
     * @return n-best list of Part-of-Speech tags
     */
    public ArrayList<POS> tag(InstancesTagger is, int instanceIndex, int word, String wordForm) {
        return pipe.classify(wordForm, params, word, is, instanceIndex, is.pposs[instanceIndex], li);
    }

    /**
     * Tag a single word and return the n-best list as tag strings.
     *
     * @param is            set of sentences
     * @param instanceIndex index to the sentence in question
     * @param word          index of the word to be tagged
     * @param wordForm      surface form of the word
     * @return the tag strings in the order delivered by the classifier; entries
     *         that cannot be mapped are reported on standard error and skipped
     */
    public ArrayList<String> tagStrings(InstancesTagger is, int instanceIndex, int word, String wordForm) {
        ArrayList<POS> plist = pipe.classify(wordForm, params, word, is, instanceIndex, is.pposs[instanceIndex], li);
        String pos[] = is2.tag.MFO.reverse(this.pipe.mf.getFeatureSet().get(ExtractorT2.POS));

        // Previously initialised to null, which made every add() fail and the
        // method return null.
        ArrayList<String> postags = new ArrayList<String>();
        for (POS p : plist) {
            try {
                postags.add(pos[p.p]);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return postags;
    }

    /**
     * Tags a sentence given as parallel arrays of word forms and lemmas.
     * A root token is prepended internally and stripped from the result.
     * Errors are printed; the entries of the result are then left null.
     *
     * @param words  the word forms of the sentence
     * @param lemmas the lemmas, one per word
     * @return the predicted tags, one per word
     */
    public String[] tag(String[] words, String[] lemmas) {
        String[] pposs = new String[words.length];

        try {
            pipe.initValues();

            int length = words.length + 1; // +1 for the artificial root at index 0

            InstancesTagger is = new InstancesTagger();
            is.init(1, pipe.mf);
            is.createInstance09(length);

            SentenceData09 instance = new SentenceData09();
            instance.forms = new String[length];
            instance.forms[0] = is2.io.CONLLReader09.ROOT;

            instance.plemmas = new String[length];
            instance.plemmas[0] = is2.io.CONLLReader09.ROOT_LEMMA;

            for (int j = 0; j < words.length; j++) {
                instance.forms[j + 1] = words[j];
                instance.plemmas[j + 1] = lemmas[j];
            }

            for (int j = 0; j < length; j++) {
                is.setForm(0, j, instance.forms[j]);
                is.setLemma(0, j, instance.plemmas[j]);
            }

            instance.ppos = new String[length];

            is.fillChars(instance, 0, ExtractorT2._CEND);

            this.tag(is, instance);

            for (int j = 0; j < words.length; j++) {
                pposs[j] = instance.ppos[j + 1];
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        return pposs;
    }

    /**
     * Tags the sentence in place. A failure is reported on standard error and
     * the sentence is returned as far as it was processed.
     *
     * @see is2.tools.Tool#apply(is2.data.SentenceData09)
     */
    @Override
    public SentenceData09 apply(SentenceData09 snt09) {
        try {
            tag(snt09);
        } catch (Exception e) {
            // Keep the best-effort contract, but do not hide the failure.
            e.printStackTrace();
        }
        return snt09;
    }

    /**
     * Writes the model to the zip file {@code options.modelName} as a single
     * entry named "data": the feature registry of {@code this.pipe}, then the
     * parameters, then the state of the given pipe. Errors are printed and
     * not propagated.
     *
     * @see is2.tools.Train#writeModel(is2.util.OptionsSuper, is2.tools.IPipe,
     *      is2.data.ParametersFloat)
     */
    @Override
    public void writeModel(OptionsSuper options, IPipe pipe, is2.data.ParametersFloat params) {
        try {
            ZipOutputStream zos = new ZipOutputStream(
                    new BufferedOutputStream(new FileOutputStream(options.modelName)));
            DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos));
            try {
                zos.putNextEntry(new ZipEntry("data"));

                this.pipe.mf.writeData(dos);

                DB.println("number of parameters " + params.parameters.length);
                dos.flush();

                params.write(dos);
                pipe.write(dos);
                dos.flush();
            } finally {
                // Release the file handle even when writing fails.
                dos.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}