package is2.tag;
import is2.data.*;
import is2.io.CONLLReader09;
import is2.io.CONLLWriter09;
import is2.parser.Parser;
import is2.tools.IPipe;
import is2.tools.Tool;
import is2.tools.Train;
import is2.util.DB;
import is2.util.Evaluator;
import is2.util.OptionsSuper;
import java.io.*;
import java.util.ArrayList;
import java.util.Map.Entry;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;
/**
 * Part-of-speech tagger: trains a linear model over hashed features and tags
 * sentences in CoNLL-2009 format. Implements {@link Tool} (apply a loaded
 * model) and {@link Train} (train/write/read a model).
 */
public class Tagger implements Tool, Train {

    /** Feature extractor used both for training and tagging. */
    public ExtractorT2 pipe;

    /** Model weights (perceptron-style float parameter vector). */
    public ParametersFloat params;

    /** Maps 64-bit feature codes into the bounded weight-vector index space. */
    public Long2IntInterface li;

    /** Mapping between feature strings and integer ids. */
    public MFO mf;

    private OptionsSuper _options;

    /**
     * Creates a tagger and immediately loads the model named in the options.
     *
     * @param options options carrying the model file name ({@code -model})
     */
    public Tagger(Options options) {
        try {
            readModel(options);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Creates an uninitialized tagger; call {@link #readModel} before use. */
    public Tagger() {
    }

    /**
     * Creates a tagger from a model file.
     *
     * @param modelFileName the file name of the model
     */
    public Tagger(String modelFileName) {
        this(new Options(new String[]{"-model", modelFileName}));
    }

    /**
     * Command-line entry point: optionally trains a model, optionally tags a
     * test file, optionally evaluates against a gold file.
     *
     * @param args command-line arguments parsed by {@link Options}
     * @throws Exception on any unrecoverable I/O or model error
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        Options options = new Options(args);

        Tagger tagger = new Tagger();

        if (options.train) {
            tagger.li = new Long2Int(options.hsize);
            tagger.pipe = new ExtractorT2(options, tagger.mf = new MFO());

            InstancesTagger is = (InstancesTagger) tagger.pipe.createInstances(options.trainfile);

            tagger.params = new ParametersFloat(tagger.li.size());
            tagger.train(options, tagger.pipe, tagger.params, is);
            tagger.writeModel(options, tagger.pipe, tagger.params);
        }

        if (options.test) {
            tagger.readModel(options);
            tagger.out(options, tagger.pipe, tagger.params);
        }

        Parser.out.println();

        if (options.eval) {
            Parser.out.println("\nEVALUATION PERFORMANCE:");
            Evaluator.evaluateTagger(options.goldfile, options.outfile, options.format);
        }

        long end = System.currentTimeMillis();
        // Prints elapsed seconds with one decimal place.
        Parser.out.println("used time " + ((float) ((end - start) / 100) / 10));
    }

    /**
     * Loads a trained model (feature map, weights, extractor state) from the
     * zip file named in the options and initializes the POS type table.
     *
     * @param options options carrying the model file name
     */
    @Override
    public void readModel(OptionsSuper options) {
        try {
            pipe = new ExtractorT2(options, mf = new MFO());
            _options = options;

            // The model is stored as a single entry inside a zip archive.
            ZipInputStream zis =
                    new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName)));
            zis.getNextEntry();

            // try-with-resources closes dis (and the wrapped zip stream) even
            // if a read fails part-way through.
            try (DataInputStream dis = new DataInputStream(new BufferedInputStream(zis))) {
                pipe.mf.read(dis);
                pipe.initValues();
                pipe.initFeatures();

                params = new ParametersFloat(0);
                params.read(dis);
                li = new Long2Int(params.parameters.length);
                pipe.read(dis);
            }

            // Build the id -> POS-string lookup table from the feature map.
            pipe.types = new String[pipe.mf.getFeatureCounter().get(ExtractorT2.POS)];
            for (Entry<String, Integer> e : pipe.mf.getFeatureSet().get(ExtractorT2.POS).entrySet()) {
                pipe.types[e.getValue()] = e.getKey();
            }

            DB.println("Loading data finished. ");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Trains the tagger with a perceptron-style update: for every token, score
     * all POS hypotheses, and on a mistake update towards the gold feature
     * vector and away from the predicted one. Weights are averaged at the end.
     *
     * @param options training options (number of iterations, lemma usage, ...)
     * @param pipe    feature extractor (unused parameter; the field is used)
     * @param params  parameter vector to be updated in place
     * @param is2     training instances (must be an {@link InstancesTagger})
     */
    @Override
    public void train(OptionsSuper options, IPipe pipe, ParametersFloat params, Instances is2) {
        InstancesTagger is = (InstancesTagger) is2;

        String wds[] = MFO.reverse(this.pipe.mf.getFeatureSet().get(ExtractorT2.WORD));

        int del = 0;
        F2SF f = new F2SF(params.parameters);
        long vs[] = new long[ExtractorT2._MAX];
        int types = this.pipe.types.length;

        // upd counts down over all (iteration, instance) pairs; it is the
        // weighting factor for the averaged perceptron update.
        double upd = options.numIters * is.size() + 1;

        for (int i = 0; i < options.numIters; i++) {

            long start = System.currentTimeMillis();

            int numInstances = is.size();
            long last = System.currentTimeMillis();

            FV pred = new FV(), gold = new FV();

            int correct = 0, count = 0;

            Parser.out.print("Iteration " + i + ": ");

            for (int n = 0; n < numInstances; n++) {

                if ((n + 1) % 500 == 0) {
                    del = PipeGen.outValueErr(n + 1, (count - correct),
                            (float) correct / (float) count, del, last, upd);
                }

                int length = is.length(n);
                upd--;

                // Token 0 is the artificial root; start at 1.
                for (int w = 1; w < length; w++) {
                    double best = -1000;
                    short bestType = -1;

                    int[] lemmas;
                    if (options.noLemmas) {
                        lemmas = new int[is.length(n)];
                    } else {
                        lemmas = is.plemmas[n];
                    }

                    // vs receives the feature codes for this token, terminated
                    // by Integer.MIN_VALUE.
                    this.pipe.addFeatures(is, n, wds[is.forms[n][w]], w,
                            is.gpos[n], is.forms[n], lemmas, vs);

                    // Score every POS hypothesis by OR-ing the tag id into the
                    // feature code at bit offset s_type.
                    for (short t = 0; t < types; t++) {
                        long p = t << ExtractorT2.s_type;
                        f.clear();
                        for (int k1 = 0; vs[k1] != Integer.MIN_VALUE; k1++) {
                            if (vs[k1] > 0) {
                                f.add(this.li.l2i(vs[k1] | p));
                            }
                        }
                        if (f.score > best) {
                            bestType = t;
                            best = f.score;
                        }
                    }

                    count++;
                    if (bestType == is.gpos[n][w]) {
                        correct++;
                        continue;
                    }

                    // Wrong prediction: update away from predicted features...
                    pred.clear();
                    for (int k1 = 0; vs[k1] != Integer.MIN_VALUE; k1++) {
                        if (vs[k1] > 0) {
                            pred.add(this.li.l2i(vs[k1] | bestType << ExtractorT2.s_type));
                        }
                    }

                    // ... and towards the gold features.
                    gold.clear();
                    for (int k1 = 0; vs[k1] != Integer.MIN_VALUE; k1++) {
                        if (vs[k1] > 0) {
                            gold.add(this.li.l2i(vs[k1] | is.gpos[n][w] << ExtractorT2.s_type));
                        }
                    }

                    params.update(pred, gold, (float) upd, 1.0F);
                }
            }

            long end = System.currentTimeMillis();
            String info = "time " + (end - start);
            PipeGen.outValueErr(numInstances, (count - correct),
                    (float) correct / (float) count, last, upd, info);
            Parser.out.println();
            del = 0;
        }
        params.average(options.numIters * is.size());
    }

    /**
     * Tags the test file from the options and writes the result to the output
     * file, reporting progress and timing on the way.
     *
     * @param options options carrying test/output file names
     * @param pipe    feature extractor (unused parameter; the field is used)
     * @param params  model weights (unused parameter; the field is used)
     */
    @Override
    public void out(OptionsSuper options, IPipe pipe, ParametersFloat params) {
        try {
            long start = System.currentTimeMillis();

            CONLLReader09 depReader = new CONLLReader09(options.testfile);
            CONLLWriter09 depWriter = new CONLLWriter09(options.outfile);

            Parser.out.print("Processing Sentence: ");

            pipe.initValues();

            int cnt = 0;
            int del = 0;

            while (true) {
                InstancesTagger is = new InstancesTagger();
                is.init(1, mf);

                SentenceData09 instance = depReader.getNext(is);
                if (instance == null || instance.forms == null) {
                    break;
                }

                is.fillChars(instance, 0, ExtractorT2._CEND);

                cnt++;
                tag(is, instance);

                SentenceData09 i09 = new SentenceData09(instance);
                i09.createSemantic(instance);
                depWriter.write(i09);

                if (cnt % 100 == 0) {
                    del = PipeGen.outValue(cnt, del);
                }
            }
            del = PipeGen.outValue(cnt, del);
            depWriter.finishWriting();

            long end = System.currentTimeMillis();
            Parser.out.println(PipeGen.getSecondsPerInstnace(cnt, (end - start)));
            Parser.out.println(PipeGen.getUsedTime(end - start));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Tags one sentence in place: {@code instance.ppos} is filled with the
     * predicted POS tags.
     *
     * @param instance sentence to tag; modified and returned
     * @return the same instance, with predicted POS tags set
     * @throws IOException if the reader fails to insert the sentence
     */
    public SentenceData09 tag(SentenceData09 instance) throws IOException {
        InstancesTagger is = new InstancesTagger();
        is.init(1, pipe.mf);
        new CONLLReader09().insert(is, instance);
        is.fillChars(instance, 0, ExtractorT2._CEND);
        tag(is, instance);
        return instance;
    }

    /**
     * Two-pass left-to-right tagging: the first pass predicts tags using the
     * tags assigned so far; the second pass re-predicts each token with the
     * full first-pass context available, which lets right-context features
     * correct early decisions.
     */
    private void tag(InstancesTagger is, SentenceData09 instance) {
        int length = instance.ppos.length;

        short[] pos = new short[instance.gpos.length];
        float sc[] = new float[instance.ppos.length];

        instance.ppos[0] = is2.io.CONLLReader09.ROOT_POS;
        pos[0] = (short) pipe.mf.getValue(ExtractorT2.POS, is2.io.CONLLReader09.ROOT_POS);

        // First pass.
        for (int j = 1; j < length; j++) {
            short bestType = (short) pipe.fillFeatureVectorsOne(
                    instance.forms[j], params, j, is, 0, pos, this.li, sc);
            pos[j] = bestType;
            instance.ppos[j] = pipe.types[bestType];
        }

        // Second pass, refining with the complete tag context.
        for (int j = 1; j < length; j++) {
            short bestType = (short) pipe.fillFeatureVectorsOne(
                    instance.forms[j], params, j, is, 0, pos, this.li, sc);
            instance.ppos[j] = pipe.types[bestType];
            pos[j] = bestType;
        }
    }

    /**
     * Tag a single word and return a n-best list of Part-of-Speech tags.
     *
     * @param is            set of sentences
     * @param instanceIndex index to the sentence in question
     * @param word          word to be tagged
     * @param wordForm      surface form of the word
     * @return n-best list of Part-of-Speech tags
     */
    public ArrayList<POS> tag(InstancesTagger is, int instanceIndex, int word, String wordForm) {
        return pipe.classify(wordForm, params, word, is, instanceIndex, is.pposs[instanceIndex], li);
    }

    /**
     * Tags a single word and returns its n-best POS tags as strings.
     *
     * @param is            set of sentences
     * @param instanceIndex index of the sentence in question
     * @param word          index of the word to be tagged
     * @param wordForm      surface form of the word
     * @return n-best list of POS tag strings (possibly empty, never null)
     */
    public ArrayList<String> tagStrings(InstancesTagger is, int instanceIndex, int word, String wordForm) {
        ArrayList<POS> plist =
                pipe.classify(wordForm, params, word, is, instanceIndex, is.pposs[instanceIndex], li);
        String pos[] = is2.tag.MFO.reverse(this.pipe.mf.getFeatureSet().get(ExtractorT2.POS));

        // BUG FIX: the list was previously initialized to null, so every add()
        // threw a (swallowed) NullPointerException and the method returned null.
        ArrayList<String> postags = new ArrayList<String>();
        for (POS p : plist) {
            try {
                postags.add(pos[p.p]);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return postags;
    }

    /**
     * Tags a tokenized sentence given as parallel word/lemma arrays.
     *
     * @param words  the words of the sentence (without the artificial root)
     * @param lemmas the (predicted) lemmas, aligned with {@code words}
     * @return predicted POS tags, aligned with {@code words}
     */
    public String[] tag(String[] words, String[] lemmas) {
        String[] pposs = new String[words.length];
        try {
            pipe.initValues();

            // +1 for the artificial root token at position 0.
            int length = words.length + 1;

            InstancesTagger is = new InstancesTagger();
            is.init(1, pipe.mf);
            is.createInstance09(length);

            SentenceData09 instance = new SentenceData09();
            instance.forms = new String[length];
            instance.forms[0] = is2.io.CONLLReader09.ROOT;
            instance.plemmas = new String[length];
            instance.plemmas[0] = is2.io.CONLLReader09.ROOT_LEMMA;

            for (int j = 0; j < words.length; j++) {
                instance.forms[j + 1] = words[j];
                instance.plemmas[j + 1] = lemmas[j];
            }
            for (int j = 0; j < length; j++) {
                is.setForm(0, j, instance.forms[j]);
                is.setLemma(0, j, instance.plemmas[j]);
            }

            instance.ppos = new String[length];
            is.fillChars(instance, 0, ExtractorT2._CEND);
            this.tag(is, instance);

            // Drop the root token when copying the result back.
            for (int j = 0; j < words.length; j++) {
                pposs[j] = instance.ppos[j + 1];
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return pposs;
    }

    /*
     * (non-Javadoc) @see is2.tools.Tool#apply(is2.data.SentenceData09)
     */
    @Override
    public SentenceData09 apply(SentenceData09 snt09) {
        try {
            tag(snt09);
        } catch (Exception e) {
            // Best-effort: report the failure instead of silently swallowing
            // it, but still return the (possibly untagged) sentence.
            e.printStackTrace();
        }
        return snt09;
    }

    /*
     * (non-Javadoc) @see is2.tools.Train#writeModel(is2.util.OptionsSuper,
     * is2.mtag2.Pipe, is2.data.ParametersFloat)
     */
    @Override
    public void writeModel(OptionsSuper options, IPipe pipe, is2.data.ParametersFloat params) {
        try {
            ZipOutputStream zos =
                    new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(options.modelName)));
            zos.putNextEntry(new ZipEntry("data"));

            // try-with-resources guarantees the archive is closed (and
            // flushed) even if a write fails.
            try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos))) {
                this.pipe.mf.writeData(dos);
                DB.println("number of parameters " + params.parameters.length);
                dos.flush();
                params.write(dos);
                pipe.write(dos);
                dos.flush();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}