package is2.parserR2;

import extractors.Extractor;
import extractors.ParallelExtract;
import is2.data.*;
import is2.io.CONLLReader09;
import is2.util.OptionsSuper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;

/**
 * Feature-extraction pipeline of the parser: registers the feature parts of a
 * corpus, builds the edge filters, and extracts the feature vectors of all
 * head/dependent combinations of a sentence in parallel.
 */
final public class Pipe extends PipeGen {

    public Extractor[] extractor;
    final public MFB mf = new MFB();

    Cluster cl;

    private OptionsSuper options;
    public static long timeExtract;

    public Pipe(OptionsSuper o) {
        options = o;
    }

    public void createInstances(String file, Instances is) throws Exception {

        CONLLReader09 depReader = new CONLLReader09(file);

        mf.register(REL, "<root-type>");
        // Register at least one predicate, since the parsing data might not
        // contain predicates (as in the Japanese corpus) while the development
        // set contains some.

        long sl = 0; // total number of tokens read (not used further)

        is2.parser.Parser.out.print("Registering feature parts of sentence: ");
        int ic = 0;
        int del = 0;
        while (true) {
            SentenceData09 instance = depReader.getNext();
            if (instance == null) {
                break;
            }
            ic++;

            sl += instance.labels.length;

            if (ic % 1000 == 0) {
                del = outValue(ic, del);
            }

            // Register the edge labels.
            String[] labs1 = instance.labels;
            for (int i1 = 0; i1 < labs1.length; i1++) {
                mf.register(REL, labs1[i1]);
            }

            // Register the word forms and the predicted lemmas.
            String[] w = instance.forms;
            for (int i1 = 0; i1 < w.length; i1++) {
                mf.register(WORD, depReader.normalize(w[i1]));
            }

            w = instance.plemmas;
            for (int i1 = 0; i1 < w.length; i1++) {
                mf.register(WORD, depReader.normalize(w[i1]));
            }

            // Register the predicted and the gold part-of-speech tags.
            w = instance.ppos;
            for (int i1 = 0; i1 < w.length; i1++) {
                mf.register(POS, w[i1]);
            }

            w = instance.gpos;
            for (int i1 = 0; i1 < w.length; i1++) {
                mf.register(POS, w[i1]);
            }

            // Register the morphologic features.
            if (instance.feats != null) {
                String[][] fs = instance.feats;
                for (int i1 = 0; i1 < fs.length; i1++) {
                    w = fs[i1];
                    if (w == null) {
                        continue;
                    }
                    for (int i2 = 0; i2 < w.length; i2++) {
                        mf.register(FEAT, w[i2]);
                    }
                }
            }

            if ((ic - 1) > options.count) {
                break;
            }
        }
        del = outValue(ic, del);

        for (Extractor e : extractor) {
            e.setMaxForm(mf.getFeatureCounter().get(WORD));
        }

        if (options.clusterFile == null) {
            cl = new Cluster();
        } else {
            cl = new Cluster(options.clusterFile, mf, 6);
        }

        mf.calculateBits();

        is2.parser.Parser.out.println("" + mf.toString());

        for (Extractor e : extractor) {
            e.initStat();
            e.init();
        }

        depReader.startReading(file);

        int num1 = 0;

        Edges.init(mf.getFeatureCounter().get(POS));

        is2.parser.Parser.out.print("Creating edge filters and read corpus: ");
        del = 0;

        is.init(ic, new MFB());

        // Second pass: read the corpus into the instances container and record
        // which labels occur for which head/dependent POS combinations.
        while (true) {
            if (num1 % 100 == 0) {
                del = outValue(num1, del);
            }

            SentenceData09 instance1 = depReader.getNext(is);
            if (instance1 == null) {
                break;
            }

            int last = is.size() - 1;
            short[] pos = is.pposs[last];

            for (int k = 0; k < is.length(last); k++) {
                if (is.heads[last][k] < 0) {
                    continue;
                }
                Edges.put(pos[is.heads[last][k]], pos[k], k < is.heads[last][k], is.labels[last][k]);
            }

            if (!options.allFeatures && num1 > options.count) {
                break;
            }
            num1++;
        }
        del = outValue(num1, del);
        is2.parser.Parser.out.println();
        Edges.findDefault();
    }

    /**
     * Reads the instances of a file into {@code is} without registering
     * features or building edge filters.
     */
    public void getInstances(String file, Instances is) throws Exception {

        CONLLReader09 depReader = new CONLLReader09(file);

        int ic = options.count + 2;

        is.init(ic, new MFB());

        int num1 = 0, del = 0;
        while (true) {
            if (num1 % 100 == 0) {
                del = outValue(num1, del);
            }

            SentenceData09 instance1 = depReader.getNext(is);
            if (instance1 == null) {
                break;
            }

            if (!options.allFeatures && num1 > options.count) {
                break;
            }
            num1++;
        }
        del = outValue(num1, del);
        is2.parser.Parser.out.println();
    }

    /**
     * Creates an instance for outputParses.
     *
     * @param is the instances container to read into
     * @param depReader the reader to take the next sentence from
     * @return the next sentence, or null at the end of the input
     * @throws Exception
     */
    protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception {

        SentenceData09 instance = depReader.getNext(is);
        if (instance == null || instance.forms == null) {
            return null;
        }

        return instance;
    }

    public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

    /**
     * Extracts the features of all head/dependent combinations of a sentence
     * in parallel and stores their scores in {@code d}.
     */
    public DataF fillVector(F2SF params, Instances is, int inst, DataF d, Cluster cluster, int threads,
            Long2IntInterface li) throws InterruptedException {

        long ts = System.nanoTime();

        if (executerService.isShutdown()) {
            executerService = java.util.concurrent.Executors.newCachedThreadPool();
        }

        final int length = is.length(inst);
        if (d == null || d.len < length) {
            d = new DataF(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue());
        }

        // One extraction task per thread; all tasks share the score matrix d.
        ArrayList<ParallelExtract> pe = new ArrayList<>();
        for (int i = 0; i < threads; i++) {
            // DB.println("" + ((ExtractorClusterStackedR2) extractor[i]).s_dist);
            pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster, li));
        }

        // Enqueue every ordered word pair (w1, w2) with w1 != w2.
        for (int w1 = 0; w1 < length; w1++) {
            for (int w2 = 0; w2 < length; w2++) {
                if (w1 == w2) {
                    continue;
                }
                ParallelExtract.add(w1, w2);
            }
        }
        executerService.invokeAll(pe);

        timeExtract += (System.nanoTime() - ts);
        return d;
    }

    /**
     * The loss function: each word whose head is wrong contributes 1.0, each
     * word with a correct head but a wrong label contributes 0.5; the
     * artificial root is not counted.
     */
    public double errors(Instances is, int ic, Parse p) {

        if (p.heads == null) {
            p.signature2parse(p.signature());
        }

        short[] act = is.heads[ic];
        double correct = 0;

        // do not count the root
        for (int i = 1; i < act.length; i++) {
            if (p.heads[i] == act[i]) {
                correct += 0.5;
                if (p.labels[i] == is.labels[ic][i]) {
                    correct += 0.5;
                }
            }
        }

        double x = ((double) act.length - 1 - correct);

        // p.f1 = (double) correct / (double) (act.length - 1);

        return x;
    }
}
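/*
 * Usage sketch (illustrative only, not part of the original source): one way a
 * caller might drive this pipe for training. The field options.trainfile, the
 * no-argument Instances constructor, and the extractor initialization are
 * assumptions about the surrounding mate-tools code base; long2int stands for
 * a hypothetical Long2IntInterface instance.
 *
 *   OptionsSuper options = ...;                       // parsed command-line options
 *   Pipe pipe = new Pipe(options);
 *   pipe.extractor = new Extractor[Parser.THREADS];   // one extractor per worker thread
 *   // ... fill pipe.extractor with initialized extractors ...
 *
 *   Instances is = new Instances();
 *   pipe.createInstances(options.trainfile, is);      // pass 1: register features;
 *                                                     // pass 2: build edge filters
 *
 *   // During training, the scores of all head/dependent pairs of sentence i
 *   // are computed via pipe.fillVector(..., is, i, ..., pipe.cl, Parser.THREADS,
 *   // long2int), and pipe.errors(is, i, parse) gives the loss of a predicted
 *   // parse against the gold annotation.
 */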