package edu.stanford.nlp.benchmarks;
import edu.stanford.nlp.classify.Dataset;
import edu.stanford.nlp.classify.LinearClassifierFactory;
import edu.stanford.nlp.classify.RVFDataset;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.optimization.DiffFunction;
import edu.stanford.nlp.optimization.Minimizer;
import edu.stanford.nlp.optimization.SGDMinimizer;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Factory;
import java.util.*;
/**
 * Created by keenon on 6/19/15.
 *
 * Quick-and-dirty (and not entirely representative) benchmarks for judging improvement as we optimize things.
 */
public class Benchmarks {
    /**
     * 67% of time spent in LogConditionalObjectiveFunction.rvfcalculate()
     * 29% of time spent in dataset construction (11% in RVFDataset.addFeatures(), 7% in RVF incrementCount(), 11% rest)
     *
     * Single threaded: 4700 ms
     * Multi threaded: 700 ms
     *
     * With the same data for every datum (seed 42): 245 ms
     * With accesses reordered for caching (see testAdjacency() below): 195 ms
     * Down to 80% of the time -- not huge, but a win nonetheless
     *
     * With 8 CPUs, a 6.7x speedup -- almost, but not quite, linear; pretty good
     */
public static void benchmarkRVFLogisticRegression() {
RVFDataset<String, String> data = new RVFDataset<>();
for (int i = 0; i < 10000; i++) {
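            // NOTE: re-seeding inside the loop is deliberate -- every datum draws the
            // identical random sequence (so cl is the same for every i), which is the
            // "same data, seed 42" case described above.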
Random r = new Random(42);
Counter<String> features = new ClassicCounter<>();
boolean cl = r.nextBoolean();
for (int j = 0; j < 1000; j++) {
double value;
if (cl && i % 2 == 0) {
value = (r.nextDouble()*2.0)-0.6;
}
else {
value = (r.nextDouble()*2.0)-1.4;
}
features.incrementCount("f" + j, value);
}
data.add(new RVFDatum<>(features, "target:" + cl));
}
LinearClassifierFactory<String, String> factory = new LinearClassifierFactory<>();
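        // Wall-clock timing with currentTimeMillis(); for the sub-100 ms runs noted
        // in the javadoc, System.nanoTime() would be a finer-grained clock.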
long msStart = System.currentTimeMillis();
factory.trainClassifier(data);
long delay = System.currentTimeMillis() - msStart;
System.out.println("Training took "+delay+" ms");
}
    /**
     * 57% of time spent in LogConditionalObjectiveFunction.calculateCLBatch()
     * 22% of time spent constructing datums (expensive)
     *
     * Single threaded: 4100 ms
     * Multi threaded: 600 ms
     *
     * With the same data for every datum (seed 42): 52 ms
     * With accesses reordered for caching: 38 ms
     * Down to 73% of the time
     *
     * With 8 CPUs, a 6.8x speedup -- basically the same as with RVFDatum
     */
public static void benchmarkLogisticRegression() {
Dataset<String, String> data = new Dataset<>();
for (int i = 0; i < 10000; i++) {
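            // Deliberately re-seeded per datum, as in benchmarkRVFLogisticRegression().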
Random r = new Random(42);
Set<String> features = new HashSet<>();
boolean cl = r.nextBoolean();
for (int j = 0; j < 1000; j++) {
if (cl && i % 2 == 0) {
if (r.nextDouble() > 0.3) {
features.add("f:"+j+":true");
}
else {
features.add("f:"+j+":false");
}
}
else {
if (r.nextDouble() > 0.3) {
features.add("f:" + j + ":false");
}
else {
features.add("f:"+j+":false");
}
}
}
data.add(new BasicDatum<String, String>(features, "target:" + cl));
}
LinearClassifierFactory<String, String> factory = new LinearClassifierFactory<>();
long msStart = System.currentTimeMillis();
factory.trainClassifier(data);
long delay = System.currentTimeMillis() - msStart;
System.out.println("Training took "+delay+" ms");
}
    /**
     * 29% in FactorTable.getValue()
     * 28% in CRFCliqueTree.getCalibratedCliqueTree()
     * 12.6% waiting for threads
     *
     * Single threaded: 15000 ms - 26000 ms
     * Multi threaded: 4500 ms - 7000 ms
     *
     * With 8 CPUs, a 3.3x - 3.7x speedup at around 800% utilization
     */
public static void benchmarkCRF() {
Properties props = new Properties();
props.setProperty("macro", "true"); // use a generic CRF configuration
props.setProperty("useIfInteger", "true");
props.setProperty("featureFactory", "edu.stanford.nlp.benchmarks.BenchmarkFeatureFactory");
props.setProperty("saveFeatureIndexToDisk", "false");
        CRFClassifier<CoreLabel> crf = new CRFClassifier<>(props);
Random r = new Random(42);
List<List<CoreLabel>> data = new ArrayList<>();
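        // Build 100 synthetic sentences of 20 tokens each; tags alternate by token
        // position and are flipped ~30% of the time as label noise.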
for (int i = 0; i < 100; i++) {
List<CoreLabel> sentence = new ArrayList<>();
for (int j = 0; j < 20; j++) {
CoreLabel l = new CoreLabel();
l.setWord("j:"+j);
                boolean tag = (j % 2 == 0) ^ (r.nextDouble() > 0.7);
l.set(CoreAnnotations.AnswerAnnotation.class, "target:"+tag);
sentence.add(l);
}
data.add(sentence);
}
long msStart = System.currentTimeMillis();
crf.train(data);
long delay = System.currentTimeMillis() - msStart;
System.out.println("Training took "+delay+" ms");
}
public static void benchmarkSGD() {
Dataset<String, String> data = new Dataset<>();
for (int i = 0; i < 10000; i++) {
Random r = new Random(42);
Set<String> features = new HashSet<>();
boolean cl = r.nextBoolean();
for (int j = 0; j < 1000; j++) {
if (cl && i % 2 == 0) {
if (r.nextDouble() > 0.3) {
features.add("f:"+j+":true");
}
else {
features.add("f:"+j+":false");
}
}
else {
if (r.nextDouble() > 0.3) {
features.add("f:" + j + ":false");
}
else {
features.add("f:"+j+":false");
}
}
}
data.add(new BasicDatum<String, String>(features, "target:" + cl));
}
LinearClassifierFactory<String, String> factory = new LinearClassifierFactory<>();
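        // Swap the factory's default minimizer for plain stochastic gradient descent.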
factory.setMinimizerCreator(new Factory<Minimizer<DiffFunction>>() {
@Override
public Minimizer<DiffFunction> create() {
return new SGDMinimizer<DiffFunction>(0.1, 100, 0, 1000);
}
});
long msStart = System.currentTimeMillis();
factory.trainClassifier(data);
long delay = System.currentTimeMillis() - msStart;
System.out.println("Training took "+delay+" ms");
}
public static void benchmarkDatum() {
long msStart = System.currentTimeMillis();
Dataset<String, String> data = new Dataset<>();
for (int i = 0; i < 10000; i++) {
Random r = new Random(42);
Set<String> features = new HashSet<>();
boolean cl = r.nextBoolean();
for (int j = 0; j < 1000; j++) {
if (cl && i % 2 == 0) {
if (r.nextDouble() > 0.3) {
features.add("f:"+j+":true");
}
else {
features.add("f:"+j+":false");
}
}
else {
if (r.nextDouble() > 0.3) {
features.add("f:" + j + ":false");
}
else {
features.add("f:"+j+":false");
}
}
}
data.add(new BasicDatum<String, String>(features, "target:" + cl));
}
long delay = System.currentTimeMillis() - msStart;
System.out.println("Dataset construction took "+delay+" ms");
msStart = System.currentTimeMillis();
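        // Re-run the same loops without building feature strings or datums, so
        // comparing against the timing above isolates the construction cost.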
for (int i = 0; i < 10000; i++) {
Random r = new Random(42);
Set<String> features = new HashSet<>();
boolean cl = r.nextBoolean();
            for (int j = 0; j < 1000; j++) {
                r.nextDouble(); // same single draw per feature as above; result discarded
            }
}
delay = System.currentTimeMillis() - msStart;
System.out.println("MultiVector took "+delay+" ms");
}
    /**
     * On my machine this results in roughly a factor-of-two gain. A Java
     * double[10000][1000] is an array of row arrays, so scanning j within a
     * fixed row walks contiguous memory, while scanning rows for a fixed j
     * jumps between separately allocated rows and defeats the cache.
     */
public static void testAdjacency() {
double[][] sqar = new double[10000][1000];
Random r = new Random();
        double k = 0; // sink for the reads, so the JIT cannot eliminate the scans
long msStart = System.currentTimeMillis();
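        // Cache-friendly: for each randomly chosen row, the inner loop reads
        // sqar[loc][0..999] sequentially, making full use of each cache line.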
for (int i = 0; i < 10000; i++) {
int loc = r.nextInt(10000);
for (int j = 0; j < 1000; j++) {
                k += sqar[loc][j];
}
}
long delay = System.currentTimeMillis() - msStart;
System.out.println("Scanning with cache friendly lookups took "+delay+" ms");
int[] randLocs = new int[10000];
for (int i = 0; i < 10000; i++) {
randLocs[i] = r.nextInt(10000);
}
k = 0;
msStart = System.currentTimeMillis();
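        // Cache-unfriendly: for a fixed column j, each read touches a different
        // randomly chosen row array, so nearly every access misses the cache.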
for (int j = 0; j < 1000; j++) {
for (int i = 0; i < 10000; i++) {
                k += sqar[randLocs[i]][j];
}
}
delay = System.currentTimeMillis() - msStart;
System.out.println("Scanning with cache UNfriendly lookups took "+delay+" ms");
}
public static void main(String[] args) {
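        // Loop so that later iterations measure warmed-up (JIT-compiled) code.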
for (int i = 0; i < 100; i++) {
// benchmarkRVFLogisticRegression();
// benchmarkLogisticRegression();
benchmarkSGD();
// benchmarkCRF();
// testAdjacency();
}
}
}