/* OpenNLP can be found at: https://opennlp.apache.org/cgi-bin/download.cgi */
import java.io.*;
import java.util.*;

import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;

public class NGramWrapper {

    /**
     * The Stupid Backoff implementation currently assumes that a word it has never seen
     * before is equivalent to a word it has seen exactly once (i.e. very unlikely).
     */
    public final static int STUPID_BACKOFF = 0;
    // Following http://stackoverflow.com/questions/16383194/stupid-backoff-implementation-clarification,
    // except that we want to penalize the backoff more.
    public final static double STUPID_BACKOFF_ALPHA = 0.01;
    // Overwritten with 1/numberOfGrams() once a corpus has been loaded (see readFile()).
    public static double STUPID_BACKOFF_BASE = 0.0001;

    /* From Stanley F. Chen and Joshua Goodman (1998), "An Empirical Study of Smoothing
       Techniques for Language Modeling". */
    public final static int MODIFIED_KNESER_NEY = 1;
    public final static int JELINEK_MERCER = 2;

    /**
     * Sets which smoothing technique should be used.
     * Note: this is not thread-safe.
     */
    public static int smoothing = STUPID_BACKOFF;

    /* TODO? */
    private static boolean padStart = false; // Whether {<s><s><s>I} counts as a valid 3-gram.
    private static boolean padEnd = false;   // Whether {be.<e><e><e>} counts as a valid 3-gram.

    long numberOfSentences = 0;
    long numberOfTokens = 0;
    private int nGramLength;

    // One model per n-gram order: ngram[0] holds unigrams, ngram[nGramLength-1] the full-length n-grams.
    NGramModel[] ngram;

    long numberOfTokensInVocabulary = 0;
    long numberOfTokensOutOfVocabulary = 0;
    long numberOfNGramsInCoverage = 0;
    long numberOfNGramsOutofCoverage = 0;

    public static void main(String[] args) {
        File searchIn = new File("corpus.txt");
        int nGramLength = 3;
        for (int i = 0; i < args.length; i += 2) {
            if (args[i].equals("n-gram")) {
                nGramLength = Integer.parseInt(args[i + 1]);
            } else if (args[i].equals("corpus")) {
                searchIn = new File(args[i + 1]);
            } else {
                System.err.println(args[i] + " is invalid.");
            }
        }
        NGramWrapper ngw = new NGramWrapper(nGramLength);
        ngw.readFile(searchIn);
    }

    public void resetOOV() {
        numberOfTokensInVocabulary = 0;
        numberOfTokensOutOfVocabulary = 0;
    }

    /**
     * Returns the ratio of out-of-vocabulary tokens to in-vocabulary tokens counted so far,
     * or NaN if no in-vocabulary tokens have been counted.
     */
    public double getOOV() {
        if (numberOfTokensInVocabulary > 0) {
            return (double) numberOfTokensOutOfVocabulary / numberOfTokensInVocabulary;
        } else {
            return Double.NaN;
        }
    }
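    /*
     * Worked example of the Stupid Backoff scoring implemented in getCostOfNGramRecursive()
     * below.  The counts are made up for illustration and are not from any real corpus.
     * Suppose the trigram {the, black, cat} was never seen, count({black, cat}) = 2,
     * count({black}) = 20, and STUPID_BACKOFF_ALPHA = 0.01.  The score then backs off once:
     *
     *   S(the black cat) = STUPID_BACKOFF_ALPHA * S(black cat)
     *                    = 0.01 * count({black, cat}) / count({black})
     *                    = 0.01 * 2 / 20 = 0.001
     *
     * If {black, cat} had also been unseen, the recursion would bottom out at the unigram
     * {cat}, scored as count({cat}) / numberOfGrams(); an unseen unigram is given the
     * pseudo-count STUPID_BACKOFF_BASE, which readFile() sets to 1 / numberOfGrams(),
     * i.e. "never seen" is treated like "seen once".
     */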
    /**
     * Assumes that the n-gram has the correct size (i.e. no longer than nGramLength tokens).
     * @param s the n-gram to score
     * @return the smoothed score of the n-gram under the currently selected smoothing technique
     */
    public double getCostOfNGram(String[] s) {
        return getCostOfNGram(s, smoothing);
    }

    /** Updates the in/out-of-vocabulary token counters for every token in s. */
    public void updateOOV(String[] s) {
        for (int i = 0; i < s.length; i++) {
            if (ngram[0].contains(new StringList(s[i]))) {
                numberOfTokensInVocabulary++;
            } else {
                numberOfTokensOutOfVocabulary++;
            }
        }
    }

    public void resetCoverage() {
        numberOfNGramsInCoverage = 0;
        numberOfNGramsOutofCoverage = 0;
    }

    /** Updates the coverage counters with every full-length n-gram in s. */
    public void updateCoverage(String[] s) {
        // Walk backwards over s; the window [i - nGramLength, i) visits every full-length
        // n-gram, including the one that ends at the last token.
        for (int i = s.length; i >= nGramLength; i--) {
            if (ngram[ngram.length - 1].contains(new StringList(Arrays.copyOfRange(s, i - nGramLength, i)))) {
                numberOfNGramsInCoverage++;
            } else {
                numberOfNGramsOutofCoverage++;
            }
        }
    }

    /**
     * Returns the ratio of full-length n-grams that were out of coverage to those that were
     * in coverage, or NaN if none were in coverage.
     */
    public double getCoverage() {
        if (numberOfNGramsInCoverage > 0) {
            return (double) numberOfNGramsOutofCoverage / numberOfNGramsInCoverage;
        } else {
            return Double.NaN;
        }
    }

    public double getCostOfNGram(String[] s, int smoothing) {
        return getCostOfNGramRecursive(s, smoothing);
    }

    private double getCostOfNGramRecursive(String[] s, int smoothing) {
        double value = 0;
        switch (smoothing) {
            case STUPID_BACKOFF:
                // From http://stackoverflow.com/questions/16383194/stupid-backoff-implementation-clarification
                if (s.length > 1) {
                    value = counts(s);
                    // The context is the n-gram without its last word; the backoff drops the
                    // first word instead.
                    String[] context = Arrays.copyOfRange(s, 0, s.length - 1);
                    String[] backoff = Arrays.copyOfRange(s, 1, s.length);
                    if (value > 0) {
                        value /= counts(context);
                    } else {
                        value = STUPID_BACKOFF_ALPHA * getCostOfNGramRecursive(backoff, STUPID_BACKOFF);
                    }
                } else {
                    // This is only "valid" because we will have a small corpus.
                    double counts = STUPID_BACKOFF_BASE;
                    if (s.length > 0) {
                        counts = counts(s);
                    }
                    if (counts == 0) {
                        counts = STUPID_BACKOFF_BASE;
                    }
                    double total = ngram[0].numberOfGrams();
                    value = counts / total;
                }
                break;
            case JELINEK_MERCER:
                // From http://nlp.stanford.edu/~wcmac/papers/20050421-smoothing-tutorial.pdf
                throw new UnsupportedOperationException("Jelinek-Mercer smoothing is not implemented yet.");
            default:
                throw new IllegalArgumentException();
        }
        return value;
    }

    public NGramWrapper(int nGramLength) {
        this.nGramLength = nGramLength;
        ngram = new NGramModel[nGramLength];
    }

    public boolean exists(String[] s) {
        return ngram[s.length - 1].contains(new StringList(s));
    }

    public int counts(String[] s) {
        return ngram[s.length - 1].getCount(new StringList(s));
    }

    public NGramModel getNgram() {
        return ngram[ngram.length - 1];
    }

    /** Counts the number of distinct n-grams stored in the given model. */
    private int getNumberOfNGrams(NGramModel ngm) {
        Iterator<StringList> iterator = ngm.iterator();
        int count = 0;
        while (iterator.hasNext()) {
            iterator.next();
            count++;
        }
        return count;
    }

    public int getNGramLength() {
        return nGramLength;
    }
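    /*
     * Sketch of how the empty JELINEK_MERCER case in getCostOfNGramRecursive() could be
     * filled in.  Classic Jelinek-Mercer smoothing linearly interpolates the
     * maximum-likelihood estimate of each order with the smoothed estimate of the next
     * lower order (see the Chen & Goodman reference above).  This method and its fixed
     * lambda are illustrative assumptions; it is not wired into the switch statement above.
     * Assumes s is non-empty.
     */
    private double jelinekMercerSketch(String[] s, double lambda) {
        if (s.length == 1) {
            // Lowest order: maximum-likelihood unigram estimate.
            return (double) counts(s) / ngram[0].numberOfGrams();
        }
        // Maximum-likelihood estimate of the current order: count of the full n-gram
        // divided by the count of its (n-1)-word context.
        String[] context = Arrays.copyOfRange(s, 0, s.length - 1);
        double ml = counts(context) > 0 ? (double) counts(s) / counts(context) : 0;
        // Recurse on the lower order obtained by dropping the first word.
        String[] lowerOrder = Arrays.copyOfRange(s, 1, s.length);
        return lambda * ml + (1 - lambda) * jelinekMercerSketch(lowerOrder, lambda);
    }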
    /**
     * Reads the corpus file once per n-gram order and fills ngram[0..nGramLength-1].
     * The file is expected to be UTF-16BE encoded with one whitespace-tokenised sentence per line.
     */
    public void readFile(File f) {
        System.err.println(f.getAbsolutePath());
        for (int i = 0; i < ngram.length; i++) {
            numberOfSentences = 0;
            numberOfTokens = 0; // Reset per pass so the token count is not accumulated across orders.
            long time = System.currentTimeMillis();
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-16BE"));
                String newLine = br.readLine();
                ngram[i] = new NGramModel();
                while (newLine != null) {
                    newLine = newLine.trim();
                    addNGrams(newLine, (i + 1), ngram[i]);
                    numberOfSentences++;
                    newLine = br.readLine();
                }
                br.close();
                System.err.println("N-Gram size = " + (i + 1));
                /* TODO: fix. */
                System.err.println("Total ngram length = " + getNumberOfNGrams(ngram[i])); // Counts distinct n-grams.
                System.err.println("Total lines = " + numberOfSentences);
                System.err.println("Total tokens = " + numberOfTokens);
            } catch (IOException e) {
                e.printStackTrace();
            }
            time = System.currentTimeMillis() - time;
            System.err.println("Loaded in " + (time / 1000 / 60) + " min.");
        }
        // Treat a never-seen unigram as if it had been seen exactly once.
        STUPID_BACKOFF_BASE = 1D / ngram[0].numberOfGrams();
    }

    /** Adds every n-gram of the given length from the whitespace-tokenised string to ngm. */
    private void addNGrams(String string, int length, NGramModel ngm) {
        String[] input = string.split("( )+");
        numberOfTokens += input.length;
        for (int i = 0; i < input.length - length + 1; i++) {
            String[] ngram = new String[length];
            for (int j = 0; j < length; j++) {
                ngram[j] = input[i + j];
            }
            ngm.add(new StringList(ngram));
        }
    }
}
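/*
 * Minimal usage sketch.  This demo class is not part of the original wrapper; the file
 * name and the example tokens are hypothetical, and the corpus is assumed to be UTF-16BE
 * encoded with one whitespace-tokenised sentence per line, as expected by readFile().
 */
class NGramWrapperDemo {
    public static void main(String[] args) {
        NGramWrapper ngw = new NGramWrapper(3);     // Build unigram, bigram and trigram models.
        ngw.readFile(new File("corpus.txt"));       // Hypothetical corpus file.

        String[] trigram = {"the", "black", "cat"}; // Hypothetical tokens; length must equal nGramLength.
        System.out.println("Score: " + ngw.getCostOfNGram(trigram));

        ngw.updateOOV(trigram);                     // Track which of the tokens are in the unigram vocabulary.
        System.out.println("OOV ratio (out/in): " + ngw.getOOV());
    }
}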