/* OpenNLP can be found at: https://opennlp.apache.org/cgi-bin/download.cgi */
import java.io.*;
import java.util.*;

import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;

public class NGramWrapper {

    /**
     * The Stupid Backoff implementation currently assumes that a word it has never seen
     * before is equivalent to a word it has seen exactly once (i.e. very unlikely).
     */
    public final static int STUPID_BACKOFF = 0;
    // Following http://stackoverflow.com/questions/16383194/stupid-backoff-implementation-clarification,
    // except that we want to penalize the backoff more.
    public final static double STUPID_BACKOFF_ALPHA = 0.01;
    // Overwritten with 1/numberOfGrams() once a corpus has been loaded (see readFile()).
    public static double STUPID_BACKOFF_BASE = 0.0001;

    /* From Stanley F. Chen and Joshua Goodman (1998), "An Empirical Study of Smoothing
       Techniques for Language Modeling". */
    public final static int MODIFIED_KNESER_NEY = 1;
    public final static int JELINEK_MERCER = 2;

    /**
     * Sets which smoothing technique should be used.
     * Note: this is not thread-safe.
     */
    public static int smoothing = STUPID_BACKOFF;

    /* TODO? */
    private static boolean padStart = false; // Whether {<s><s><s>I} counts as a valid 3-gram.
    private static boolean padEnd = false;   // Whether {be.<e><e><e>} counts as a valid 3-gram.

    long numberOfSentences = 0;
    long numberOfTokens = 0;
    private int nGramLength;

    // One model per n-gram order: ngram[0] holds unigrams, ngram[nGramLength-1] the full-length n-grams.
    NGramModel[] ngram;

    long numberOfTokensInVocabulary = 0;
    long numberOfTokensOutOfVocabulary = 0;
    long numberOfNGramsInCoverage = 0;
    long numberOfNGramsOutofCoverage = 0;

    public static void main(String[] args) {
        File searchIn = new File("corpus.txt");
        int nGramLength = 3;
        for (int i = 0; i < args.length; i += 2) {
            if (args[i].equals("n-gram")) {
                nGramLength = Integer.parseInt(args[i + 1]);
            } else if (args[i].equals("corpus")) {
                searchIn = new File(args[i + 1]);
            } else {
                System.err.println(args[i] + " is invalid.");
            }
        }
        NGramWrapper ngw = new NGramWrapper(nGramLength);
        ngw.readFile(searchIn);
    }

    public void resetOOV() {
        numberOfTokensInVocabulary = 0;
        numberOfTokensOutOfVocabulary = 0;
    }

    /**
     * Returns the ratio of out-of-vocabulary tokens to in-vocabulary tokens counted so far,
     * or NaN if no in-vocabulary tokens have been counted.
     */
    public double getOOV() {
        if (numberOfTokensInVocabulary > 0) {
            return (double) numberOfTokensOutOfVocabulary / numberOfTokensInVocabulary;
        } else {
            return Double.NaN;
        }
    }
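    /*
     * Worked example of the Stupid Backoff scoring implemented in getCostOfNGramRecursive()
     * below.  The counts are made up for illustration and are not from any real corpus.
     * Suppose the trigram {the, black, cat} was never seen, count({black, cat}) = 2,
     * count({black}) = 20, and STUPID_BACKOFF_ALPHA = 0.01.  The score then backs off once:
     *
     *   S(the black cat) = STUPID_BACKOFF_ALPHA * S(black cat)
     *                    = 0.01 * count({black, cat}) / count({black})
     *                    = 0.01 * 2 / 20 = 0.001
     *
     * If {black, cat} had also been unseen, the recursion would bottom out at the unigram
     * {cat}, scored as count({cat}) / numberOfGrams(); an unseen unigram is given the
     * pseudo-count STUPID_BACKOFF_BASE, which readFile() sets to 1 / numberOfGrams(),
     * i.e. "never seen" is treated like "seen once".
     */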
    /**
     * Assumes that the n-gram has the correct size (i.e. no longer than nGramLength tokens).
     * @param s the n-gram to score
     * @return the smoothed score of the n-gram under the currently selected smoothing technique
     */
    public double getCostOfNGram(String[] s) {
        return getCostOfNGram(s, smoothing);
    }

    /** Updates the in/out-of-vocabulary token counters for every token in s. */
    public void updateOOV(String[] s) {
        for (int i = 0; i < s.length; i++) {
            if (ngram[0].contains(new StringList(s[i]))) {
                numberOfTokensInVocabulary++;
            } else {
                numberOfTokensOutOfVocabulary++;
            }
        }
    }

    public void resetCoverage() {
        numberOfNGramsInCoverage = 0;
        numberOfNGramsOutofCoverage = 0;
    }

    /** Updates the coverage counters with every full-length n-gram in s. */
    public void updateCoverage(String[] s) {
        // Walk backwards over s; the window [i - nGramLength, i) visits every full-length
        // n-gram, including the one that ends at the last token.
        for (int i = s.length; i >= nGramLength; i--) {
            if (ngram[ngram.length - 1].contains(new StringList(Arrays.copyOfRange(s, i - nGramLength, i)))) {
                numberOfNGramsInCoverage++;
            } else {
                numberOfNGramsOutofCoverage++;
            }
        }
    }

    /**
     * Returns the ratio of full-length n-grams that were out of coverage to those that were
     * in coverage, or NaN if none were in coverage.
     */
    public double getCoverage() {
        if (numberOfNGramsInCoverage > 0) {
            return (double) numberOfNGramsOutofCoverage / numberOfNGramsInCoverage;
        } else {
            return Double.NaN;
        }
    }

    public double getCostOfNGram(String[] s, int smoothing) {
        return getCostOfNGramRecursive(s, smoothing);
    }

    private double getCostOfNGramRecursive(String[] s, int smoothing) {
        double value = 0;
        switch (smoothing) {
            case STUPID_BACKOFF:
                // From http://stackoverflow.com/questions/16383194/stupid-backoff-implementation-clarification
                if (s.length > 1) {
                    value = counts(s);
                    // The context is the n-gram without its last word; the backoff drops the
                    // first word instead.
                    String[] context = Arrays.copyOfRange(s, 0, s.length - 1);
                    String[] backoff = Arrays.copyOfRange(s, 1, s.length);
                    if (value > 0) {
                        value /= counts(context);
                    } else {
                        value = STUPID_BACKOFF_ALPHA * getCostOfNGramRecursive(backoff, STUPID_BACKOFF);
                    }
                } else {
                    // This is only "valid" because we will have a small corpus.
                    double counts = STUPID_BACKOFF_BASE;
                    if (s.length > 0) {
                        counts = counts(s);
                    }
                    if (counts == 0) {
                        counts = STUPID_BACKOFF_BASE;
                    }
                    double total = ngram[0].numberOfGrams();
                    value = counts / total;
                }
                break;
            case JELINEK_MERCER:
                // From http://nlp.stanford.edu/~wcmac/papers/20050421-smoothing-tutorial.pdf
                throw new UnsupportedOperationException("Jelinek-Mercer smoothing is not implemented yet.");
            default:
                throw new IllegalArgumentException();
        }
        return value;
    }

    public NGramWrapper(int nGramLength) {
        this.nGramLength = nGramLength;
        ngram = new NGramModel[nGramLength];
    }

    public boolean exists(String[] s) {
        return ngram[s.length - 1].contains(new StringList(s));
    }

    public int counts(String[] s) {
        return ngram[s.length - 1].getCount(new StringList(s));
    }

    public NGramModel getNgram() {
        return ngram[ngram.length - 1];
    }

    /** Counts the number of distinct n-grams stored in the given model. */
    private int getNumberOfNGrams(NGramModel ngm) {
        Iterator<StringList> iterator = ngm.iterator();
        int count = 0;
        while (iterator.hasNext()) {
            iterator.next();
            count++;
        }
        return count;
    }

    public int getNGramLength() {
        return nGramLength;
    }
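    /*
     * Sketch of how the empty JELINEK_MERCER case in getCostOfNGramRecursive() could be
     * filled in.  Classic Jelinek-Mercer smoothing linearly interpolates the
     * maximum-likelihood estimate of each order with the smoothed estimate of the next
     * lower order (see the Chen & Goodman reference above).  This method and its fixed
     * lambda are illustrative assumptions; it is not wired into the switch statement above.
     * Assumes s is non-empty.
     */
    private double jelinekMercerSketch(String[] s, double lambda) {
        if (s.length == 1) {
            // Lowest order: maximum-likelihood unigram estimate.
            return (double) counts(s) / ngram[0].numberOfGrams();
        }
        // Maximum-likelihood estimate of the current order: count of the full n-gram
        // divided by the count of its (n-1)-word context.
        String[] context = Arrays.copyOfRange(s, 0, s.length - 1);
        double ml = counts(context) > 0 ? (double) counts(s) / counts(context) : 0;
        // Recurse on the lower order obtained by dropping the first word.
        String[] lowerOrder = Arrays.copyOfRange(s, 1, s.length);
        return lambda * ml + (1 - lambda) * jelinekMercerSketch(lowerOrder, lambda);
    }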
    /**
     * Reads the corpus file once per n-gram order and fills ngram[0..nGramLength-1].
     * The file is expected to be UTF-16BE encoded with one whitespace-tokenised sentence per line.
     */
    public void readFile(File f) {
        System.err.println(f.getAbsolutePath());
        for (int i = 0; i < ngram.length; i++) {
            numberOfSentences = 0;
            numberOfTokens = 0; // Reset per pass so the token count is not accumulated across orders.
            long time = System.currentTimeMillis();
            try {
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-16BE"));
                String newLine = br.readLine();
                ngram[i] = new NGramModel();
                while (newLine != null) {
                    newLine = newLine.trim();
                    addNGrams(newLine, (i + 1), ngram[i]);
                    numberOfSentences++;
                    newLine = br.readLine();
                }
                br.close();
                System.err.println("N-Gram size = " + (i + 1));
                /* TODO: fix. */
                System.err.println("Total ngram length = " + getNumberOfNGrams(ngram[i])); // Counts distinct n-grams.
                System.err.println("Total lines = " + numberOfSentences);
                System.err.println("Total tokens = " + numberOfTokens);
            } catch (IOException e) {
                e.printStackTrace();
            }
            time = System.currentTimeMillis() - time;
            System.err.println("Loaded in " + (time / 1000 / 60) + " min.");
        }
        // Treat a never-seen unigram as if it had been seen exactly once.
        STUPID_BACKOFF_BASE = 1D / ngram[0].numberOfGrams();
    }

    /** Adds every n-gram of the given length from the whitespace-tokenised string to ngm. */
    private void addNGrams(String string, int length, NGramModel ngm) {
        String[] input = string.split("( )+");
        numberOfTokens += input.length;
        for (int i = 0; i < input.length - length + 1; i++) {
            String[] ngram = new String[length];
            for (int j = 0; j < length; j++) {
                ngram[j] = input[i + j];
            }
            ngm.add(new StringList(ngram));
        }
    }
}
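/*
 * Minimal usage sketch.  This demo class is not part of the original wrapper; the file
 * name and the example tokens are hypothetical, and the corpus is assumed to be UTF-16BE
 * encoded with one whitespace-tokenised sentence per line, as expected by readFile().
 */
class NGramWrapperDemo {
    public static void main(String[] args) {
        NGramWrapper ngw = new NGramWrapper(3);     // Build unigram, bigram and trigram models.
        ngw.readFile(new File("corpus.txt"));       // Hypothetical corpus file.

        String[] trigram = {"the", "black", "cat"}; // Hypothetical tokens; length must equal nGramLength.
        System.out.println("Score: " + ngw.getCostOfNGram(trigram));

        ngw.updateOOV(trigram);                     // Track which of the tokens are in the unigram vocabulary.
        System.out.println("OOV ratio (out/in): " + ngw.getOOV());
    }
}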