import java.io.*; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.lang.String; import java.lang.StringBuilder; import java.util.Arrays; import java.util.Iterator; import opennlp.tools.util.StringList; import opennlp.tools.ngram.NGramModel; public class asketTest { final static String[] transitions = {" ", ",COMMA", ".PERIOD", "?QMARK", "!EXCL"}; //Change space to null public static void main(String[] args) { int nGramLength = 4; String trainOn = "ppCorpus.txt"; String evaluate = "testSentences.txt"; NGramWrapper ngw = new NGramWrapper(nGramLength); ngw.readFile(new File(trainOn)); try { BufferedReader br = new BufferedReader(new FileReader(evaluate)); int counter = 6; while(counter>0) { //Risky? String fix = br.readLine().trim(); System.err.println("---------------------"); System.err.println(fix); String input[] = fix.split("( )+"); if(input.length<nGramLength) { //This is not correct oneWordJump(input, input.length, ngw); } else { oneWordJump(input, nGramLength, ngw); } counter--; } } catch(IOException e) { e.printStackTrace(); } /* String[][] test = HFSA(args); for(int i = 0; i < test.length; i++) { for(int j = 0; j < test[i].length; j++) { System.err.print(test[i][j] + " "); } System.err.println(); } */ /* int nGramLength = 2; for(int i = 0; i < args.length; i += 2) { if(args[i].equals("n-gram")) { nGramLength = Integer.parseInt(args[i+1]); } } handleInput(nGramLength); */ } public static void removeMultiplePunctuations(File f) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-16BE")); StringBuffer buff = new StringBuffer(); while(br.ready()) { buff.append(br.readLine().replaceAll("( )+", " ").replaceAll("(\\.PERIOD )+", ".PERIOD ")); buff.append('\n'); } br.close(); OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(f), "UTF-16BE"); osw.write(buff.toString()); osw.close(); } /** * The underlying assumption is that fakeHyperFSA()[0] is space and that space is the most common in-between in the corpus. * matrix[][][0] = the score * matrix[][][1] = used for backtracking purposes and terminates on -1. */ private static String[] dynProg(final String[] punctuation, final String[] words, final int NGramLength) { double[][][] matrix = new double[words.length][punctuation.length][2]; /* Init */ String[][] temp = fakeHyperFSA(words, 0, NGramLength); for(int i = 0; i < matrix[0].length; i++) { double NGramScore = fakeNGramValue(temp[i]); matrix[0][i][0] = NGramScore; matrix[0][i][1] = -1; //-1 will be used to stop the backtrack. } /* Calculating matrix ... */ for(int i = 1; i < matrix.length-NGramLength; i++) { String[][] NGrams = fakeHyperFSA(words, i, NGramLength); for(int j = 0; j < matrix[i].length; j++) { for(int k = 0; k < NGramLength; k++) { double NGramScore = fakeNGramValue(NGrams[k]); } } } /* Decode step */ return null; } private static String[][] fakeHyperFSA(String[] s, int from, int NGramLength) { return null; } private static double fakeNGramValue(String[] s) { return 0; } /** Continuously step through the input */ private static void oneWordStep(String words[], int NGramLength, NGramWrapper ngw) { /* Init must be all possible versions of the first n-gram size. */ int version = -1; char[] transitions = {' ', ',', '.', '?'}; int possibleCombinations = NGramLength*transitions.length*transitions.length+1; double maxValue = Double.MIN_VALUE; for(int i = 0; i < possibleCombinations; i++) { } } private static String[][] createPermuatedStrings(final String words[], final int numberOfPermuations) { String[][] permuations = new String[numberOfPermuations][words.length]; final char[] transitions = {' ', ',', '.', '?'}; String[] tempArg = new String[words.length/2]; System.arraycopy(words, 0, tempArg, 0, tempArg.length); String[] tempReturn = tempFSA(tempArg); String[][] moreTemporary = new String[numberOfPermuations][words.length]; for(int i = 0; i < tempReturn.length; i++) { System.arraycopy(tempReturn[i].split("[ ,.?!]+"), 0, moreTemporary[i], 0, tempReturn[i].split("[ ,.?!]+").length); } if(tempReturn[0].split("[ ,.?!]+").length!=words.length) { //A word is missing ... } if(permuations[permuations.length-1]!=null) { System.err.println("Last entry of permutations is != null"); System.exit(1); } permuations[permuations.length-1]=words; return permuations; } /** * Note that the method does definitvely not do this. If a . or ! or ? is detected, make the word afterward the start of a new sentence. */ private static void oneWordJump(String words[], int NGramLength, NGramWrapper ngw) { /* Init */ String[] start = initiateOneWordJump(words, NGramLength, ngw); StringBuilder result = new StringBuilder(); for(int i = 0; i < start.length; i++) { result.append(start[i]).append(' '); System.err.print(start[i]+" "); } System.err.println(); int startPos = 0; for(int i = 0; i < NGramLength; i++) { for(int j = startPos; j < NGramLength; j++) { if(start[i].equals(words[j])) { startPos=j+1; break; } } } for(int i = startPos; i < words.length; i++) { int index = 1; int emergencyBreak = 3; //To prevent repetition of the same word while(index>0 && emergencyBreak>0) { index = Integer.MAX_VALUE; System.arraycopy(start, 1, start, 0, NGramLength - 1); double maxValue = Double.NEGATIVE_INFINITY; for (int j = transitions.length-1; j >= 0; j--) { if (transitions[j] != transitions[0]) { start[start.length - 1] = transitions[j]; } else { start[start.length - 1] = words[i]; } double currentValue = ngw.getCostOfNGram(start); if(currentValue>maxValue) { maxValue=currentValue; index=j; } } if(index==0) { result.append(words[i]).append(' '); emergencyBreak++; } else { result.append(transitions[index]).append(' '); } emergencyBreak--; } if(emergencyBreak==0) { //If emergency brake hits... result.append(words[i]).append(' '); } } System.out.println(result.toString()); } /** * Initiates oneWordJump. * @param words * @param NGramLength * @param ngw * @return */ private static String[] initiateOneWordJump(String words[], int NGramLength, NGramWrapper ngw) { String[] initString = new String[NGramLength]; System.arraycopy(words, 0, initString, 0, NGramLength); String[][] potentialStartValues = HFSA(initString); double maxValue = Double.NEGATIVE_INFINITY; int index = -1; for(int i = 0; i < potentialStartValues.length; i++) { /* for(int j = 0; j < potentialStartValues[i].length; j++) { System.err.print(potentialStartValues[i][j]+" "); } System.err.println(); */ double currentValue = ngw.getCostOfNGram(potentialStartValues[i]); //System.err.println(currentValue); if(currentValue>maxValue) { maxValue=currentValue; index=i; } } return potentialStartValues[index]; } /** * A case insensitive HFSA ... * @param input * @return */ private static String[][] HFSA(String[] input) { int NGramLength = input.length; //String[] transitions = {" ", ",COMMA", ".PERIOD", "?QMARK", "!EXCL"}; //Change space to null int numberOfTransitionsInString = input.length/2; int internalCounters[] = new int[numberOfTransitionsInString]; Arrays.fill(internalCounters, transitions.length-1); int numberOfReturnValues = (int)Math.pow(transitions.length, numberOfTransitionsInString); String[][] returnValue = new String[numberOfReturnValues][NGramLength]; int counter = 0; while (internalCounters[0]>=0) { int positionInInnerArray = 0; int positionInOuterArray = 0; String[] string = new String[input.length]; for(int i = 0; i < numberOfTransitionsInString; i++) { string[positionInInnerArray] = input[positionInOuterArray]; positionInOuterArray++; positionInInnerArray++; if(!transitions[internalCounters[i]].equals(" ")) { string[positionInInnerArray] = transitions[internalCounters[i]]; positionInInnerArray++; } else { string[positionInInnerArray] = input[positionInOuterArray]; positionInInnerArray++; positionInOuterArray++; } } for(int i = positionInInnerArray; i < NGramLength; i++) { string[i] = input[positionInOuterArray]; positionInOuterArray++; } internalCounters[internalCounters.length-1]--; for(int i = internalCounters.length-1; i > 0; i--) { if(internalCounters[i]<0) { internalCounters[i] = transitions.length-1; internalCounters[i-1]--; } } returnValue[counter] = string; counter++; } return returnValue; } /** * Note, this method DOES NOT WORK. It SHOULD generate ArrayIndexOutOfBoundsException. * @param input * @return */ private static String[] tempFSA(String[] input) { char[] transitions = {' ', ',', '.', '?'}; int numberOfTransitions = transitions.length; String[] output = new String[input.length*input.length*numberOfTransitions]; int counter = 0; int internalCounters[] = new int[input.length]; Arrays.fill(internalCounters, (transitions.length - 1)); while (internalCounters[0]>=0) { StringBuilder sb = new StringBuilder(); for(int i = 0; i < input.length; i++) { sb.append(input[i]); sb.append(transitions[internalCounters[i]]); } //System.err.println(internalCounters[internalCounters.length-1]); internalCounters[internalCounters.length-1]--; for(int i = 1; i < internalCounters.length; i++) { if(internalCounters[i]<0) { internalCounters[i] = numberOfTransitions-1; internalCounters[i-1]--; } } //System.err.println(sb.toString()); output[counter] = sb.toString(); counter++; } return output; } private static void handleInput(int nGramLength) { NGramWrapper ngw = new NGramWrapper(nGramLength); ngw.readFile(new File("/Users/JAsketorp/Documents/DD2380/smsCorpusAsText.txt")); System.err.println("Corpus:"); System.err.println("Number of sentences: "+ngw.numberOfSentences); System.err.println("Number of tokens: "+ngw.numberOfTokens); System.err.println("Number of grams: "+ngw.getNgram().numberOfGrams()); //ngw.serialize((OutputStream)(new FileOutputStream("test.txt"))); try { BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); String input = br.readLine(); while(input != null) { String[][] fromInput = createNGramsFromText(nGramLength, input.split(" ")); //String temp[] = input.split(" "); //String FSAInput[] = {temp[0], temp[1]}; /* HyperStringFSA versions = new HyperStringFSA(FSAInput); System.err.println("-----------------------HYPERSTRING---------------------------"); for(String s: versions.outputs) { System.err.println(s); } */ String[] extraOut = tempFSA(input.split(" ")); boolean found = true; for(String s: extraOut) { if(ngw.exists(s.split(" "))) { System.err.println("Possible string: " + s); System.err.println("Counts = " + ngw.counts(s.split(" "))); found = false; } } if(found) { System.err.println("Was not found in corpus"); } input = br.readLine(); } } catch(IOException e) { e.printStackTrace(); } } private static String[][] createNGramsFromText(int nGramLength, String input[]) { String output[][] = new String[input.length-nGramLength+1][nGramLength]; for(int i = 0; i < output.length; i++) { //StringBuilder sb = new StringBuilder(); for(int j = 0; j < nGramLength; j++) { output[i][j] = input[i+j]; } } return output; } private static void workAroundToSaveNGramModel(OutputStream out, NGramModel ngm) { PrintWriter pw = new PrintWriter(out); Iterator<StringList> iterator = ngm.iterator(); while (iterator.hasNext()) { StringList sl = iterator.next(); int count = ngm.getCount(sl); pw.print(sl.toString()); pw.print(' '); pw.print(count); pw.println(); } pw.flush(); } }