import java.io.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

/**
 * Command-line tool that compares the punctuation of a generated text file with the punctuation of
 * the original text file.
 *
 * <p>Both files are decoded as UTF-16BE. For each file the number of question marks, exclamation
 * marks, periods and commas is printed, followed by per-mark and overall F-scores of the generated
 * file measured against the original. All output is written to {@code System.err}.
 *
 * <p>The F-score comparison walks both files word by word, where a word is a run of characters
 * delimited by a space or a newline, and only looks at the first character of each word. A
 * punctuation mark is therefore only recognised when it starts a word (i.e. it is its own token).
 */
public class Comparisons {

    /** Encoding used to decode both input files. */
    private static final Charset INPUT_CHARSET = StandardCharsets.UTF_16BE;

    /**
     * Entry point.
     *
     * @param args exactly two arguments: the path of the generated file and the path of the
     *     original file; any other argument count prints a usage message and exits with status 1
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("Correct usage is:\njava Comparisons <generated_file> <original_file>");
            System.exit(1);
        }
        File generated = new File(args[0]);
        File original = new File(args[1]);

        System.err.println("Data for generated file:");
        printCounts(generated);
        System.err.println();

        System.err.println("Data for original file:");
        printCounts(original);
        System.err.println();

        System.err.println("Additional data:");
        char[] punctuations = {'.', ',', '?', '!'};
        double[] fScore = getFScore(punctuations, generated, original);
        for (int i = 0; i < punctuations.length; i++) {
            System.err.println("F-score for " + punctuations[i] + " = " + fScore[i]);
        }
        System.err.println("Total f-score is " + fScore[fScore.length - 1]);
    }

    /** Prints the four punctuation counts of {@code f} to {@code System.err}. */
    private static void printCounts(File f) {
        System.err.println("Question marks: " + countQuestionMark(f));
        System.err.println("Exclamation marks: " + countExclamationMark(f));
        System.err.println("Periods: " + countPeriod(f));
        System.err.println("Commas: " + countComma(f));
    }

    /** Not implemented yet; always returns {@code NaN}. */
    private static double getPerplexity(File generated, File original) {
        return Double.NaN;
    }

    /**
     * Computes F-scores of the punctuation in {@code generated} measured against {@code original}.
     *
     * <p>The two files are walked in parallel, one word at a time. When only one side has a
     * punctuation token at the current position, only that side is advanced so that the words
     * stay aligned; otherwise both sides advance. The walk continues until <em>both</em> files are
     * exhausted, so punctuation that the generated file carries after the end of the original is
     * still counted as predicted (false positive).
     *
     * <p>Also prints the overall precision and recall to {@code System.err}.
     *
     * @param punctuation the marks to score
     * @param generated the file whose punctuation is being evaluated
     * @param original the reference file
     * @return an array of length {@code punctuation.length + 1}: index {@code i} holds the F-score
     *     for {@code punctuation[i]}, the last index holds the overall F-score. A score is
     *     {@code NaN} when the mark occurs in neither file. If an {@link IOException} occurs, the
     *     stack trace is printed and every entry is {@code -1}.
     */
    private static double[] getFScore(char[] punctuation, File generated, File original) {
        double[] fScore = new double[punctuation.length + 1];
        Arrays.fill(fScore, -1);

        try (Reader brGenerated = openReader(generated);
                Reader brOriginal = openReader(original)) {
            int originalChar = brOriginal.read();
            int generatedChar = brGenerated.read();

            int truePositives = 0; // same mark at the same position in both files
            int predicted = 0; // marks found in the generated file
            int actual = 0; // marks found in the original file
            int[] charTruePositives = new int[punctuation.length];
            int[] charPredicted = new int[punctuation.length];
            int[] charActual = new int[punctuation.length];

            // Keep going while either side still has words; stopping at the end of the original
            // would silently drop trailing punctuation of the generated file.
            while (originalChar >= 0 || generatedChar >= 0) {
                int containsGenerated = contains(punctuation, generatedChar);
                int containsOriginal = contains(punctuation, originalChar);

                if (containsGenerated >= 0) {
                    predicted++;
                    charPredicted[containsGenerated]++;
                }
                if (containsOriginal >= 0) {
                    actual++;
                    charActual[containsOriginal]++;
                }
                if (originalChar == generatedChar && containsGenerated >= 0) {
                    truePositives++;
                    charTruePositives[containsGenerated]++;
                }

                if (containsOriginal >= 0 && containsGenerated < 0) {
                    // Punctuation only in the original: let the original catch up.
                    originalChar = findNextChar(brOriginal);
                } else if (containsGenerated >= 0 && containsOriginal < 0) {
                    // Punctuation only in the generated file: let the generated file catch up.
                    generatedChar = findNextChar(brGenerated);
                } else {
                    // Both are punctuation or both are ordinary words: move on together.
                    originalChar = findNextChar(brOriginal);
                    generatedChar = findNextChar(brGenerated);
                }
            }

            for (int i = 0; i < punctuation.length; i++) {
                fScore[i] = computeFScore(charTruePositives[i], charPredicted[i], charActual[i]);
            }
            fScore[fScore.length - 1] = computeFScore(truePositives, predicted, actual);

            double precision = (double) truePositives / (double) predicted;
            double recall = (double) truePositives / (double) actual;
            System.err.println("Overall precision = " + precision);
            System.err.println("Overall recall = " + recall);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fScore;
    }

    /**
     * Returns the F1 score {@code 2*TP / (predicted + actual)}.
     *
     * <p>This is algebraically the same as {@code 2PR / (P + R)} but stays well defined (zero)
     * when there are no true positives. The result is {@code NaN} only when both
     * {@code predicted} and {@code actual} are zero.
     */
    private static double computeFScore(int truePositives, int predicted, int actual) {
        return (2.0 * truePositives) / ((double) predicted + (double) actual);
    }

    /** Opens {@code f} as a buffered character stream decoded with {@link #INPUT_CHARSET}. */
    private static Reader openReader(File f) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(f), INPUT_CHARSET));
    }

    /**
     * Returns the index of {@code c} in {@code chars}, or {@code -1} if it is not present (which
     * includes the end-of-stream value {@code -1}).
     */
    private static int contains(char[] chars, int c) {
        for (int i = 0; i < chars.length; i++) {
            if (chars[i] == c) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Consumes characters up to and including the next newline, or up to the end of the stream.
     *
     * <p>Words are assumed to be separated only by a space or a newline. After each word that ends
     * with a space the following word is consumed as well, so reading only stops at a newline or
     * at the end of the stream. Currently unused.
     *
     * @param br the stream to read from
     * @throws IOException if reading fails
     */
    private static void skipThisWord(Reader br) throws IOException {
        int character;
        do {
            character = br.read();
            while (character >= 0 && character != ' ' && character != '\n') {
                character = br.read();
            }
        } while (character == ' ');
    }

    /**
     * Skips the remainder of the current word and the separators after it.
     *
     * @param br the stream to read from
     * @return the first character of the next word, or {@code -1} at the end of the stream
     * @throws IOException if reading fails
     */
    private static int findNextChar(Reader br) throws IOException {
        int character = br.read();
        while (character >= 0 && character != ' ' && character != '\n') {
            character = br.read();
        }
        // The separator (or end of stream) has been consumed; now skip any further separators.
        do {
            character = br.read();
        } while (character == ' ' || character == '\n');
        return character;
    }

    private static long countExclamationMark(File f) {
        return countPunctuation('!', f);
    }

    private static long countQuestionMark(File f) {
        return countPunctuation('?', f);
    }

    private static long countComma(File f) {
        return countPunctuation(',', f);
    }

    private static long countPeriod(File f) {
        return countPunctuation('.', f);
    }

    /**
     * Counts how often {@code punctuation} occurs in {@code f}.
     *
     * <p>The file is read character by character, so it does not matter whether the content is on
     * one line or many. The comparison is done per decoded {@code char} value.
     *
     * @return the number of occurrences, or {@code -1} if the file could not be opened; if reading
     *     fails part-way the stack trace is printed and the count reached so far is returned
     */
    private static long countPunctuation(char punctuation, File f) {
        long hits = -1;
        try (Reader br = openReader(f)) {
            hits = 0;
            int read = br.read();
            while (read >= 0) {
                if (read == punctuation) {
                    hits++;
                }
                read = br.read();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return hits;
    }
}