import java.io.*;
import java.util.Arrays;
import java.util.Random;

/**
 * Splits the first {@code X + Y} lines of {@code corpus.txt} into a learning set and a test set
 * and writes a preprocessed version of each.
 *
 * <p>{@code X} of those lines are picked pseudo-randomly (seeded) for learning; the remaining
 * {@code Y} lines are used for testing. Only lines containing between 3 and 10 space-separated
 * words are emitted. Each emitted line is lower-cased, trimmed, prefixed with {@code START} and
 * suffixed with an end-of-line marker; commas are dropped.
 *
 * <ul>
 *   <li>{@code ppCorpus.txt} - learning lines, with {@code . ! ?} replaced by {@code .PERIOD}.</li>
 *   <li>{@code testSentences0.txt} - test lines, with {@code . ! ?} replaced by a single space.</li>
 *   <li>{@code testSentencesCorrection0.txt} - the same test lines, with {@code . ! ?} replaced
 *       by {@code .PERIOD}.</li>
 * </ul>
 *
 * <p>All three output files are written as UTF-16BE. The corpus is read with the platform default
 * charset.
 *
 * @author Jasmin Suljkic
 */
public class CorpusPreprocess {

    private static final String CORPUS_PATH = "corpus.txt";
    private static final String PP_CORPUS_PATH = "ppCorpus.txt";
    private static final String TEST_SENTENCES_PATH = "testSentences" + 0 + ".txt";
    private static final String TEST_CORRECTION_PATH = "testSentencesCorrection" + 0 + ".txt";
    private static final String OUTPUT_ENCODING = "UTF-16BE";

    /** Inclusive bounds on the number of words a line must have to be emitted. */
    private static final int MIN_WORDS = 3;
    private static final int MAX_WORDS = 10;

    /**
     * Entry point.
     *
     * <p>I/O failures are reported with a stack trace on standard error; an unsatisfiable request
     * (negative counts, or more lines requested than the corpus holds) is reported on standard
     * error and nothing is written.
     *
     * @param args {@code X Y [seed]} - {@code X}: number of lines for learning, {@code Y}: number
     *     of lines for testing, {@code seed}: optional seed for the line selection (default 0).
     *     When fewer than two arguments are given, both counts default to
     *     {@code Integer.MAX_VALUE >> 3}.
     * @throws NumberFormatException if an argument is not a valid integer
     */
    public static void main(String[] args) {
        int toLearn = Integer.MAX_VALUE >> 3;
        int toTest = Integer.MAX_VALUE >> 3;
        if (args.length >= 2) {
            toLearn = Integer.parseInt(args[0]);
            toTest = Integer.parseInt(args[1]);
        }
        if (toLearn < 0 || toTest < 0) {
            System.err.println("Request invalid: Number of lines requested must not be negative.");
            System.err.println("toLearn = " + toLearn);
            System.err.println("toTest = " + toTest);
            return;
        }

        try {
            int nrLines = countLines();
            // Sum in long: two large int arguments must not wrap around and slip past the check.
            if ((long) toLearn + toTest > nrLines) {
                System.err.println("Request invalid: Number of lines requested > nr of lines available in corpus.\nThere are " + nrLines + " number of lines.");
                System.err.println("toLearn = " + toLearn);
                System.err.println("toTest = " + toTest);
                return;
            }

            int seed = 0;
            if (args.length > 2) {
                seed = Integer.parseInt(args[2]);
            }
            boolean[] isLearnLine = pickLearnLines(toLearn, toLearn + toTest, new Random(seed));
            writeOutputs(isLearnLine);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens the corpus for reading using the platform default charset. */
    private static BufferedReader openCorpus() throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(CORPUS_PATH)));
    }

    /** Opens {@code path} for writing in the output encoding, truncating any existing file. */
    private static OutputStreamWriter openOutput(String path) throws IOException {
        return new OutputStreamWriter(new FileOutputStream(path), OUTPUT_ENCODING);
    }

    /** Returns the number of lines in the corpus file. */
    private static int countLines() throws IOException {
        try (BufferedReader reader = openCorpus()) {
            int count = 0;
            while (reader.readLine() != null) {
                count++;
            }
            return count;
        }
    }

    /**
     * Marks {@code toLearn} distinct line indices in {@code [0, total)} as learning lines.
     *
     * <p>Candidates are drawn with {@code random.nextInt(total)} and redrawn while they collide
     * with an earlier pick. Every index left unmarked is a test line, so the test set needs no
     * random draws of its own and can never fall outside {@code [0, total)}.
     *
     * @return an array of length {@code total}; {@code true} marks a learning line
     */
    private static boolean[] pickLearnLines(int toLearn, int total, Random random) {
        boolean[] isLearnLine = new boolean[total];
        for (int picked = 0; picked < toLearn; picked++) {
            int candidate = random.nextInt(total);
            while (isLearnLine[candidate]) {
                candidate = random.nextInt(total);
            }
            isLearnLine[candidate] = true;
        }
        return isLearnLine;
    }

    /**
     * Reads the first {@code isLearnLine.length} corpus lines and writes each accepted line to the
     * learning file or to the two test files, depending on its flag.
     */
    private static void writeOutputs(boolean[] isLearnLine) throws IOException {
        try (BufferedReader reader = openCorpus();
             BufferedWriter corpusOut = new BufferedWriter(openOutput(PP_CORPUS_PATH));
             OutputStreamWriter testOut = openOutput(TEST_SENTENCES_PATH);
             OutputStreamWriter correctionOut = openOutput(TEST_CORRECTION_PATH)) {

            for (int current = 0; current < isLearnLine.length; current++) {
                String line = reader.readLine();
                if (line == null) {
                    break;
                }
                if (!hasAcceptedLength(line)) {
                    continue;
                }
                char[] chars = line.toLowerCase().trim().toCharArray();
                if (isLearnLine[current]) {
                    corpusOut.write(render(chars, " .PERIOD "));
                    corpusOut.newLine();
                } else {
                    // The test input has the sentence breaks blanked out; the correction file
                    // keeps them so the two can be compared line by line.
                    testOut.write(render(chars, " "));
                    testOut.write('\n');
                    correctionOut.write(render(chars, " .PERIOD "));
                    correctionOut.write('\n');
                }
            }
            // Must be queried before the writer is closed.
            System.err.println("Using encoding: " + testOut.getEncoding());
        }
    }

    /** Returns whether the trimmed line has between MIN_WORDS and MAX_WORDS space-separated words. */
    private static boolean hasAcceptedLength(String line) {
        int words = line.trim().split("( )+").length;
        return words >= MIN_WORDS && words <= MAX_WORDS;
    }

    /**
     * Builds one output line: {@code "START "}, then the characters with every {@code . ! ?}
     * replaced by {@code sentenceBreak} and every comma removed, then the end-of-line marker.
     */
    private static String render(char[] chars, String sentenceBreak) {
        StringBuilder out = new StringBuilder("START ");
        for (char c : chars) {
            if (c == '.' || c == '!' || c == '?') {
                out.append(sentenceBreak);
            } else if (c != ',') {
                out.append(c);
            }
        }
        return out.append(" ¿EOL").toString();
    }
}