import java.io.*;
import java.util.Arrays;
import java.util.Random;

/**
 * Splits the raw corpus file {@code corpus.txt} into a learning set and a test set.
 *
 * <p>Every corpus line is lower-cased, stripped of hyphens and commas, and has each run of
 * sentence-ending punctuation ({@code . ! ?}) replaced by the token {@code .PERIOD}. Consecutive
 * lines are then grouped into chunks of 3 to 10 words. The first X chunks are written to
 * {@code ppCorpus.txt}; the following Y chunks are written twice: to
 * {@code testSentencesCorrection0.txt} (with the {@code .PERIOD} tokens) and to
 * {@code testSentences0.txt} (with the {@code .PERIOD} tokens removed). All output files are
 * encoded as UTF-16BE.
 *
 * @author Jasmin Suljkic
 */
public class CorpusPreprocessUK {

    /** Character encoding used for every output file. */
    private static final String OUTPUT_ENCODING = "UTF-16BE";

    /** A group is only written out if it holds at least this many words. */
    private static final int MIN_WORDS = 3;

    /** A group is closed as soon as adding a line would push it past this many words. */
    private static final int MAX_WORDS = 10;

    /**
     * Reads {@code corpus.txt} from the working directory and writes {@code ppCorpus.txt},
     * {@code testSentences0.txt} and {@code testSentencesCorrection0.txt}.
     *
     * <p>Progress and errors are reported on {@code System.err}; I/O failures are printed as a
     * stack trace. No output file is created or truncated when the request is rejected.
     *
     * @param args {@code X Y} where X is the number of chunks for learning and Y the number of
     *     chunks for testing. With fewer than two arguments both default to
     *     {@code Integer.MAX_VALUE >> 3}.
     * @throws NumberFormatException if X or Y is not a valid integer
     */
    public static void main(String[] args) {
        String corpusStringPath = "corpus.txt";
        String testSentenceStringPath = "testSentences";
        String testSentencesCorrectionStringPath = "testSentencesCorrection";

        // NOTE(review): with these defaults the validation below rejects the run unless the
        // corpus has at least 2 * (Integer.MAX_VALUE >> 3) lines - confirm whether a run
        // without arguments is meant to be "unlimited".
        int toLearn = Integer.MAX_VALUE >> 3;
        int toTest = Integer.MAX_VALUE >> 3;
        if (args.length >= 2) {
            toLearn = Integer.parseInt(args[0]);
            toTest = Integer.parseInt(args[1]);
        }

        try {
            int nrLines = countLines(corpusStringPath);

            // Sum in long: two large int arguments would otherwise wrap around to a negative
            // value and slip past this check.
            if ((long) toLearn + toTest > nrLines) {
                System.err.println("Request invalid: Number of lines requested > nr of lines available in corpus.\nThere are "+nrLines+" number of lines.");
                System.err.println("toLearn = "+toLearn);
                System.err.println("toTest = "+toTest);
                return;
            }

            // Outputs are opened only after validation so that a rejected request does not
            // wipe the results of an earlier run; try-with-resources closes everything even
            // when processing fails half-way.
            try (BufferedReader corpusReader = openCorpus(corpusStringPath);
                    BufferedWriter corpusWriter = new BufferedWriter(new OutputStreamWriter(
                            new FileOutputStream("ppCorpus.txt"), OUTPUT_ENCODING));
                    OutputStreamWriter testWriter = new OutputStreamWriter(
                            new FileOutputStream(testSentenceStringPath + 0 + ".txt"),
                            OUTPUT_ENCODING);
                    OutputStreamWriter correctionWriter = new OutputStreamWriter(
                            new FileOutputStream(testSentencesCorrectionStringPath + 0 + ".txt"),
                            OUTPUT_ENCODING)) {

                int corpusSentences = 0;
                int testSentences = 0;

                // Lines of the group currently being collected. Every line counts as at least
                // one word, so a group never holds more than MAX_WORDS lines.
                String[] group = new String[MAX_WORDS];
                int groupSize = 0;
                int wordsInGroup = 0;

                String line;
                while ((line = corpusReader.readLine()) != null) {
                    line = normalize(line);

                    // A trailing ".PERIOD" is counted as a word: the separator only matches
                    // the token when it has spaces on both sides.
                    int words = line.split("(( )+\\.PERIOD( )+)|( )+").length;
                    int wordsBefore = wordsInGroup;
                    if (words > 0) {
                        wordsInGroup += words;
                    }

                    if (wordsInGroup <= MAX_WORDS && words > 0) {
                        group[groupSize] = line;
                        groupSize++;
                        continue;
                    }

                    // The current line overflowed the group. Emit what was collected before it
                    // (if that is long enough). The overflowing line itself is dropped and does
                    // not start the next group; a group still open at end of file is not written.
                    if (wordsBefore >= MIN_WORDS && wordsBefore <= MAX_WORDS
                            && wordsInGroup > MAX_WORDS && groupSize > 0) {
                        if (toLearn > corpusSentences) {
                            corpusWriter.write(joinGroup(group, groupSize, false));
                            corpusWriter.newLine();
                            corpusSentences++;
                        } else if (testSentences < toTest) {
                            correctionWriter.write(joinGroup(group, groupSize, false));
                            correctionWriter.write('\n');
                            testWriter.write(joinGroup(group, groupSize, true));
                            testWriter.write('\n');
                            testSentences++;
                        } else {
                            // Both quotas are filled; the rest of the corpus is not needed.
                            break;
                        }
                    }
                    groupSize = 0;
                    wordsInGroup = 0;
                }

                // getEncoding() must be queried while the writer is still open.
                System.err.println("Using encoding: "+testWriter.getEncoding());
                System.err.println(corpusSentences+" sentences in corpus.");
                // This figure is the number of chunks written to the test files.
                System.err.println(testSentences+" sentences in training.");
            }
        } catch (IOException e) {
            // Covers FileNotFoundException and UnsupportedEncodingException as well.
            e.printStackTrace();
        }
    }

    /** Opens the corpus for reading with the platform default charset. */
    private static BufferedReader openCorpus(String path) throws FileNotFoundException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(path)));
    }

    /** Returns the number of lines in the corpus file. */
    private static int countLines(String path) throws IOException {
        int count = 0;
        try (BufferedReader reader = openCorpus(path)) {
            while (reader.readLine() != null) {
                count++;
            }
        }
        return count;
    }

    /**
     * Lower-cases a raw corpus line, removes hyphens and commas, turns every run of
     * {@code . ! ?} into a single space-delimited {@code .PERIOD} token and collapses
     * repeated spaces.
     */
    private static String normalize(String line) {
        return line.toLowerCase()
                .replaceAll("[-,]", "")
                .replaceAll("( )*[.!?]+( )*", " .PERIOD ")
                .replaceAll("( )+", " ")
                .replaceAll("(\\.PERIOD )+", ".PERIOD ")
                .trim();
    }

    /**
     * Builds one output record: {@code START}, the first {@code count} lines of {@code group}
     * separated by single spaces, and the end-of-line marker. When {@code stripPeriods} is set,
     * every {@code .PERIOD} token (with its surrounding spaces) is replaced by one space.
     */
    private static String joinGroup(String[] group, int count, boolean stripPeriods) {
        StringBuilder record = new StringBuilder("START ");
        for (int i = 0; i < count; i++) {
            if (stripPeriods) {
                record.append(group[i].replaceAll("( )*\\.PERIOD( )*", " "));
            } else {
                record.append(group[i]);
            }
            if (i + 1 < count) {
                record.append(' ');
            }
        }
        record.append(" ¿EOL");
        return record.toString();
    }
}