import java.io.*;
import java.util.Arrays;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Pattern;
/**
* This class loads the file corpus.txt and reads it line by line. Each line is processed according to a
* set of rules and consecutive lines are grouped into sentences. The first args[0] sentences are written
* to ppCorpus.txt, one per line.
* After that, the next args[1] sentences are preprocessed for testing: they are saved in
* testSentences0.txt (sentence boundaries removed) and testSentencesCorrection0.txt (sentence boundaries kept).
*
*
* @author Jasmin Suljkic
*/
public class CorpusPreprocessUK {

    /** A group of buffered lines may hold at most this many tokens. */
    private static final int MAX_GROUP_TOKENS = 10;

    /** A group is only written out when it holds at least this many tokens. */
    private static final int MIN_GROUP_TOKENS = 3;

    /** Charset name used for every output file. */
    private static final String OUTPUT_CHARSET = "UTF-16BE";

    /** Marker written in front of every output sentence. */
    private static final String START_MARKER = "START ";

    /** Marker written after every output sentence. */
    private static final String END_MARKER = " ¿EOL";

    // The patterns are compiled once instead of once per corpus line. The dot in front of
    // PERIOD is escaped so that it only matches the literal ".PERIOD" token.
    private static final Pattern DASH_OR_COMMA = Pattern.compile("[-,]");
    private static final Pattern SENTENCE_END = Pattern.compile("( )*[.!?]+( )*");
    private static final Pattern SPACE_RUN = Pattern.compile("( )+");
    private static final Pattern REPEATED_PERIOD = Pattern.compile("(\\.PERIOD )+");
    private static final Pattern TOKEN_SEPARATOR = Pattern.compile("(( )+\\.PERIOD( )+)|( )+");
    private static final Pattern PERIOD_TOKEN = Pattern.compile("( )*\\.PERIOD( )*");

    /**
     * Reads corpus.txt, normalises every line, groups consecutive lines into sentences and writes
     * them to ppCorpus.txt (learning data), testSentences0.txt (test input without .PERIOD tokens)
     * and testSentencesCorrection0.txt (test reference with .PERIOD tokens).
     *
     * <p>Lines are buffered while their accumulated token count stays at or below
     * {@value #MAX_GROUP_TOKENS}. The line that pushes the count above the limit ends the group and
     * is itself discarded. The group is written if it held at least {@value #MIN_GROUP_TOKENS}
     * tokens, first to the learning file until {@code args[0]} sentences exist, then to the test
     * files until {@code args[1]} sentences exist. A group still buffered at end of file is not
     * written.
     *
     * <p>The request is validated before any output file is opened, so a rejected request leaves
     * existing output files untouched. I/O errors are reported via a stack trace on stderr.
     *
     * @param args X Y (X: number of sentences for learning, Y: number of sentences for testing).
     *             With fewer than two arguments both default to {@code Integer.MAX_VALUE >> 3};
     *             note that their sum is then compared against the corpus line count like any
     *             other request.
     * @throws NumberFormatException if X or Y is not a valid integer
     */
    public static void main(String[] args) {
        String corpusStringPath = "corpus.txt";
        String testSentenceStringPath = "testSentences";
        String testSentencesCorrectionStringPath = "testSentencesCorrection";

        int toLearn = Integer.MAX_VALUE >> 3;
        int toTest = Integer.MAX_VALUE >> 3;
        if (args.length >= 2) {
            toLearn = Integer.parseInt(args[0]);
            toTest = Integer.parseInt(args[1]);
        }

        try {
            // First pass: count the lines so the request can be checked before anything is written.
            int nrLines = countLines(corpusStringPath);
            // Widen to long so that two large arguments cannot overflow and slip past the check.
            if ((long) toLearn + toTest > nrLines) {
                System.err.println("Request invalid: Number of lines requested > nr of lines available in corpus.\nThere are " + nrLines + " number of lines.");
                System.err.println("toLearn = " + toLearn);
                System.err.println("toTest = " + toTest);
                return;
            }

            // Second pass: every stream is closed automatically, also when an exception is thrown.
            try (BufferedReader br = openCorpus(corpusStringPath);
                    BufferedWriter corpusWriter = new BufferedWriter(
                            new OutputStreamWriter(new FileOutputStream("ppCorpus.txt"), OUTPUT_CHARSET));
                    OutputStreamWriter testWriter = new OutputStreamWriter(
                            new FileOutputStream(testSentenceStringPath + 0 + ".txt"), OUTPUT_CHARSET);
                    OutputStreamWriter correctionWriter = new OutputStreamWriter(
                            new FileOutputStream(testSentencesCorrectionStringPath + 0 + ".txt"), OUTPUT_CHARSET)) {

                int corpusSentences = 0;
                int testSentences = 0;
                // Every buffered line contributes at least one token, so a group never holds
                // more than MAX_GROUP_TOKENS lines.
                String[] buffer = new String[MAX_GROUP_TOKENS];
                int index = 0;
                int length = 0;

                String line;
                while ((line = br.readLine()) != null) {
                    line = normalize(line);
                    int tokens = TOKEN_SEPARATOR.split(line).length;
                    int previous = length;
                    if (tokens > 0) {
                        length += tokens;
                    }

                    if (length <= MAX_GROUP_TOKENS && tokens > 0) {
                        // The line still fits: keep collecting.
                        buffer[index] = line;
                        index++;
                        continue;
                    }

                    boolean groupComplete = previous >= MIN_GROUP_TOKENS
                            && previous <= MAX_GROUP_TOKENS
                            && length > MAX_GROUP_TOKENS
                            && index > 0;
                    if (groupComplete) {
                        if (toLearn > corpusSentences) {
                            writeGroup(corpusWriter, buffer, index, false);
                            corpusWriter.newLine();
                            corpusSentences++;
                        } else if (testSentences < toTest) {
                            writeGroup(correctionWriter, buffer, index, false);
                            correctionWriter.write('\n');
                            writeGroup(testWriter, buffer, index, true);
                            testWriter.write('\n');
                            testSentences++;
                        } else {
                            // Both quotas are filled; the rest of the corpus is not needed.
                            break;
                        }
                    }
                    // The overflowing line is dropped and a new group starts empty.
                    index = 0;
                    length = 0;
                }

                // getEncoding() must be queried while the writer is still open.
                System.err.println("Using encoding: " + testWriter.getEncoding());
                System.err.println(corpusSentences + " sentences in corpus.");
                System.err.println(testSentences + " sentences in training.");
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException (missing corpus or unwritable output file).
            e.printStackTrace();
        }
    }

    /** Opens the corpus with the platform default charset; both passes use this same reader setup. */
    private static BufferedReader openCorpus(String path) throws FileNotFoundException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(path)));
    }

    /** Returns the number of lines in the corpus file at {@code path}. */
    private static int countLines(String path) throws IOException {
        try (BufferedReader reader = openCorpus(path)) {
            int count = 0;
            while (reader.readLine() != null) {
                count++;
            }
            return count;
        }
    }

    /**
     * Lower-cases the line, drops '-' and ',', turns every run of '.', '!' or '?' into a single
     * " .PERIOD " token, collapses runs of spaces and trims the result.
     */
    private static String normalize(String line) {
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        String text = line.toLowerCase(Locale.ROOT);
        text = DASH_OR_COMMA.matcher(text).replaceAll("");
        text = SENTENCE_END.matcher(text).replaceAll(" .PERIOD ");
        text = SPACE_RUN.matcher(text).replaceAll(" ");
        text = REPEATED_PERIOD.matcher(text).replaceAll(".PERIOD ");
        return text.trim();
    }

    /**
     * Writes START, the first {@code count} buffered lines separated by single spaces, and the end
     * marker. The caller appends the line terminator. When {@code stripPeriods} is set, each
     * .PERIOD token, together with the spaces around it, is replaced by one space.
     */
    private static void writeGroup(Writer out, String[] lines, int count, boolean stripPeriods)
            throws IOException {
        out.write(START_MARKER);
        for (int i = 0; i < count; i++) {
            String text = lines[i];
            if (stripPeriods) {
                text = PERIOD_TOKEN.matcher(text).replaceAll(" ");
            }
            out.write(text);
            if (i + 1 < count) {
                out.write(' ');
            }
        }
        out.write(END_MARKER);
    }
}