import java.io.*;
import java.util.Arrays;
import java.util.Random;
/**
* This class loads the file corpus.txt and counts its lines. Using a seeded random generator it picks
* args[0] of the first args[0]+args[1] lines for learning and args[1] other lines for testing.
* Each selected line of 3 to 10 words is processed according to a set of rules. The results for the learning
* lines are written to ppCorpus.txt, line by line. The testing lines are written to testSentences0.txt
* (punctuation removed) and to testSentencesCorrection0.txt (punctuation replaced by a marker token).
*
*
* @author Jasmin Suljkic
*/
public class CorpusPreprocess {

    /** Input corpus; it is decoded with the platform default charset. */
    private static final String CORPUS_PATH = "corpus.txt";

    /** Output file receiving the preprocessed learning sentences. */
    private static final String PP_CORPUS_PATH = "ppCorpus.txt";

    /** File name prefix (bucket index and ".txt" are appended) for test sentences without punctuation. */
    private static final String TEST_SENTENCES_PREFIX = "testSentences";

    /** File name prefix (bucket index and ".txt" are appended) for test sentences with punctuation markers. */
    private static final String TEST_CORRECTIONS_PREFIX = "testSentencesCorrection";

    /** Charset name used for every output file. */
    private static final String OUTPUT_ENCODING = "UTF-16BE";

    /** Number of test output buckets; word counts at or above this value share the last bucket. */
    private static final int TEST_BUCKETS = 1;

    /** Sentences with fewer words than this are ignored. */
    private static final int MIN_WORDS = 3;

    /** Sentences with more words than this are ignored. */
    private static final int MAX_WORDS = 10;

    /**
     * Reads corpus.txt, splits its first X+Y lines into X learning lines (chosen with a seeded random
     * generator) and Y testing lines (the remaining ones), and writes the preprocessed sentences to
     * ppCorpus.txt, testSentences0.txt and testSentencesCorrection0.txt.
     *
     * <p>Invalid requests (negative counts, or more lines requested than the corpus holds) are reported
     * on standard error and leave the output files untouched. I/O failures are reported with a stack trace.
     *
     * @param args X Y [seed] (X: lines for learning, Y: lines for testing, seed: optional random seed,
     *             default 0). With fewer than two arguments both X and Y default to
     *             {@code Integer.MAX_VALUE >> 3}.
     * @throws NumberFormatException if one of the supplied arguments is not an integer
     */
    public static void main(String[] args) {
        int toLearn = Integer.MAX_VALUE >> 3;
        int toTest = Integer.MAX_VALUE >> 3;
        if (args.length >= 2) {
            toLearn = Integer.parseInt(args[0]);
            toTest = Integer.parseInt(args[1]);
        }
        if (toLearn < 0 || toTest < 0) {
            System.err.println("Request invalid: line counts must not be negative.");
            System.err.println("toLearn = " + toLearn);
            System.err.println("toTest = " + toTest);
            return;
        }

        try {
            int nrLines = countLines();
            // Compare as long: the int sum of two large requests would wrap around and pass the check.
            if ((long) toLearn + toTest > nrLines) {
                System.err.println("Request invalid: Number of lines requested > nr of lines available in corpus.\nThere are " + nrLines + " number of lines.");
                System.err.println("toLearn = " + toLearn);
                System.err.println("toTest = " + toTest);
                return;
            }

            int seed = 0;
            if (args.length > 2) {
                seed = Integer.parseInt(args[2]);
            }

            boolean[] learnLine = pickLearnLines(toLearn, toLearn + toTest, new Random(seed));
            writeOutputs(learnLine);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens the corpus for reading with the platform default charset. */
    private static BufferedReader openCorpus() throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(CORPUS_PATH)));
    }

    /** Opens (and truncates) an output file that is encoded as {@link #OUTPUT_ENCODING}. */
    private static OutputStreamWriter openOutput(String path) throws IOException {
        return new OutputStreamWriter(new FileOutputStream(path), OUTPUT_ENCODING);
    }

    /** Counts the lines of the corpus. */
    private static int countLines() throws IOException {
        int count = 0;
        try (BufferedReader reader = openCorpus()) {
            while (reader.readLine() != null) {
                count++;
            }
        }
        return count;
    }

    /**
     * Marks {@code toLearn} distinct line indices in {@code [0, total)} as learning lines.
     * Candidates are drawn one at a time and redrawn while they collide with an earlier pick.
     * Every unmarked index in the returned array is a testing line, because learning and testing
     * lines together cover the whole range.
     */
    private static boolean[] pickLearnLines(int toLearn, int total, Random random) {
        boolean[] picked = new boolean[total];
        for (int n = 0; n < toLearn; n++) {
            int candidate = random.nextInt(total);
            while (picked[candidate]) {
                candidate = random.nextInt(total);
            }
            picked[candidate] = true;
        }
        return picked;
    }

    /**
     * Streams the first {@code learnLine.length} corpus lines into the output files. Line {@code i}
     * goes to the learning corpus when {@code learnLine[i]} is set and to the test files otherwise.
     */
    private static void writeOutputs(boolean[] learnLine) throws IOException {
        OutputStreamWriter[] testWriters = new OutputStreamWriter[TEST_BUCKETS];
        OutputStreamWriter[] correctionWriters = new OutputStreamWriter[TEST_BUCKETS];
        try (BufferedReader reader = openCorpus();
             BufferedWriter corpusWriter = new BufferedWriter(openOutput(PP_CORPUS_PATH))) {
            try {
                for (int i = 0; i < TEST_BUCKETS; i++) {
                    testWriters[i] = openOutput(TEST_SENTENCES_PREFIX + i + ".txt");
                    correctionWriters[i] = openOutput(TEST_CORRECTIONS_PREFIX + i + ".txt");
                }

                // Lines beyond the requested range are never selected, so reading stops there.
                String line;
                for (int index = 0; index < learnLine.length && (line = reader.readLine()) != null; index++) {
                    processLine(line, !learnLine[index], corpusWriter, testWriters, correctionWriters);
                }

                // Must be printed before closing: getEncoding() returns null on a closed writer.
                System.err.println("Using encoding: " + testWriters[0].getEncoding());
            } finally {
                closeAll(testWriters, correctionWriters);
            }
        }
    }

    /**
     * Preprocesses one sentence and writes it either to the learning corpus or to the test files.
     * Sentences outside the {@link #MIN_WORDS}..{@link #MAX_WORDS} word range are dropped.
     * The sentence is lower-cased and trimmed, framed by a START and an EOL token, and commas are removed.
     * Each '.', '!' and '?' becomes " .PERIOD " in the corpus and correction output and a single space
     * in the plain test output.
     */
    private static void processLine(String line, boolean testing, BufferedWriter corpusWriter,
            Writer[] testWriters, Writer[] correctionWriters) throws IOException {
        int words = line.trim().split("( )+").length;
        if (words < MIN_WORDS || words > MAX_WORDS) {
            return;
        }
        // Word counts without a bucket of their own fall into the last bucket.
        int bucket = Math.min(words, testWriters.length - 1);

        StringBuilder plain = new StringBuilder("START ");
        StringBuilder annotated = new StringBuilder("START ");
        for (char c : line.toLowerCase().trim().toCharArray()) {
            switch (c) {
                case '.':
                case '!':
                case '?':
                    plain.append(" ");
                    annotated.append(" .PERIOD ");
                    break;
                case ',':
                    // Commas are dropped from every output.
                    break;
                default:
                    plain.append(c);
                    annotated.append(c);
                    break;
            }
        }
        plain.append(" ¿EOL");
        annotated.append(" ¿EOL");

        if (testing) {
            testWriters[bucket].write(plain.toString());
            testWriters[bucket].write('\n');
            correctionWriters[bucket].write(annotated.toString());
            correctionWriters[bucket].write('\n');
        } else {
            corpusWriter.write(annotated.toString());
            corpusWriter.newLine();
        }
    }

    /**
     * Closes every non-null writer of every group, even if some of them fail.
     * The first failure is rethrown with the later ones attached as suppressed exceptions.
     */
    private static void closeAll(Writer[]... groups) throws IOException {
        IOException failure = null;
        for (Writer[] group : groups) {
            for (Writer writer : group) {
                if (writer == null) {
                    continue;
                }
                try {
                    writer.close();
                } catch (IOException e) {
                    if (failure == null) {
                        failure = e;
                    } else {
                        failure.addSuppressed(e);
                    }
                }
            }
        }
        if (failure != null) {
            throw failure;
        }
    }
}