package is2.mtag;
import is2.parser.Parser;
import java.io.*;
import java.util.ArrayList;
/**
 * Command-line tool that splits a training corpus into {@value #PARTS} parts
 * for cross annotation.
 *
 * <p>The corpus is read line by line. Every line shorter than
 * {@value #SENTENCE_END_MAX_LENGTH} characters is counted as the end of a
 * sentence (presumably the blank separator line of a column-based corpus
 * format -- TODO confirm against the expected input format). For each part
 * {@code k} two files are written into the directory {@code split}:
 * <ul>
 *   <li>{@code split/r-k} -- the lines of the sentences that belong to part {@code k};</li>
 *   <li>{@code split/p-k} -- all remaining lines of the corpus.</li>
 * </ul>
 *
 * @author Dr. Bernd Bohnet, 20.01.2010
 */
public class Convert {

    /** Number of parts the corpus is divided into. */
    private static final int PARTS = 20;

    /** A line with fewer characters than this is counted as the end of a sentence. */
    private static final int SENTENCE_END_MAX_LENGTH = 8;

    /** Directory (relative to the working directory) that receives the output files. */
    private static final String OUTPUT_DIR = "split";

    /** Character encoding used for both reading the corpus and writing the parts. */
    private static final String ENCODING = "UTF-8";

    /** Buffer size, in characters, of the corpus reader. */
    private static final int READ_BUFFER_SIZE = 32768;

    /**
     * Parses the command line and splits the configured training file.
     *
     * @param args command-line arguments, handed to {@link Options}
     * @throws IOException if the corpus cannot be read or a part cannot be written
     */
    public static void main(String[] args) throws IOException {
        Options options = new Options(args);
        split(options.trainfile);
    }

    /**
     * Splits the corpus into {@value #PARTS} parts and writes, for each part,
     * the part itself and its complement to the output directory.
     *
     * <p>The part size is {@code sentences / PARTS} (integer division); the
     * last part additionally receives every sentence that is left over. If the
     * corpus has fewer than {@value #PARTS} sentences the part size is 0, so
     * all parts except the last are empty and the last part holds the whole
     * corpus.
     *
     * @param trainfile path of the corpus file to split
     * @throws IOException if the output directory cannot be created, the
     *         corpus cannot be read, or an output file cannot be written
     */
    private static void split(String trainfile) throws IOException {
        File dir = new File(OUTPUT_DIR);
        if (dir.mkdir()) {
            Parser.out.println("Directory: " + OUTPUT_DIR + " created");
        } else if (!dir.isDirectory()) {
            // mkdir() also returns false when the directory already exists;
            // only a missing directory is an error.
            throw new IOException("Could not create directory: " + OUTPUT_DIR);
        }

        ArrayList<String> corpus = readCorpus(trainfile);
        int sentences = 0;
        for (String line : corpus) {
            if (endsSentence(line)) {
                sentences++;
            }
        }
        Parser.out.println("Corpus has " + sentences + " sentences.");

        int partSize = sentences / PARTS;
        Parser.out.println("Prepare corpus for cross annotations with " + PARTS
                + " parts with part size " + partSize
                + " number of lines " + corpus.size());

        for (int k = 0; k < PARTS; k++) {
            writePart(corpus, k, partSize);
        }
    }

    /**
     * Reads all lines of the corpus into memory.
     *
     * @param trainfile path of the corpus file
     * @return the lines of the file in order, without line terminators
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList<String> readCorpus(String trainfile) throws IOException {
        ArrayList<String> corpus = new ArrayList<>();
        // try-with-resources closes the reader even when reading fails; read
        // errors propagate so that a truncated corpus is never split silently.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(trainfile), ENCODING),
                READ_BUFFER_SIZE)) {
            String line;
            while ((line = reader.readLine()) != null) {
                corpus.add(line);
            }
        }
        return corpus;
    }

    /**
     * Writes the files {@code r-k} (the part) and {@code p-k} (everything else)
     * for part number {@code k}.
     *
     * @param corpus   all lines of the corpus
     * @param k        zero-based part number
     * @param partSize number of sentences per part
     * @throws IOException if one of the files cannot be opened or written
     */
    private static void writePart(ArrayList<String> corpus, int k, int partSize)
            throws IOException {
        int skip = k * partSize;
        boolean lastPart = k == PARTS - 1;

        // Both writers are flushed and closed by try-with-resources.
        try (BufferedWriter complement = openWriter(OUTPUT_DIR + "/p-" + k);
             BufferedWriter part = openWriter(OUTPUT_DIR + "/r-" + k)) {
            Parser.out.println("skip from " + skip + " to " + (skip + partSize - 1));

            int countSentences = 0;
            int countSentencesWrote = 0;
            for (String line : corpus) {
                // The last part has no upper bound so that the sentences left
                // over by the integer division are not dropped.
                boolean inPart = countSentences >= skip
                        && (lastPart || countSentences < skip + partSize);
                boolean sentenceEnd = endsSentence(line);

                BufferedWriter target = inPart ? part : complement;
                target.write(line);
                target.newLine();

                if (inPart && sentenceEnd) {
                    countSentencesWrote++;
                }
                // Incremented after writing, so the terminating line stays in
                // the same file as the sentence it ends.
                if (sentenceEnd) {
                    countSentences++;
                }
            }
            Parser.out.println("wrote for this part " + countSentencesWrote);
        }
    }

    /**
     * Opens a buffered writer for the given path.
     *
     * @param path file to create or overwrite
     * @return a buffered writer using {@link #ENCODING}
     * @throws IOException if the file cannot be opened for writing
     */
    private static BufferedWriter openWriter(String path) throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), ENCODING));
    }

    /**
     * Tells whether a line is counted as the end of a sentence.
     *
     * @param line a corpus line
     * @return {@code true} if the line has fewer than
     *         {@value #SENTENCE_END_MAX_LENGTH} characters
     */
    private static boolean endsSentence(String line) {
        return line.length() < SENTENCE_END_MAX_LENGTH;
    }
}