package ir.ac.iust.nlp.dependencyparser.inputoutput;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.List;
/**
*
* @author Mojtaba Khallash
*/
public class ReadCorpus {
public static void getStatistics(String source) {
int sen = 0;
int nonProjSen = 0;
int projSen = 0;
int word = 0;
int nonProj = 0;
int proj = 0;
int totlaLen = 0;
int maxLen = 0;
try {
try (BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(source), "UTF8"))) {
List<Integer> Min = new LinkedList<>();
List<Integer> Max = new LinkedList<>();
String line;
boolean isProjSen = true;
int len = 0;
while ((line = reader.readLine()) != null) {
if (line.trim().length() != 0) {
len++;
word++;
String[] parts = line.split("\t");
int current = Integer.parseInt(parts[0]);
int head = Integer.parseInt(parts[6]);
if (Math.abs(current - head) > 1) {
boolean isProj = true;
for (int i = 0; i < Min.size(); i++) {
int min = Min.get(i);
int max = Max.get(i);
if (min < head && head < max &&
(current < min || max < current)) {
nonProj++;
isProj = false;
isProjSen = false;
break;
}
else if (min < current && current < max &&
(head < min || max < head)) {
nonProj++;
isProj = false;
isProjSen = false;
break;
}
}
if (isProj == true) {
proj++;
}
Min.add(Math.min(current, head));
Max.add(Math.max(current, head));
}
else {
proj++;
}
}
else {
if (isProjSen == false) {
nonProjSen++;
}
else {
projSen++;
}
isProjSen = true;
totlaLen += len;
maxLen = Math.max(maxLen, len);
len = 0;
sen++;
Min = new LinkedList<>();
Max = new LinkedList<>();
}
}
}
}
catch(IOException | NumberFormatException ex) {
ex.printStackTrace();
}
String pattern = "%.2f";
System.out.println("# Sentences: " + sen);
System.out.println("# Words: " + word);
System.out.println("------------------------------------------------");
System.out.println("# Maximum length: " + maxLen);
System.out.println("# Average length: " + String.format(pattern, totlaLen / (float)sen));
System.out.println("------------------------------------------------");
System.out.println("# Projective Arcs: " + proj + " (" + String.format(pattern, (proj * 100.0 / word)) + "%)");
System.out.println("# Non-Projective Arcs: " + nonProj + " (" + String.format(pattern, (nonProj * 100.0 / word)) + "%)");
System.out.println("------------------------------------------------");
System.out.println("# Projective Sentences: " + projSen + " (" + String.format(pattern, (projSen * 100.0 / sen)) + "%)");
System.out.println("# Non-Projective Sentences: " + nonProjSen + " (" + String.format(pattern, (nonProjSen * 100.0 / sen)) + "%)");
}
}