package clear.experiment;
import clear.dep.DepNode;
import clear.dep.DepTree;
import clear.reader.DepReader;
import clear.util.IOUtil;
import java.io.File;
import java.io.PrintStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Filters and normalizes the dependency trees stored in the {@code .dep} files of a directory.
 *
 * <p>For every file in the input directory whose name ends with {@code ".dep"}, each tree is read
 * with a {@link DepReader}, passed through {@link #check(DepTree)}, and, if accepted, written to a
 * file of the same name in the output directory.
 */
public class DepKrClean {
    /** Finds a 'Q' or 'U' immediately followed by '=' and one or more digits, anywhere in a form. */
    static final Pattern P_Q = Pattern.compile("[QU]=\\d+");

    /**
     * Matches a form that starts with 'Q' or 'U' followed by one or more digits; group 1 captures
     * whatever follows that prefix.
     */
    static final Pattern P_Qd = Pattern.compile("^[QU]\\d+(.*)");

    /**
     * Processes every {@code .dep} file found directly in {@code inputDir} and writes the accepted
     * trees to a same-named file in {@code outputDir}. The name of each processed file is printed
     * to standard output.
     *
     * @param inputDir  directory containing the {@code .dep} files to read
     * @param outputDir directory that receives the cleaned files
     * @throws IllegalArgumentException if {@code inputDir} cannot be listed (it does not exist, is
     *                                  not a directory, or an I/O error occurs)
     */
    public DepKrClean(String inputDir, String outputDir) {
        File dir = new File(inputDir);
        // File.list() returns null rather than throwing when the path is not a listable directory.
        String[] filenames = dir.list();
        if (filenames == null) {
            throw new IllegalArgumentException("Cannot list input directory: " + inputDir);
        }

        for (String filename : filenames) {
            if (!filename.endsWith(".dep")) {
                continue;
            }
            System.out.println(filename);
            cleanFile(inputDir + File.separator + filename, outputDir + File.separator + filename);
        }
    }

    /** Copies every tree of {@code inputPath} accepted by {@link #check(DepTree)} to {@code outputPath}. */
    private void cleanFile(String inputPath, String outputPath) {
        DepReader reader = new DepReader(inputPath, true);
        try {
            PrintStream fout = IOUtil.createPrintFileStream(outputPath);
            try {
                DepTree tree;
                while ((tree = reader.nextTree()) != null) {
                    if (check(tree)) {
                        // println plus the explicit "\n" leaves an empty line after each tree.
                        fout.println(tree + "\n");
                    }
                }
            } finally {
                fout.close();
            }
        } finally {
            // Close the reader even when reading, checking or writing fails part-way through.
            reader.close();
        }
    }

    /**
     * Decides whether a tree should be kept and normalizes its nodes in place.
     *
     * <p>A tree is rejected when its size is exactly 2, or when any node's form contains a match
     * of {@link #P_Q}. For every node visited (indices 1 and up; index 0 is skipped, presumably
     * because it is an artificial root -- confirm against {@code DepTree}):
     * <ul>
     *   <li>one leading and one trailing {@code '+'} are stripped from the lemma;</li>
     *   <li>if the form contains no {@code '*'}, every {@code "*" + "/"} sequence in the lemma is
     *       replaced by {@code "/"};</li>
     *   <li>for the node at index 1 only, if the form matches {@link #P_Qd}, the form is replaced
     *       by the text following the matched prefix, with the part of the lemma before its first
     *       {@code '/'} prepended when the part after that {@code '/'} starts with {@code "SN"}
     *       (presumably a tag prefix -- confirm against the corpus format). The form is left
     *       untouched if the replacement would be empty.</li>
     * </ul>
     *
     * <p>Note that lemma edits made to earlier nodes are not undone when a later node causes the
     * tree to be rejected.
     *
     * @param tree the tree to inspect; its nodes may be modified
     * @return {@code true} if the tree should be kept, {@code false} if it should be dropped
     */
    public boolean check(DepTree tree) {
        if (tree.size() == 2) {
            return false;
        }

        for (int i = 1; i < tree.size(); i++) {
            DepNode node = tree.get(i);

            if (node.lemma.startsWith("+")) {
                node.lemma = node.lemma.substring(1);
            }
            if (node.lemma.endsWith("+")) {
                node.lemma = node.lemma.substring(0, node.lemma.length() - 1);
            }
            if (!node.form.contains("*")) {
                node.lemma = node.lemma.replaceAll("\\*\\/", "/");
            }

            if (P_Q.matcher(node.form).find()) {
                return false;
            }

            if (i == 1) {
                Matcher m = P_Qd.matcher(node.form);
                if (m.find()) {
                    String form = m.group(1);
                    int idx = node.lemma.indexOf('/');
                    // Only split the lemma when it actually contains '/'; without this guard a
                    // lemma lacking the separator would make substring(0, -1) throw.
                    if (idx >= 0 && node.lemma.startsWith("SN", idx + 1)) {
                        form = node.lemma.substring(0, idx) + form;
                    }
                    if (!form.isEmpty()) {
                        node.form = form;
                    }
                }
            }
        }
        return true;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: java clear.experiment.DepKrClean <inputDir> <outputDir>");
            return;
        }
        new DepKrClean(args[0], args[1]);
    }
}