package edu.stanford.nlp.international.arabic.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.regex.*;
import edu.stanford.nlp.trees.Tree;
public class LabeledATBDataset extends ATBArabicDataset {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(LabeledATBDataset.class);
@Override
public void build() {
for(File path : pathsToData) {
int prevSize = treebank.size();
if(splitFilter == null)
treebank.loadPath(path,treeFileExtension,false);
else
treebank.loadPath(path,splitFilter);
toStringBuffer.append(String.format(" Loaded %d trees from %s\n", treebank.size() - prevSize, path.getPath()));
}
PrintWriter outfile = null;
PrintWriter flatFile = null;
try {
outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));
flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName),"UTF-8"))) : null;
ArabicRawTreeNormalizer tv = new LabelingTreeNormalizer(outfile,flatFile);
treebank.apply(tv);
outputFileList.add(outFileName);
if(makeFlatFile) {
outputFileList.add(flatFileName);
toStringBuffer.append(" Made flat files\n");
}
} catch (UnsupportedEncodingException e) {
System.err.printf("%s: Filesystem does not support UTF-8 output\n", this.getClass().getName());
e.printStackTrace();
} catch (FileNotFoundException e) {
System.err.printf("%s: Could not open %s for writing\n", this.getClass().getName(), outFileName);
} finally {
if(outfile != null)
outfile.close();
if(flatFile != null)
flatFile.close();
}
}
protected class LabelingTreeNormalizer extends ArabicRawTreeNormalizer {
private final Pattern leftClitic;
private final Pattern rightClitic;
public LabelingTreeNormalizer(PrintWriter outFile, PrintWriter flatFile) {
super(outFile, flatFile);
leftClitic = Pattern.compile("^-");
rightClitic = Pattern.compile("-$");
}
@Override
protected void processPreterminal(Tree node) {
String rawTag = node.value();
if(rawTag.equals("-NONE-"))
return;
String rawWord = node.firstChild().value().trim();
Matcher left = leftClitic.matcher(rawWord);
boolean hasLeft = left.find();
Matcher right = rightClitic.matcher(rawWord);
boolean hasRight = right.find();
if(rawTag.equals("PUNC") || !(hasRight || hasLeft)) {
node.firstChild().setValue("XSEG");
} else if(hasRight && hasLeft){
node.firstChild().setValue("SEGC");
} else if(hasRight) {
node.firstChild().setValue("SEGL");
} else if(hasLeft) {
node.firstChild().setValue("SEGR");
} else {
throw new RuntimeException("Messy token: " + rawWord);
}
}
}
}