/*
 * LabeledATBDataset.java example
 *
 * Explorer: Stanford-NLP-master / CoreNLP-master
 */
package edu.stanford.nlp.international.arabic.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.regex.*;

import edu.stanford.nlp.trees.Tree;

/**
 * An {@link ATBArabicDataset} whose {@link #build()} step loads the configured treebank paths
 * and then applies a {@link LabelingTreeNormalizer} to every loaded tree. The normalizer
 * replaces the word under each preterminal with a label derived from the leading/trailing
 * dash markers on that word.
 */
public class LabeledATBDataset extends ATBArabicDataset  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(LabeledATBDataset.class);

  /**
   * Loads all trees found under {@code pathsToData}, opens the UTF-8 output writer(s), and
   * applies a {@link LabelingTreeNormalizer} constructed over those writers to the treebank.
   * <p>
   * A progress line per input path is appended to {@code toStringBuffer}, and the names of the
   * output files are added to {@code outputFileList}. The flat file is only opened and
   * registered when {@code makeFlatFile} is set; otherwise the normalizer receives
   * {@code null} for it.
   * <p>
   * Failure to open an output file is reported on standard error and the method then returns
   * normally; that failure is not propagated to the caller. Both writers are always closed
   * before returning.
   */
  @Override
  public void build() {
    for (File path : pathsToData) {
      final int sizeBefore = treebank.size();
      if (splitFilter == null) {
        treebank.loadPath(path, treeFileExtension, false);
      } else {
        treebank.loadPath(path, splitFilter);
      }

      final int numLoaded = treebank.size() - sizeBefore;
      toStringBuffer.append(String.format(" Loaded %d trees from %s\n", numLoaded, path.getPath()));
    }

    PrintWriter outfile = null;
    PrintWriter flatFile = null;
    // Remembers which file is currently being opened so that the FileNotFoundException
    // handler can name the file that actually failed instead of always blaming outFileName.
    boolean openingFlatFile = false;
    try {
      outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));
      if (makeFlatFile) {
        openingFlatFile = true;
        flatFile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName),"UTF-8")));
      }

      ArabicRawTreeNormalizer tv = new LabelingTreeNormalizer(outfile,flatFile);

      treebank.apply(tv);

      outputFileList.add(outFileName);

      if (makeFlatFile) {
        outputFileList.add(flatFileName);
        toStringBuffer.append(" Made flat files\n");
      }

    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output\n", this.getClass().getName());
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing\n", this.getClass().getName(),
          openingFlatFile ? flatFileName : outFileName);
      // Keep the underlying reason (missing directory, permissions, ...) visible, in line
      // with the handler above.
      e.printStackTrace();
    } finally {
      if (outfile != null) {
        outfile.close();
      }
      if (flatFile != null) {
        flatFile.close();
      }
    }
  }

  /**
   * Tree normalizer that overwrites the word under every non-empty preterminal with one of
   * four labels, chosen from the preterminal's tag and from whether the trimmed word starts
   * and/or ends with a dash:
   * <ul>
   *   <li>{@code XSEG} &ndash; tag is {@code PUNC}, or the word has neither dash;</li>
   *   <li>{@code SEGC} &ndash; the word has both a leading and a trailing dash;</li>
   *   <li>{@code SEGL} &ndash; the word has only a trailing dash;</li>
   *   <li>{@code SEGR} &ndash; the word has only a leading dash.</li>
   * </ul>
   * Preterminals tagged {@code -NONE-} are left untouched.
   */
  protected class LabelingTreeNormalizer extends ArabicRawTreeNormalizer {

    /** Matches a dash at the very start of the word. */
    private final Pattern leftClitic = Pattern.compile("^-");
    /** Matches a dash at the end of the word. */
    private final Pattern rightClitic = Pattern.compile("-$");

    /**
     * @param outFile writer handed to the superclass constructor as the main output
     * @param flatFile writer handed to the superclass constructor as the flat output;
     *                 {@link LabeledATBDataset#build()} passes {@code null} when no flat file
     *                 is requested
     */
    public LabelingTreeNormalizer(PrintWriter outFile, PrintWriter flatFile) {
      super(outFile, flatFile);
    }

    /**
     * Replaces the value of {@code node}'s first child with its segmentation label (see the
     * class description for the mapping). Does nothing for {@code -NONE-} preterminals.
     *
     * @param node a preterminal; its first child is expected to be the word leaf
     */
    @Override
    protected void processPreterminal(Tree node) {
      final String rawTag = node.value();
      if (rawTag.equals("-NONE-")) {
        return;
      }

      final Tree leaf = node.firstChild();
      final String rawWord = leaf.value().trim();

      final boolean hasLeft = leftClitic.matcher(rawWord).find();
      final boolean hasRight = rightClitic.matcher(rawWord).find();

      // The four branches below are exhaustive over (hasLeft, hasRight), so no fallback
      // "messy token" case can ever be reached.
      final String label;
      if (rawTag.equals("PUNC") || !(hasRight || hasLeft)) {
        label = "XSEG";
      } else if (hasRight && hasLeft) {
        label = "SEGC";
      } else if (hasRight) {
        label = "SEGL";
      } else {
        label = "SEGR";
      }
      leaf.setValue(label);
    }
  }
}