// FrenchXMLTreeReader.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.trees.international.french; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.*;

import javax.xml.parsers.DocumentBuilder;

import edu.stanford.nlp.ling.*;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import edu.stanford.nlp.io.ReaderInputStream;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.XMLUtils;

/**
 * A reader for XML format French Treebank files. Note that the raw
 * XML files are in ISO-8859-1 format, so they must be converted to UTF-8.
 * <p>
 * Handles multiword expressions (MWEs).
 * <p>
 * One difference worth documenting between this and the
 * PennTreeReader is that this does not unescape \* and \/ the way the
 * PennTreeReader does.  The French Treebank we are using does not
 * use those escapings.
 *
 * @author Spence Green
 *
 */
public class FrenchXMLTreeReader implements TreeReader  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(FrenchXMLTreeReader.class);

  // Byte stream wrapped around the caller's Reader; released by close().
  private InputStream stream;
  // Applied to every tree, nonterminal label and terminal string produced here.
  private final TreeNormalizer treeNormalizer;
  // Creates all leaves and internal nodes of the returned trees.
  private final TreeFactory treeFactory;

  // XML element names: one SENT element per sentence, one w element per word
  // (w elements may be nested for multiword expressions).
  private static final String NODE_SENT = "SENT";
  private static final String NODE_WORD = "w";

  // XML attribute names.
  private static final String ATTR_NUMBER = "nb";        // sentence id, read from SENT
  private static final String ATTR_POS = "cat";          // POS tag
  private static final String ATTR_POS_MWE = "catint";   // POS tag of a part of a compound / MWE
  private static final String ATTR_LEMMA = "lemma";
  private static final String ATTR_MORPH = "mph";        // declared but not read by this class
  private static final String ATTR_EE = "ee";            // value returned by getMorph()
  private static final String ATTR_SUBCAT = "subcat";

  // Prefix for MWE nodes
  private static final String MWE_PHRASAL = "MW";

  // Placeholder labels inserted when the XML gives no usable value.
  public static final String EMPTY_LEAF = "-NONE-";
  public static final String MISSING_PHRASAL = "DUMMYP";
  public static final String MISSING_POS = "DUMMY";

  // SENT elements of the parsed document; remains null if parsing failed.
  private NodeList sentences;
  // Index of the next element of 'sentences' to be read.
  private int sentIdx;

  /**
   * Read parse trees from a Reader, building them with a
   * {@link LabeledScoredTreeFactory} and normalizing them with a
   * {@link FrenchTreeNormalizer}.
   *
   * @param in The <code>Reader</code>
   * @param ccTagset flag passed unchanged to the <code>FrenchTreeNormalizer</code> constructor
   */
  public FrenchXMLTreeReader(Reader in, boolean ccTagset) {
    this(in, new LabeledScoredTreeFactory(), new FrenchTreeNormalizer(ccTagset));
  }

  /**
   * Read parse trees from a Reader.
   * <p>
   * The whole XML document is parsed here, and its {@code SENT} elements are
   * kept for later retrieval through {@link #readTree()}. If the document
   * cannot be parsed, the stack trace is printed and no sentences are
   * recorded, so {@link #readTree()} will return {@code null}.
   *
   * @param in Reader supplying the XML text
   * @param tf TreeFactory -- factory to create some kind of Tree
   * @param tn the method of normalizing trees
   */
  public FrenchXMLTreeReader(Reader in, TreeFactory tf, TreeNormalizer tn) {
    treeFactory = tf;
    treeNormalizer = tn;

    // The Reader is re-encoded to bytes using the language pack's encoding
    // because the XML parser consumes an InputStream.
    final TreebankLanguagePack languagePack = new FrenchTreebankLanguagePack();
    stream = new ReaderInputStream(in, languagePack.getEncoding());

    final DocumentBuilder xmlParser = XMLUtils.getXmlParser();
    try {
      final Document document = xmlParser.parse(stream);
      final Element documentRoot = document.getDocumentElement();
      sentences = documentRoot.getElementsByTagName(NODE_SENT);
      sentIdx = 0;
    } catch (SAXException | IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Closes the underlying input stream if it is still open. Calling this
   * method again afterwards has no effect.
   * <p>
   * An {@link IOException} raised while closing is deliberately ignored; the
   * stream reference is dropped regardless so a failed close is not retried.
   */
  @Override
  public void close() {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException ignored) {
      // Best-effort close: nothing useful can be done about a failure here.
    } finally {
      stream = null;
    }
  }

  /**
   * Returns the next tree of the document.
   * <p>
   * Sentences whose XML yields no tree are skipped. Each returned tree has
   * been passed through the tree normalizer; when its root label is a
   * {@link CoreLabel}, the sentence's {@code nb} attribute is stored on it as
   * {@link CoreAnnotations.SentenceIDAnnotation}.
   *
   * @return the next normalized tree, or {@code null} when no sentences
   *         remain (or none were loaded)
   */
  @Override
  public Tree readTree() {
    if (sentences == null) {
      return null;
    }

    while (sentIdx < sentences.getLength()) {
      final Node sentRoot = sentences.item(sentIdx++);
      final Tree rawTree = getTreeFromXML(sentRoot);
      if (rawTree == null) {
        // Nothing usable in this sentence; try the next one.
        continue;
      }

      final Tree normalized = treeNormalizer.normalizeWholeTree(rawTree, treeFactory);
      if (normalized.label() instanceof CoreLabel) {
        final String ftbId = ((Element) sentRoot).getAttribute(ATTR_NUMBER);
        ((CoreLabel) normalized.label()).set(CoreAnnotations.SentenceIDAnnotation.class, ftbId);
      }
      return normalized;
    }
    return null;
  }

  /**
   * Determine the POS tag of a word element.
   * <p>
   * wsg2010: Sometimes the cat attribute is not present, in which case the POS
   * is in the attribute catint, which indicates a part of a compound / MWE.
   * <p>
   * Attribute values are trimmed, and a blank value is treated the same as a
   * missing attribute. Emptiness is tested with {@code isEmpty()} rather than
   * a reference comparison against {@code ""}, which is not reliable for
   * strings produced at runtime.
   *
   * @param node the word element
   * @return the trimmed {@code cat} value if non-empty, otherwise the trimmed
   *         {@code catint} value if non-empty, otherwise {@link #MISSING_POS}
   */
  private String getPOS(Element node) {
    String attrPOS = node.hasAttribute(ATTR_POS) ? node.getAttribute(ATTR_POS).trim() : "";
    if (!attrPOS.isEmpty()) {
      return attrPOS;
    }

    String attrPOSMWE = node.hasAttribute(ATTR_POS_MWE) ? node.getAttribute(ATTR_POS_MWE).trim() : "";
    if (!attrPOSMWE.isEmpty()) {
      return attrPOSMWE;
    }

    return MISSING_POS;
  }

  /**
   * Read the lemma attribute of a word element and split it into tokens the
   * same way the word text is split.
   *
   * @param node the word element
   * @return the lemma tokens, or {@code null} if the element carries no lemma
   */
  private List<String> getLemma(Element node) {
    final String rawLemma = node.getAttribute(ATTR_LEMMA);
    final boolean lemmaPresent = rawLemma != null && !rawLemma.isEmpty();
    return lemmaPresent ? getWordString(rawLemma) : null;
  }

  /**
   * Read the morphological analysis of a leaf, which is taken from the "ee"
   * attribute. Note that the "ee" field contains the relativizer flag.
   *
   * @param node the word element
   * @return the value of the "ee" attribute, or the empty string if it is null
   */
  private String getMorph(Element node) {
    final String eeValue = node.getAttribute(ATTR_EE);
    if (eeValue != null) {
      return eeValue;
    }
    return "";
  }

  /**
   * Read the POS subcategory of a word element.
   *
   * @param node the word element
   * @return the value of the "subcat" attribute, or the empty string if it is null
   */
  private String getSubcat(Element node) {
    final String subcatValue = node.getAttribute(ATTR_SUBCAT);
    if (subcatValue != null) {
      return subcatValue;
    }
    return "";
  }

  /**
   * Split the text of a terminal into its tokens. Terminals may consist of
   * one or more whitespace-delimited tokens.
   * <p>
   * Parentheses are removed from any text longer than one character. Text
   * that, with whitespace removed, consists only of digits or only of
   * punctuation is kept as a single token; everything else is split on
   * whitespace.
   * <p>
   * wsg2010: Marie recommends replacing empty terminals with -NONE- instead of using the lemma
   * (these are usually the determiner)
   *
   * @param text the raw terminal text; may be null or empty
   * @return the token list, containing only {@link #EMPTY_LEAF} for null or empty text
   * @throws RuntimeException if splitting produces no tokens at all
   */
  private List<String> getWordString(String text) {
    if (text == null || text.isEmpty()) {
      final List<String> placeholder = new ArrayList<>();
      placeholder.add(EMPTY_LEAF);
      return placeholder;
    }

    // Strip spurious parens, but leave a lone single character untouched
    final String cleaned = (text.length() > 1) ? text.replaceAll("[\\(\\)]", "") : text;

    // Numbers and punctuation runs are collapsed into one token
    final String compact = cleaned.replaceAll("\\s+", "");
    final List<String> tokens;
    if (compact.matches("\\d+") || compact.matches("\\p{Punct}+")) {
      tokens = new ArrayList<>();
      tokens.add(compact);
    } else {
      tokens = Arrays.asList(cleaned.split("\\s+"));
    }

    if (tokens.isEmpty()) {
      throw new RuntimeException(this.getClass().getName() + ": Zero length token list for: " + cleaned);
    }
    return tokens;
  }

  /**
   * Recursively convert an XML element into a tree.
   * <p>
   * A {@code w} element without nested {@code w} elements is a terminal and
   * is converted by {@link #buildTerminalSubtree(Element)}. Any other element
   * becomes an internal node whose children are the trees of its child
   * elements. A {@code w} element that has nested words and a {@code cat}
   * attribute is a multiword expression: it is labeled with that attribute
   * and passed through {@link #postProcessMWE(Tree)}.
   *
   * @param root the XML node to convert; must be an {@link Element}
   * @return the tree for this element, or {@code null} if an internal node
   *         ends up with no children
   */
  private Tree getTreeFromXML(Node root) {
    final Element eRoot = (Element) root;

    if (eRoot.getNodeName().equals(NODE_WORD) &&
        eRoot.getElementsByTagName(NODE_WORD).getLength() == 0) {
      return buildTerminalSubtree(eRoot);
    }

    List<Tree> kids = new ArrayList<>();
    for (Node childNode = eRoot.getFirstChild(); childNode != null; childNode = childNode.getNextSibling()) {
      if (childNode.getNodeType() != Node.ELEMENT_NODE) continue;
      Tree t = getTreeFromXML(childNode);
      if (t == null) {
        System.err.printf("%s: Discarding empty tree (root: %s)%n", this.getClass().getName(),childNode.getNodeName());
      } else {
        kids.add(t);
      }
    }

    // MWEs are word elements with nested words; their label comes from the
    // cat attribute instead of the element name.
    String rootLabel = eRoot.getNodeName().trim();
    boolean isMWE = rootLabel.equals(NODE_WORD) && eRoot.hasAttribute(ATTR_POS);
    if (isMWE) {
      rootLabel = eRoot.getAttribute(ATTR_POS).trim();
    }

    if (kids.isEmpty()) {
      return null;
    }

    Tree t = treeFactory.newTreeNode(treeNormalizer.normalizeNonterminal(rootLabel), kids);
    if (t != null && isMWE) {
      t = postProcessMWE(t);
    }
    return t;
  }

  /**
   * Build the subtree for a terminal word element.
   * <p>
   * A single-token terminal becomes a preterminal labeled with the word's
   * normalized POS. Terminals can also hold multiple tokens (MWEs); these are
   * made into a flat structure for now: a {@link #MISSING_PHRASAL} node over
   * one {@link #MISSING_POS} preterminal per token.
   *
   * @param wordElem a word element with no nested word elements
   * @return the subtree covering the element's token(s)
   */
  private Tree buildTerminalSubtree(Element wordElem) {
    String posStr = getPOS(wordElem);
    posStr = treeNormalizer.normalizeNonterminal(posStr);

    List<String> lemmas = getLemma(wordElem);
    String morph = getMorph(wordElem);
    List<String> leafToks = getWordString(wordElem.getTextContent().trim());
    String subcat = getSubcat(wordElem);

    if (lemmas != null && lemmas.size() != leafToks.size()) {
      // If this happens (and it does for a few poorly edited trees)
      // we assume something has gone wrong and ignore the lemmas.
      log.info("Lemmas don't match tokens, ignoring lemmas: " +
                         "lemmas " + lemmas + ", tokens " + leafToks);
      lemmas = null;
    }

    List<Tree> kids = new ArrayList<>();
    if (leafToks.size() > 1) {
      for (int i = 0; i < leafToks.size(); ++i) {
        String lemma = (lemmas == null) ? null : lemmas.get(i);
        List<Tree> leafList = new ArrayList<>();
        leafList.add(newAnnotatedLeaf(leafToks.get(i), lemma, morph, subcat));

        Tree posNode = treeFactory.newTreeNode(MISSING_POS, leafList);
        if (posNode.label() instanceof HasTag) {
          ((HasTag) posNode.label()).setTag(MISSING_POS);
        }
        kids.add(posNode);
      }
      return treeFactory.newTreeNode(MISSING_PHRASAL, kids);
    }

    String lemma = (lemmas == null) ? null : lemmas.get(0);
    kids.add(newAnnotatedLeaf(leafToks.get(0), lemma, morph, subcat));

    Tree t = treeFactory.newTreeNode(posStr, kids);
    if (t.label() instanceof HasTag) {
      ((HasTag) t.label()).setTag(posStr);
    }
    return t;
  }

  /**
   * Create a leaf for one token and copy the word-level annotations onto its
   * label, each only if the label type supports it.
   *
   * @param token the raw token; it is normalized before use
   * @param lemma the token's lemma, or {@code null} to leave the lemma unset
   * @param morph value stored as the label's original text
   * @param subcat value stored as the label's category
   * @return the new leaf
   */
  private Tree newAnnotatedLeaf(String token, String lemma, String morph, String subcat) {
    String word = treeNormalizer.normalizeTerminal(token);
    Tree leafNode = treeFactory.newLeaf(word);
    if (leafNode.label() instanceof HasWord) {
      ((HasWord) leafNode.label()).setWord(word);
    }
    if (leafNode.label() instanceof CoreLabel && lemma != null) {
      ((CoreLabel) leafNode.label()).setLemma(lemma);
    }
    if (leafNode.label() instanceof HasContext) {
      ((HasContext) leafNode.label()).setOriginalText(morph);
    }
    if (leafNode.label() instanceof HasCategory) {
      ((HasCategory) leafNode.label()).setCategory(subcat);
    }
    return leafNode;
  }


  /**
   * Finish a multiword-expression node.
   * <p>
   * If the node's yield, with all whitespace removed, consists solely of
   * digits and punctuation, the node is replaced by a new node with the same
   * value over a single leaf holding that concatenated yield. Otherwise the
   * node is kept and its value is prefixed with {@link #MWE_PHRASAL}.
   *
   * @param t the MWE node
   * @return the replacement node, or {@code t} itself after relabeling
   */
  private Tree postProcessMWE(Tree t) {
    final String flatYield = SentenceUtils.listToString(t.yield()).replaceAll("\\s+", "");
    final boolean digitsAndPunctOnly = flatYield.matches("[\\d\\p{Punct}]*");

    if (!digitsAndPunctOnly) {
      t.setValue(MWE_PHRASAL + t.value());
      return t;
    }

    final List<Tree> singleLeaf = new ArrayList<>();
    singleLeaf.add(treeFactory.newLeaf(flatYield));
    return treeFactory.newTreeNode(t.value(), singleLeaf);
  }


  /**
   * For debugging. Reads every tree from each given file and prints one line
   * per tree to standard output in the form
   * {@code <file name without extension>-<sentence id> TAB <tree>}.
   * Per-file and total tree counts are written to standard error.
   *
   * @param args paths of the treebank XML files to read (UTF-8)
   */
  public static void main(String[] args) {
    if(args.length < 1) {
      System.err.printf("Usage: java %s tree_file(s)%n%n",FrenchXMLTreeReader.class.getName());
      System.exit(-1);
    }

    List<File> fileList = new ArrayList<>();
    for (String arg : args) fileList.add(new File(arg));

    TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
    int totalTrees = 0;
    Set<String> morphAnalyses = Generics.newHashSet();
    try {
      for(File file : fileList) {
        // Strip the extension if there is one; a name without a dot is used as is.
        String fileName = file.getName();
        int dotIdx = fileName.lastIndexOf('.');
        String canonicalFileName = (dotIdx < 0) ? fileName : fileName.substring(0, dotIdx);

        int numTrees = 0;
        TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8")));
        try {
          Tree t;
          while ((t = tr.readTree()) != null) {
            // The sentence id is only available when the root label is a CoreLabel.
            String ftbID = (t.label() instanceof CoreLabel)
                ? ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class)
                : null;
            System.out.printf("%s-%s\t%s%n",canonicalFileName, ftbID, t.toString());
            List<Label> leaves = t.yield();
            for(Label label : leaves) {
              if(label instanceof CoreLabel)
                morphAnalyses.add(((CoreLabel) label).originalText());
            }
            numTrees++;
          }
        } finally {
          // Release the reader even if reading a tree failed.
          tr.close();
        }

        System.err.printf("%s: %d trees%n",file.getName(),numTrees);
        totalTrees += numTrees;
      }

//wsg2011: Print out the observed morphological analyses
//      for(String analysis : morphAnalyses)
//        log.info(analysis);

      System.err.printf("%nRead %d trees%n",totalTrees);

    } catch (IOException e) {
      // Also covers FileNotFoundException, which is a subclass of IOException.
      e.printStackTrace();
    }
  }
}