// SpanishXMLTreeReader.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.trees.international.spanish; 
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;

import edu.stanford.nlp.util.*;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import edu.stanford.nlp.io.ReaderInputStream;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasLemma;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreebankLanguagePack;

/**
 * A reader for XML format AnCora treebank files.
 *
 * This reader makes AnCora-specific fixes; see
 * {@link #getPOS(Element)}.
 *
 * @author Jon Gauthier
 * @author Spence Green (original French XML reader)
 *
 */
public class SpanishXMLTreeReader implements TreeReader  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(SpanishXMLTreeReader.class);

  /** Byte stream the XML document is parsed from; reset to {@code null} by {@link #close()}. */
  private InputStream stream;

  /** Normalizes terminal and nonterminal labels as well as whole trees. */
  private final TreeNormalizer treeNormalizer;

  /** Factory used to create every leaf and internal node of the returned trees. */
  private final TreeFactory treeFactory;

  /** When set, {@link #getPOS(Element)} applies extra heuristics for missing tags. */
  private final boolean simplifiedTagset;

  /** When set, constituent labels are suffixed with coordination / clause-type information. */
  private final boolean detailedAnnotations;

  /** Name of the XML elements which are treated as sentence roots. */
  private static final String NODE_SENT = "sentence";

  // Names of the XML attributes read from word (leaf) elements
  private static final String ATTR_WORD = "wd";
  private static final String ATTR_LEMMA = "lem";
  private static final String ATTR_FUNC = "func";
  private static final String ATTR_NAMED_ENTITY = "ne";
  private static final String ATTR_POS = "pos";
  private static final String ATTR_POSTYPE = "postype";
  private static final String ATTR_ELLIPTIC = "elliptic";
  private static final String ATTR_PUNCT = "punct";
  private static final String ATTR_GENDER = "gen";
  private static final String ATTR_NUMBER = "num";

  // Constituent annotations
  private static final String ATTR_COORDINATING = "coord";
  private static final String ATTR_CLAUSE_TYPE = "clausetype";

  /** Sentence elements of the parsed document; stays {@code null} if parsing failed. */
  private NodeList sentences;

  /** Index into {@link #sentences} of the next sentence to be read. */
  private int sentIdx;

  /**
   * Read parse trees from a Reader.
   *
   * <p>The XML document is parsed eagerly by this constructor. If parsing
   * fails, the failure is reported and swallowed: no sentences are recorded
   * and {@link #readTree()} will return {@code null}.
   *
   * @param filename Name of the file being read; only used in diagnostics
   * @param in The {@code Reader}
   * @param simplifiedTagset If {@code true}, convert part-of-speech labels to a
   *          simplified version of the EAGLES tagset, where the tags do not
   *          include extensive morphological analysis
   * @param aggressiveNormalization Perform aggressive "normalization"
   *          on the trees read from the provided corpus documents:
   *          split multi-word tokens into their constituent words (and
   *          infer parts of speech of the constituent words).
   * @param retainNER Retain NER information in preterminals (for later
   *          use in {@code MultiWordPreprocessor}) and add NER-specific
   *          parents to single-word NE tokens
   * @param detailedAnnotations Retain detailed tree node annotations. These
   *          annotations on parse tree constituents may be useful for
   *          e.g. training a parser.
   */
  public SpanishXMLTreeReader(String filename, Reader in, boolean simplifiedTagset,
                              boolean aggressiveNormalization,
                              boolean retainNER, boolean detailedAnnotations) {
    TreebankLanguagePack tlp = new SpanishTreebankLanguagePack();

    this.simplifiedTagset = simplifiedTagset;
    this.detailedAnnotations = detailedAnnotations;

    stream = new ReaderInputStream(in, tlp.getEncoding());
    treeFactory = new LabeledScoredTreeFactory();
    treeNormalizer =
      new SpanishTreeNormalizer(simplifiedTagset,
                                aggressiveNormalization,
                                retainNER);

    DocumentBuilder parser = XMLUtils.getXmlParser();
    try {
      final Document xml = parser.parse(stream);
      final Element root = xml.getDocumentElement();
      sentences = root.getElementsByTagName(NODE_SENT);
      sentIdx = 0;

    } catch (SAXException e) {
      log.info("Parse exception while reading " + filename);
      e.printStackTrace();
    } catch (IOException e) {
      // Name the offending file here too, matching the SAX branch above
      log.info("I/O exception while reading " + filename);
      e.printStackTrace();
    }
  }

  /**
   * Close the underlying input stream if it is still open.
   *
   * A failure while closing is deliberately ignored; in that case the stream
   * reference is left in place.
   */
  @Override
  public void close() {
    if (stream == null) {
      return;
    }

    try {
      stream.close();
      stream = null;
    } catch (IOException ignored) {
      // Best-effort close: silently ignore
    }
  }

  /**
   * Return the next non-empty tree of the document, normalized, or
   * {@code null} once no sentences remain (or none were ever loaded).
   *
   * Sentences which yield no tree are skipped. When the root label is a
   * {@link CoreLabel}, the index of the sentence is stored on it as its
   * sentence ID.
   */
  @Override
  public Tree readTree() {
    if (sentences == null) {
      return null;
    }

    while (sentIdx < sentences.getLength()) {
      final int sentenceId = sentIdx;
      sentIdx++;

      final Tree raw = getTreeFromXML(sentences.item(sentenceId));
      if (raw == null) {
        // Nothing usable in this sentence; try the next one
        continue;
      }

      final Tree normalized = treeNormalizer.normalizeWholeTree(raw, treeFactory);
      if (normalized.label() instanceof CoreLabel) {
        CoreLabel rootLabel = (CoreLabel) normalized.label();
        rootLabel.set(CoreAnnotations.SentenceIDAnnotation.class,
                      Integer.toString(sentenceId));
      }

      if (normalized != null) {
        return normalized;
      }
    }

    return null;
  }

  /** A word node is a childless element which carries a word attribute. */
  private boolean isWordNode(Element node) {
    if (!node.hasAttribute(ATTR_WORD)) {
      return false;
    }
    return !node.hasChildNodes();
  }

  /** An elliptic node is any element which carries the elliptic attribute. */
  private boolean isEllipticNode(Element node) {
    final boolean markedElliptic = node.hasAttribute(ATTR_ELLIPTIC);
    return markedElliptic;
  }

  /**
   * Determine the part of speech of the given leaf node.
   *
   * Use some heuristics to make up for missing part-of-speech labels:
   * <ul>
   *   <li>A seven-character {@code np...} tag ending in {@code '0'} has its
   *       final character replaced according to the node's named-entity
   *       attribute (location / person / organization).</li>
   *   <li>An empty tag is guessed from the word form, the named-entity
   *       attribute, the element name and the {@code postype} attribute;
   *       further guesses are made when the simplified tagset is in use.</li>
   * </ul>
   *
   * @param node Leaf element to inspect
   * @return The (possibly repaired or guessed) tag; the empty string when the
   *         tag is missing and no heuristic applies
   */
  private String getPOS(Element node) {
    String pos = node.getAttribute(ATTR_POS);

    String namedAttribute = node.getAttribute(ATTR_NAMED_ENTITY);
    if (pos.startsWith("np") && pos.length() == 7
        && pos.charAt(pos.length() - 1) == '0') {
      // Some nouns are missing a named entity annotation in the final
      // character of their POS tags, but still have a proper named
      // entity annotation in the `ne` attribute. Fix this:
      char annotation = '0';
      if (namedAttribute.equals("location")) {
        annotation = 'l';
      } else if (namedAttribute.equals("person")) {
        annotation = 'p';
      } else if (namedAttribute.equals("organization")) {
        annotation = 'o';
      }

      pos = pos.substring(0, 6) + annotation;
    } else if (pos.equals("")) {
      // Make up for some missing part-of-speech tags
      String word = getWord(node);
      if (word.equals(".")) {
        return "fp";
      }

      if (namedAttribute.equals("date")) {
        return "w";
      } else if (namedAttribute.equals("number")) {
        return "z0";
      }

      String tagName = node.getTagName();
      if (tagName.equals("i")) {
        return "i";
      } else if (tagName.equals("r")) {
        return "rg";
      } else if (tagName.equals("z")) {
        return "z0";
      }

      // Handle icky issues related to "que"
      String posType = node.getAttribute(ATTR_POSTYPE);
      if (tagName.equals("c") && posType.equals("subordinating")) {
        return "cs";
      } else if (tagName.equals("p") && posType.equals("relative")
                 && word.equalsIgnoreCase("que")) {
        return "pr0cn000";
      }

      if (tagName.equals("s") && (word.equalsIgnoreCase("de") || word.equalsIgnoreCase("del")
        || word.equalsIgnoreCase("en"))) {
        return "sps00";
      } else if (word.equals("REGRESA")) {
        return "vmip3s0";
      }

      if (simplifiedTagset) {
        // If we are using the simplified tagset, we can make some more
        // broad inferences
        if (word.equals("verme")) {
          return "vmn0000";
        } else if (tagName.equals("a")) {
          return "aq0000";
        } else if (posType.equals("proper")) {
          return "np00000";
        } else if (posType.equals("common")) {
          return "nc0s000";
        } else if (tagName.equals("d") && posType.equals("numeral")) {
          return "dn0000";
        } else if (tagName.equals("d")
          && (posType.equals("article") || word.equalsIgnoreCase("el") || word.equalsIgnoreCase("la"))) {
          return "da0000";
        } else if (tagName.equals("p") && posType.equals("relative")) {
          return "pr000000";
        } else if (tagName.equals("p") && posType.equals("personal")) {
          return "pp000000";
        } else if (tagName.equals("p") && posType.equals("indefinite")) {
          return "pi000000";
        } else if (tagName.equals("s") && word.equalsIgnoreCase("como")) {
          return "sp000";
        } else if (tagName.equals("n")) {
          // getAttribute yields "" (not null) for a missing attribute, so
          // guard against empty values before taking the first character
          char genCode = firstCharOrZero(node.getAttribute(ATTR_GENDER));
          char numCode = firstCharOrZero(node.getAttribute(ATTR_NUMBER));

          // Start from a String so that the chars are concatenated rather
          // than summed as integers
          return "n" + genCode + '0' + numCode + "000";
        }
      }

      if (node.hasAttribute(ATTR_PUNCT)) {
        if (word.equals("\"")) {
          return "fe";
        } else if (word.equals("'")) {
          return "fz";
        } else if (word.equals("-")) {
          return "fg";
        } else if (word.equals("(")) {
          return "fpa";
        } else if (word.equals(")")) {
          return "fpt";
        }

        // Any other punctuation mark
        return "fz";
      }
    }

    return pos;
  }

  /** Return the first character of {@code value}, or {@code '0'} if it is null or empty. */
  private static char firstCharOrZero(String value) {
    return (value == null || value.isEmpty()) ? '0' : value.charAt(0);
  }

  /**
   * Fetch the word form of the given node: the trimmed value of its word
   * attribute, or the normalizer's empty-leaf marker when that attribute
   * value is the empty string.
   */
  private String getWord(Element node) {
    final String raw = node.getAttribute(ATTR_WORD);
    return raw.equals("") ? SpanishTreeNormalizer.EMPTY_LEAF_VALUE : raw.trim();
  }

  /**
   * Recursively convert an XML element into a tree.
   *
   * Word and elliptic elements become preterminal-style nodes; any other
   * element becomes a constituent over the trees of its element children.
   * Children which yield no tree are reported on standard error and dropped.
   *
   * @param root XML node to convert (must be an element)
   * @return The resulting tree, or {@code null} if a constituent element has
   *         no usable children
   */
  private Tree getTreeFromXML(Node root) {
    final Element element = (Element) root;

    if (isWordNode(element)) {
      return buildWordNode(element);
    }
    if (isEllipticNode(element)) {
      return buildEllipticNode(element);
    }

    final List<Tree> subtrees = new ArrayList<>();
    Node child = element.getFirstChild();
    while (child != null) {
      // Only element children can contribute subtrees
      if (child.getNodeType() == Node.ELEMENT_NODE) {
        final Tree subtree = getTreeFromXML(child);
        if (subtree != null) {
          subtrees.add(subtree);
        } else {
          System.err.printf("%s: Discarding empty tree (root: %s)%n", this.getClass().getName(), child.getNodeName());
        }
      }
      child = child.getNextSibling();
    }

    if (subtrees.size() == 0) {
      return null;
    }
    return buildConstituentNode(element, subtrees);
  }

  /**
   * Create the preterminal for a word element: a node labeled with the
   * normalized part of speech, dominating a single leaf which holds the
   * normalized word.
   *
   * The word (and the lemma attribute) are also stored on the leaf label,
   * and the tag on the preterminal label, whenever those labels support it.
   */
  private Tree buildWordNode(Node root) {
    final Element element = (Element) root;

    final String tag = treeNormalizer.normalizeNonterminal(getPOS(element));
    final String lemma = element.getAttribute(ATTR_LEMMA);
    final String terminal = treeNormalizer.normalizeTerminal(getWord(element));

    final Tree leaf = treeFactory.newLeaf(terminal);
    if (leaf.label() instanceof HasWord) {
      HasWord wordLabel = (HasWord) leaf.label();
      wordLabel.setWord(terminal);
    }
    if (leaf.label() instanceof HasLemma && lemma != null) {
      HasLemma lemmaLabel = (HasLemma) leaf.label();
      lemmaLabel.setLemma(lemma);
    }

    final List<Tree> onlyChild = new ArrayList<>();
    onlyChild.add(leaf);

    final Tree preterminal = treeFactory.newTreeNode(tag, onlyChild);
    if (preterminal.label() instanceof HasTag) {
      HasTag tagLabel = (HasTag) preterminal.label();
      tagLabel.setTag(tag);
    }
    return preterminal;
  }

  /**
   * Create the tree for an elliptic element: a node named after the XML
   * element, dominating a single leaf with the empty-leaf marker as value.
   */
  private Tree buildEllipticNode(Node root) {
    final String nodeName = ((Element) root).getNodeName();

    final Tree emptyLeaf = treeFactory.newLeaf(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
    if (emptyLeaf.label() instanceof HasWord) {
      HasWord wordLabel = (HasWord) emptyLeaf.label();
      wordLabel.setWord(SpanishTreeNormalizer.EMPTY_LEAF_VALUE);
    }

    final List<Tree> onlyChild = new ArrayList<>();
    onlyChild.add(emptyLeaf);
    return treeFactory.newTreeNode(nodeName, onlyChild);
  }

  /**
   * Create the tree node for a constituent element.
   *
   * The label is the trimmed element name. With detailed annotations
   * enabled it is suffixed with {@code -coord} for coordinating
   * constituents, or otherwise with the clause type when one is given.
   * The label is normalized before the node is created.
   *
   * @param root Node describing the constituent
   * @param children Collected child nodes, already parsed
   */
  private Tree buildConstituentNode(Node root, List<Tree> children) {
    final Element element = (Element) root;
    final StringBuilder label = new StringBuilder(element.getNodeName().trim());

    if (detailedAnnotations) {
      final boolean coordinating = element.getAttribute(ATTR_COORDINATING).equals("yes");
      if (coordinating) {
        label.append("-coord");
      } else if (element.hasAttribute(ATTR_CLAUSE_TYPE)) {
        label.append('-').append(element.getAttribute(ATTR_CLAUSE_TYPE));
      }
    }

    final String normalizedLabel = treeNormalizer.normalizeNonterminal(label.toString());
    return treeFactory.newTreeNode(normalizedLabel, children);
  }

  /**
   * Determine if the given tree contains a leaf which matches the
   * part-of-speech and lexical criteria.
   *
   * @param tree Tree whose preterminals are inspected
   * @param pos Regular expression to match part of speech (may be null,
   *     in which case any POS is allowed)
   * @param word Regular expression to match word (may be null, in which
   *     case any word is allowed)
   * @return {@code true} as soon as one preterminal satisfies both criteria
   */
  public static boolean shouldPrintTree(Tree tree, Pattern pos, Pattern word) {
    for (Tree node : tree) {
      if (!node.isPreTerminal()) {
        continue;
      }

      final String tagValue = ((CoreLabel) node.label()).value();
      final String wordValue = ((CoreLabel) node.firstChild().label()).value();

      final boolean posMatches = pos == null || pos.matcher(tagValue).find();
      if (!posMatches) {
        continue;
      }

      final boolean wordMatches = word == null || word.matcher(wordValue).find();
      if (wordMatches) {
        return true;
      }
    }
    return false;
  }

  /**
   * Render a tree for output: either its regular string form, or (for plain
   * printing) the values of its leaves, each followed by a single space.
   */
  private static String toString(Tree tree, boolean plainPrint) {
    if (plainPrint) {
      final StringBuilder text = new StringBuilder();
      for (Tree leaf : tree.getLeaves()) {
        final CoreLabel leafLabel = (CoreLabel) leaf.label();
        text.append(leafLabel.value());
        text.append(" ");
      }
      return text.toString();
    }

    return tree.toString();
  }

  /**
   * Read trees from the given file and output their processed forms to
   * standard output.
   *
   * Each printed line has the form {@code <file>-<sentence id>TAB<tree>},
   * where {@code <file>} is the file name up to its last {@code '.'} (or the
   * whole name if it contains none). A summary is written to standard error.
   *
   * @param file File the trees come from; only its name is used
   * @param tr Reader supplying the trees
   * @param posPattern Part-of-speech filter, see {@link #shouldPrintTree}
   * @param wordPattern Word filter, see {@link #shouldPrintTree}
   * @param plainPrint Print the plain leaves rather than the bracketed tree
   * @throws IOException If reading a tree fails
   */
  public static void process(File file, TreeReader tr,
                             Pattern posPattern, Pattern wordPattern,
                             boolean plainPrint) throws IOException {
    Tree t;
    int numTrees = 0, numTreesRetained = 0;

    // A name without any '.' has no extension to strip; taking the substring
    // up to index -1 would throw
    final String fileName = file.getName();
    final int extensionStart = fileName.lastIndexOf('.');
    final String canonicalFileName =
      extensionStart < 0 ? fileName : fileName.substring(0, extensionStart);

    while ((t = tr.readTree()) != null) {
      numTrees++;
      if (!shouldPrintTree(t, posPattern, wordPattern))
        continue;
      numTreesRetained++;

      String ftbID = ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
      String output = toString(t, plainPrint);

      System.out.printf("%s-%s\t%s%n", canonicalFileName, ftbID, output);
    }

    System.err.printf("%s: %d trees, %d matched and printed%n", file.getName(),
                      numTrees, numTreesRetained);
  }

  /** Build the command-line help text: a usage line followed by one line per option. */
  private static String usage() {
    final String nl = System.getProperty("line.separator");
    final String[] optionLines = {
      "Options:",
      "   -help: Print this message",
      "   -ner: Add NER-specific information to trees",
      "   -detailedAnnotations: Retain detailed annotations on tree constituents (useful for making treebank for parser, etc.)",
      "   -plain: Output corpus in plaintext rather than as trees",
      "   -searchPos posRegex: Only print sentences which contain a token whose part of speech matches the given regular expression",
      "   -searchWord wordRegex: Only print sentences which contain a token which matches the given regular expression",
    };

    final StringBuilder text = new StringBuilder();
    text.append(String.format("Usage: java %s [OPTIONS] file(s)%n%n", SpanishXMLTreeReader.class.getName()));
    for (String line : optionLines) {
      text.append(line).append(nl);
    }
    return text.toString();
  }

  /**
   * Describe the recognized command-line options, mapping each option name
   * to the number of arguments it takes.
   */
  private static Map<String, Integer> argOptionDefs() {
    final String[] flags = {"help", "ner", "detailedAnnotations", "plain"};
    final String[] valuedOptions = {"searchPos", "searchWord"};

    final Map<String, Integer> defs = Generics.newHashMap();
    for (String flag : flags) {
      defs.put(flag, 0);
    }
    for (String option : valuedOptions) {
      defs.put(option, 1);
    }
    return defs;
  }

  /**
   * Command-line entry point: read each given treebank file and print its
   * (optionally filtered) trees, processing the files on a thread pool.
   * See {@link #usage()} for the supported options.
   *
   * @param args Options followed by one or more file names
   */
  public static void main(String[] args) {
    final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
    if(args.length < 1 || options.containsKey("help")) {
      log.info(usage());
      return;
    }

    final Pattern posPattern = options.containsKey("searchPos")
      ? Pattern.compile(options.getProperty("searchPos")) : null;
    final Pattern wordPattern = options.containsKey("searchWord")
      ? Pattern.compile(options.getProperty("searchWord")) : null;
    final boolean plainPrint = PropertiesUtils.getBool(options, "plain", false);
    final boolean ner = PropertiesUtils.getBool(options, "ner", false);
    final boolean detailedAnnotations = PropertiesUtils.getBool(options, "detailedAnnotations", false);

    // The file names are read from the empty key. getProperty returns null
    // when nothing is stored there (presumably when only option flags were
    // given), so show the usage instead of failing on a null
    final String fileArgs = options.getProperty("");
    if (fileArgs == null || fileArgs.trim().isEmpty()) {
      log.info(usage());
      return;
    }

    List<File> fileList = new ArrayList<>();
    for (String remainingArg : fileArgs.split(" ")) {
      // Consecutive separators produce empty tokens; skip them
      if (!remainingArg.isEmpty()) {
        fileList.add(new File(remainingArg));
      }
    }

    final SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(true, true, ner, detailedAnnotations);
    ExecutorService pool =
      Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());

    for (final File file : fileList) {
      pool.execute(new Runnable() {
          @Override
          public void run() {
            // try-with-resources / finally make sure both readers are closed
            // even when processing the file fails part-way through
            try (Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"))) {
              TreeReader tr = trf.newTreeReader(file.getPath(), in);
              try {
                process(file, tr, posPattern, wordPattern, plainPrint);
              } finally {
                tr.close();
              }
            } catch (IOException e) {
              // Also covers FileNotFoundException; report and move on so
              // that the remaining files are still processed
              e.printStackTrace();
            }
          }
        });
    }

    pool.shutdown();
    try {
      pool.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    } catch (InterruptedException e) {
      // Restore the interrupt flag before propagating
      Thread.currentThread().interrupt();
      throw new RuntimeInterruptedException(e);
    }
  }
}