// DefaultLexicalMapper.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.international.arabic.pipeline;

import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.international.arabic.Buckwalter;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.util.Generics;

/**
 * Applies a default set of lexical transformations that have been empirically validated
 * in various Arabic tasks. This class automatically detects the input encoding and applies
 * the appropriate set of transformations: a token containing at least one character from
 * the Arabic code chart (U+0600 to U+06FF) is normalized as UTF-8 Arabic, every other token
 * is normalized as Buckwalter-transliterated text.
 *
 * <p>Optional behavior is switched on through {@link #setup(File, String...)}.
 *
 * @author Spence Green
 *
 */
public class DefaultLexicalMapper implements Mapper, Serializable {

  private static final long serialVersionUID = -3798804368296999785L;

  // Any character in the basic Arabic block; used by map() to detect UTF-8 Arabic input
  private final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]");

  //Buckwalter patterns
  private final String bwAlefChar = "A"; //U+0627
  private final Pattern bwDiacritics = Pattern.compile("F|N|K|a|u|i|\\~|o");
  private final Pattern bwTatweel = Pattern.compile("_");
  private final Pattern bwAlef = Pattern.compile("\\{|\\||>|<");
  private final Pattern bwQuran = Pattern.compile("`");
  private final Pattern bwNullAnaphoraMarker = Pattern.compile("\\[nll\\]");

  // Tokens consisting solely of these characters are returned unchanged by the mappers.
  // NOTE: the doubled backslash before the U+005B escape is deliberate. It keeps the escape
  // in the regex (rather than letting the compiler turn it into a raw '[' inside the class).
  public final Pattern latinPunc = Pattern.compile("([\u0021-\u002F\u003A-\u0040\\u005B-\u0060\u007B-\u007E\u00A1-\u00BF\u00F7\u2010-\u2027\u2030-\u205E\u20A0-\u20BA])+");
  public final Pattern arabicPunc = Pattern.compile("([\u00AB\u00BB\u0609-\u060D\u061B-\u061F\u066A\u066C-\u066D\u06D4])+");

  public final Pattern arabicDigit = Pattern.compile("([\u06F0-\u06F9\u0660-\u0669])+");

  //TODO Extend coverage to entire Arabic code chart
  //Obviously Buckwalter is a lossy conversion, but no assumptions should be made about
  //UTF-8 input from "the wild"
  private final Pattern utf8Diacritics = Pattern.compile("َ|ً|ُ|ٌ|ِ|ٍ|ّ|ْ|\u0670");
  private final Pattern utf8Tatweel = Pattern.compile("ـ");
  private final Pattern utf8Alef = Pattern.compile("ا|إ|أ|آ|\u0671");
  private final Pattern utf8Quran = Pattern.compile("[\u0615-\u061A\u06D6-\u06E5]");
  private final Pattern utf8ProDrop = Pattern.compile("\\[نلل\\]");

  //Patterns to fix segmentation issues observed in the ATB
  // Matches a run of dashes at the very start or the very end of a token
  public final Pattern segmentationMarker = Pattern.compile("^-+|-+$");
  private final Pattern morphemeBoundary = Pattern.compile("\\+");

  private final Pattern hasDigit = Pattern.compile("\\d+");

  // Process the vocalized section for parsing
  private boolean useATBVocalizedSectionMapping = false;

  // Strip morpheme boundary markers in the vocalized section
  private boolean stripMorphemeMarkersInUTF8 = false;

  // Strip all morpheme and segmentation markers in UTF-8 Arabic
  private boolean stripSegmentationMarkersInUTF8 = false;

  //wsg: "LATIN" does not appear in the Bies tagset, so be sure to pass
  //in the extended POS tags during normalization
  private final String parentTagString = "PUNC LATIN -NONE-";
  private final Set<String> parentTagsToEscape;

  // Whitespace-separated UTF-8 clitics; transliterated once in the constructor
  private final String utf8CliticString = "ل ف و ما ه ها هم هن نا كم تن تم ى ي هما ك ب م";
  private final Set<String> bwClitics;

  /**
   * Builds the immutable lookup sets: the parent tags whose tokens are never normalized,
   * and the clitic inventory converted through {@link Buckwalter}.
   */
  public DefaultLexicalMapper() {
    parentTagsToEscape =
      Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(parentTagString.split("\\s+"))));

    // NOTE(review): the 'true' flag presumably selects the Unicode-to-Buckwalter
    // direction -- confirm against the Buckwalter constructor.
    Buckwalter bw = new Buckwalter(true);
    String bwString = bw.apply(utf8CliticString);
    bwClitics =
      Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(bwString.split("\\s+"))));
  }

  /**
   * Normalizes a token that contains UTF-8 Arabic characters.
   *
   * <p>Tokens made up entirely of Latin or Arabic punctuation are returned untouched.
   * Otherwise, in order: diacritics are removed, tatweel is removed (only while the token
   * is longer than one character, so a lone tatweel survives), alef variants are collapsed
   * to bare alef, Qur'an-only marks and the pro-drop marker are removed, and finally the
   * optional morpheme / segmentation marker stripping is applied.
   *
   * @param element the trimmed token
   * @return the normalized token
   */
  private String mapUtf8(String element) {
    Matcher latinPuncOnly = latinPunc.matcher(element);
    Matcher arbPuncOnly = arabicPunc.matcher(element);
    if (latinPuncOnly.matches() || arbPuncOnly.matches()) {
      return element;
    }

    //Remove diacritics
    Matcher rmDiacritics = utf8Diacritics.matcher(element);
    element = rmDiacritics.replaceAll("");

    //Remove tatweel
    if (element.length() > 1) {
      Matcher rmTatweel = utf8Tatweel.matcher(element);
      element = rmTatweel.replaceAll("");
    }

    //Normalize alef
    Matcher normAlef = utf8Alef.matcher(element);
    element = normAlef.replaceAll("ا");

    //Remove characters that only appear in the Qur'an
    Matcher rmQuran = utf8Quran.matcher(element);
    element = rmQuran.replaceAll("");

    Matcher rmProDrop = utf8ProDrop.matcher(element);
    element = rmProDrop.replaceAll("");

    // Both optional strippers refuse to reduce a token to the empty string
    if (stripMorphemeMarkersInUTF8) {
      Matcher rmMorphemeBoundary = morphemeBoundary.matcher(element);
      String strippedElem = rmMorphemeBoundary.replaceAll("");
      if (strippedElem.length() > 0) {
        element = strippedElem;
      }
    }
    if (stripSegmentationMarkersInUTF8) {
      String strippedElem = segmentationMarker.matcher(element).replaceAll("");
      if (strippedElem.length() > 0) {
        element = strippedElem;
      }
    }

    return element;
  }

  /**
   * Normalizes a token that contains no UTF-8 Arabic characters, treating it as
   * Buckwalter-transliterated text.
   *
   * <p>Tokens made up entirely of Latin punctuation are returned untouched. Otherwise
   * diacritics, tatweel (only while the token is longer than one character), Qur'an-only
   * marks and the null-anaphora marker are removed and alef variants are collapsed to
   * {@code A}. Marker handling then depends on whether the ATB vocalized-section mode
   * was enabled through {@link #setup(File, String...)}.
   *
   * @param element the trimmed token
   * @return the normalized token
   */
  private String mapBuckwalter(String element) {
    Matcher puncOnly = latinPunc.matcher(element);
    if (puncOnly.matches()) {
      return element;
    }

    //Remove diacritics
    Matcher rmDiacritics = bwDiacritics.matcher(element);
    element = rmDiacritics.replaceAll("");

    //Remove tatweel
    if (element.length() > 1) {
      Matcher rmTatweel = bwTatweel.matcher(element);
      element = rmTatweel.replaceAll("");
    }

    //Normalize alef
    Matcher normAlef = bwAlef.matcher(element);
    element = normAlef.replaceAll(bwAlefChar);

    //Remove characters that only appear in the Qur'an
    Matcher rmQuran = bwQuran.matcher(element);
    element = rmQuran.replaceAll("");

    Matcher rmProDrop = bwNullAnaphoraMarker.matcher(element);
    element = rmProDrop.replaceAll("");

    // This conditional is used for normalizing raw ATB trees
    // Morpheme boundaries are removed, and segmentation markers are retained on
    // segmented morphemes (not the tokens to which the morphemes were attached)
    if (useATBVocalizedSectionMapping && element.length() > 1) {
      Matcher rmMorphemeBoundary = morphemeBoundary.matcher(element);
      element = rmMorphemeBoundary.replaceAll("");

      //wsg: This is hairy due to tokens like this in the vocalized section:
      //        layos-+-a
      Matcher cliticMarker = segmentationMarker.matcher(element);
      if (cliticMarker.find() && !hasDigit.matcher(element).find()) {
        // replaceAll() resets the matcher, so the earlier find() does not skip a match
        String strippedElem = cliticMarker.replaceAll("");
        if (strippedElem.length() > 0) {
          // Known clitics keep their dashes; everything else loses them
          element = bwClitics.contains(strippedElem) ? element : strippedElem;
        }
      }

    } else if (element.length() > 1 && !ATBTreeUtils.reservedWords.contains(element)) {
      // Default mode: drop leading/trailing dashes unless the token is a reserved word
      Matcher rmCliticMarker = segmentationMarker.matcher(element);
      element = rmCliticMarker.replaceAll("");
    }

    return element;
  }

  /**
   * Normalizes a single token.
   *
   * <p>The token is trimmed first. If {@code parent} is one of the escaped tags
   * ({@code PUNC}, {@code LATIN}, {@code -NONE-}) the trimmed token is returned as is.
   * Otherwise the token is routed to the UTF-8 or the Buckwalter normalizer depending on
   * whether it contains a character in U+0600 to U+06FF.
   *
   * @param parent the tag of the token's parent node; may be {@code null}
   * @param element the token to normalize
   * @return the normalized token
   * @throws NullPointerException if {@code element} is {@code null}
   */
  public String map(String parent, String element) {
    String elem = element.trim();

    if (parent != null && parentTagsToEscape.contains(parent)) {
      return elem;
    }

    Matcher utf8Encoding = utf8ArabicChart.matcher(elem);
    return (utf8Encoding.find()) ? mapUtf8(elem) : mapBuckwalter(elem);
  }

  /**
   * Enables optional behavior. Recognized options are {@code "ATBVocalizedSection"},
   * {@code "StripSegMarkersInUTF8"} and {@code "StripMorphMarkersInUTF8"}; anything else,
   * including {@code null} entries, is ignored. Options can only be switched on.
   *
   * @param path not used by this mapper
   * @param options option names; may be {@code null}
   */
  public void setup(File path, String... options) {
    if (options == null) {
      return;
    }

    for (final String opt : options) {
      // A switch on a null String throws NullPointerException, so skip null entries
      if (opt == null) {
        continue;
      }
      switch (opt) {
        case "ATBVocalizedSection":
          useATBVocalizedSectionMapping = true;
          break;
        case "StripSegMarkersInUTF8":
          stripSegmentationMarkersInUTF8 = true;
          break;
        case "StripMorphMarkersInUTF8":
          stripMorphemeMarkersInUTF8 = true;
          break;
        default:
          // Unrecognized options are silently ignored
          break;
      }
    }
  }

  /**
   * Whether or not the encoding of this word can be converted to another encoding
   * from its current encoding (Buckwalter or UTF-8).
   *
   * <p>Returns {@code false} when the token contains an ASCII digit or when the parent
   * tag is one of the escaped tags ({@code PUNC}, {@code LATIN}, {@code -NONE-}), except
   * for the numeric-comma special case described in the body. A {@code null} parent is
   * treated as an empty tag, consistent with {@link #map(String, String)}.
   *
   * @param parent the tag of the token's parent node; may be {@code null}
   * @param element the token
   * @return {@code true} if the token may be converted to the other encoding
   * @throws NullPointerException if {@code element} is {@code null}
   */
  public boolean canChangeEncoding(String parent, String element) {
    final String parentTag = (parent == null) ? "" : parent.trim();
    final String elem = element.trim();

    //Hack for LDC2008E22 idiosyncrasy
    //This is NUMERIC_COMMA in the raw trees. We allow conversion of this
    //token to UTF-8 since it would appear in this encoding in arbitrary
    //UTF-8 text input
    if (parentTag.contains("NUMERIC_COMMA") || (parentTag.contains("PUNC") && elem.equals("r"))) { //Numeric comma
      return true;
    }

    Matcher numMatcher = hasDigit.matcher(elem);
    return !(numMatcher.find() || parentTagsToEscape.contains(parentTag));
  }

  /**
   * Small smoke test: maps one hard-coded token with no parent tag and prints the result.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    Mapper m = new DefaultLexicalMapper();

    System.out.printf("< :-> %s\n",m.map(null, "FNKqq"));
  }
}