// GaleP4LexMapper.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.international.arabic.pipeline;

import java.io.File;
import java.util.*;
import java.util.regex.*;

import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;

/**
 * Applies a default set of lexical transformations that have been empirically validated
 * in various Arabic tasks. This class automatically detects the input encoding and applies
 * the appropriate set of transformations.
 *
 * @author Spence Green
 *
 */
public class GaleP4LexMapper implements Mapper {

  /** Any character in U+0600..U+06FF; its presence selects the UTF-8 rule set in {@link #map}. */
  private static final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]");

  //Buckwalter patterns
  private static final String bwAlefChar = "A"; //U+0627
  private static final Pattern bwDiacritics = Pattern.compile("F|N|K|a|u|i|\\~|o");
  private static final Pattern bwTatweel = Pattern.compile("_");
  private static final Pattern bwAlef = Pattern.compile("\\{");
  private static final Pattern bwQuran = Pattern.compile("`");

  //TODO Extend coverage to entire Arabic code chart
  //Obviously Buckwalter is a lossy conversion, but no assumptions should be made about
  //UTF-8 input from "the wild"
  private static final String utf8AlefChar = "ا"; //U+0627
  private static final Pattern utf8Diacritics = Pattern.compile("َ|ً|ُ|ٌ|ِ|ٍ|ّ|ْ");
  private static final Pattern utf8Tatweel = Pattern.compile("ـ");
  private static final Pattern utf8Alef = Pattern.compile("\u0671");
  private static final Pattern utf8Quran = Pattern.compile("[\u0615-\u061A]|[\u06D6-\u06E5]");

  //Patterns to fix segmentation issues observed in the ATB
  private static final Pattern cliticMarker = Pattern.compile("^-|-$");

  private static final Pattern hasNum = Pattern.compile("\\d+");

  /**
   * Parent tags whose children are never rewritten by {@link #map} and are reported as
   * non-convertible by {@link #canChangeEncoding}.
   */
  private final Set<String> parentTagsToEscape;

  /**
   * Creates a mapper that leaves tokens under {@code PUNC}, {@code LATIN} and
   * {@code -NONE-} untouched.
   */
  public GaleP4LexMapper() {
    parentTagsToEscape = Generics.newHashSet();
    parentTagsToEscape.add("PUNC");
    parentTagsToEscape.add("LATIN");
    parentTagsToEscape.add("-NONE-");
  }

  /**
   * Runs the normalization pipeline shared by both encodings. The steps are applied in a
   * fixed order: strip diacritics, strip tatweel, normalize alef, strip Qur'an-only
   * characters, strip a leading/trailing clitic marker.
   *
   * <p>The tatweel and clitic-marker steps are skipped when the working string is a single
   * character, so a lone tatweel or hyphen survives. The diacritic and Qur'an steps are
   * unconditional, so a token made up solely of those characters maps to the empty string.
   *
   * @param element the token to normalize
   * @param diacritics pattern matching diacritics to delete
   * @param tatweel pattern matching the elongation character to delete
   * @param alef pattern matching alef variants to normalize
   * @param alefReplacement replacement text for each {@code alef} match
   * @param quran pattern matching Qur'an-only characters to delete
   * @return the normalized token
   */
  private static String normalize(String element, Pattern diacritics, Pattern tatweel,
                                  Pattern alef, String alefReplacement, Pattern quran) {
    //Remove diacritics
    String result = diacritics.matcher(element).replaceAll("");

    //Remove tatweel
    if (result.length() > 1) {
      result = tatweel.matcher(result).replaceAll("");
    }

    //Normalize alef
    result = alef.matcher(result).replaceAll(alefReplacement);

    //Remove characters that only appear in the Qur'an
    result = quran.matcher(result).replaceAll("");

    //Remove clitic markers left at either edge of the token
    if (result.length() > 1) {
      result = cliticMarker.matcher(result).replaceAll("");
    }

    return result;
  }

  /**
   * Normalizes a token using the UTF-8 patterns.
   *
   * @param element the token to normalize
   * @return the normalized token
   */
  private String mapUtf8(String element) {
    return normalize(element, utf8Diacritics, utf8Tatweel, utf8Alef, utf8AlefChar, utf8Quran);
  }

  /**
   * Normalizes a token using the Buckwalter patterns.
   *
   * @param element the token to normalize
   * @return the normalized token
   */
  private String mapBuckwalter(String element) {
    return normalize(element, bwDiacritics, bwTatweel, bwAlef, bwAlefChar, bwQuran);
  }

  /**
   * Normalizes a single token. The token is trimmed first; if its parent tag is in the
   * escape set the trimmed token is returned unchanged. Otherwise the UTF-8 rules are
   * applied when the token contains any character in U+0600..U+06FF, and the Buckwalter
   * rules are applied when it does not.
   *
   * @param parent the tag of the token's parent; may be {@code null}, in which case the
   *        token is never escaped
   * @param element the token text; must not be {@code null}
   * @return the trimmed and, unless escaped, normalized token
   */
  public String map(String parent, String element) {
    String elem = element.trim();

    //Trim the tag before the lookup so this agrees with canChangeEncoding()
    if (parent != null && parentTagsToEscape.contains(parent.trim())) {
      return elem;
    }

    return utf8ArabicChart.matcher(elem).find() ? mapUtf8(elem) : mapBuckwalter(elem);
  }

  /**
   * Does nothing; this mapper takes no configuration.
   *
   * @param path ignored
   * @param options ignored
   */
  public void setup(File path, String... options) {}

  /**
   * Reports whether the encoding of this word can be converted to another encoding
   * from its current encoding (Buckwalter or UTF-8).
   *
   * @param parent the tag of the token's parent; must not be {@code null}
   * @param element the token text; must not be {@code null}
   * @return {@code true} for the numeric-comma special case; otherwise {@code false} if the
   *         token contains a digit or its parent tag is in the escape set, and {@code true}
   *         in every other case
   */
  public boolean canChangeEncoding(String parent, String element) {
    parent = parent.trim();
    element = element.trim();

    //Hack for LDC2008E22 idiosyncrasy
    //This is NUMERIC_COMMA in the raw trees. We allow conversion of this
    //token to UTF-8 since it would appear in this encoding in arbitrary
    //UTF-8 text input
    if (parent.contains("NUMERIC_COMMA") || (parent.contains("PUNC") && element.equals("r"))) {
      return true;
    }

    boolean containsDigit = hasNum.matcher(element).find();
    return !(containsDigit || parentTagsToEscape.contains(parent));
  }

}