package edu.stanford.nlp.international.arabic.pipeline; import java.io.File; import java.util.*; import java.util.regex.*; import edu.stanford.nlp.trees.treebank.Mapper; import edu.stanford.nlp.util.Generics; /** * Applies a default set of lexical transformations that have been empirically validated * in various Arabic tasks. This class automatically detects the input encoding and applies * the appropriate set of transformations. * * @author Spence Green * */ public class GaleP4LexMapper implements Mapper { private static final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]"); //Buckwalter patterns private static final String bwAlefChar = "A"; //U+0627 private static final Pattern bwDiacritics = Pattern.compile("F|N|K|a|u|i|\\~|o"); private static final Pattern bwTatweel = Pattern.compile("_"); private static final Pattern bwAlef = Pattern.compile("\\{"); private static final Pattern bwQuran = Pattern.compile("`"); //TODO Extend coverage to entire Arabic code chart //Obviously Buckwalter is a lossful conversion, but no assumptions should be made about //UTF-8 input from "the wild" private static final Pattern utf8Diacritics = Pattern.compile("َ|ً|ُ|ٌ|ِ|ٍ|ّ|ْ"); private static final Pattern utf8Tatweel = Pattern.compile("ـ"); private static final Pattern utf8Alef = Pattern.compile("\u0671"); private static final Pattern utf8Quran = Pattern.compile("[\u0615-\u061A]|[\u06D6-\u06E5]"); //Patterns to fix segmentation issues observed in the ATB private static final Pattern cliticMarker = Pattern.compile("^-|-$"); private static final Pattern hasNum = Pattern.compile("\\d+"); private final Set<String> parentTagsToEscape; public GaleP4LexMapper() { //Tags for the canChangeEncoding() method parentTagsToEscape = Generics.newHashSet(); parentTagsToEscape.add("PUNC"); parentTagsToEscape.add("LATIN"); parentTagsToEscape.add("-NONE-"); } private String mapUtf8(String element) { //Remove diacritics Matcher rmDiacritics = utf8Diacritics.matcher(element); element = rmDiacritics.replaceAll(""); if(element.length() > 1) { Matcher rmTatweel = utf8Tatweel.matcher(element); element = rmTatweel.replaceAll(""); } //Normalize alef Matcher normAlef = utf8Alef.matcher(element); element = normAlef.replaceAll("ا"); //Remove characters that only appear in the Qur'an Matcher rmQuran = utf8Quran.matcher(element); element = rmQuran.replaceAll(""); if(element.length() > 1) { Matcher rmCliticMarker = cliticMarker.matcher(element); element = rmCliticMarker.replaceAll(""); } return element; } private String mapBuckwalter(String element) { //Remove diacritics Matcher rmDiacritics = bwDiacritics.matcher(element); element = rmDiacritics.replaceAll(""); //Remove tatweel if(element.length() > 1) { Matcher rmTatweel = bwTatweel.matcher(element); element = rmTatweel.replaceAll(""); } //Normalize alef Matcher normAlef = bwAlef.matcher(element); element = normAlef.replaceAll(bwAlefChar); //Remove characters that only appear in the Qur'an Matcher rmQuran = bwQuran.matcher(element); element = rmQuran.replaceAll(""); if(element.length() > 1) { Matcher rmCliticMarker = cliticMarker.matcher(element); element = rmCliticMarker.replaceAll(""); } return element; } public String map(String parent, String element) { String elem = element.trim(); if(parentTagsToEscape.contains(parent)) return elem; Matcher utf8Encoding = utf8ArabicChart.matcher(elem); return (utf8Encoding.find()) ? mapUtf8(elem) : mapBuckwalter(elem); } public void setup(File path, String... options) {} //Whether or not the encoding of this word can be converted to another encoding //from its current encoding (Buckwalter or UTF-8) public boolean canChangeEncoding(String parent, String element) { parent = parent.trim(); element = element.trim(); //Hack for LDC2008E22 idiosyncrasy //This is NUMERIC_COMMA in the raw trees. We allow conversion of this //token to UTF-8 since it would appear in this encoding in arbitrary //UTF-8 text input if(parent.contains("NUMERIC_COMMA") || (parent.contains("PUNC") && element.equals("r"))) //Numeric comma return true; Matcher numMatcher = hasNum.matcher(element); if(numMatcher.find() || parentTagsToEscape.contains(parent)) return false; return true; } }