package edu.stanford.nlp.international.arabic.pipeline;
import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.international.arabic.Buckwalter;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.trees.international.arabic.ATBTreeUtils;
import edu.stanford.nlp.util.Generics;
/**
* Applies a default set of lexical transformations that have been empirically validated
* in various Arabic tasks. This class automatically detects the input encoding and applies
* the appropriate set of transformations.
*
* @author Spence Green
*
*/
public class DefaultLexicalMapper implements Mapper, Serializable {

  private static final long serialVersionUID = -3798804368296999785L;

  // Matches any character in the U+0600..U+06FF range. A single hit makes map()
  // treat the token as UTF-8 Arabic; otherwise the Buckwalter rules are applied.
  private final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]");

  //Buckwalter patterns
  private final String bwAlefChar = "A"; //U+0627
  private final Pattern bwDiacritics = Pattern.compile("F|N|K|a|u|i|\\~|o");
  private final Pattern bwTatweel = Pattern.compile("_");
  private final Pattern bwAlef = Pattern.compile("\\{|\\||>|<");
  private final Pattern bwQuran = Pattern.compile("`");
  private final Pattern bwNullAnaphoraMarker = Pattern.compile("\\[nll\\]");

  // Tokens made up entirely of these characters are returned unchanged by the mappers.
  public final Pattern latinPunc = Pattern.compile("([\u0021-\u002F\u003A-\u0040\\u005B-\u0060\u007B-\u007E\u00A1-\u00BF\u00F7\u2010-\u2027\u2030-\u205E\u20A0-\u20BA])+");
  public final Pattern arabicPunc = Pattern.compile("([\u00AB\u00BB\u0609-\u060D\u061B-\u061F\u066A\u066C-\u066D\u06D4])+");

  public final Pattern arabicDigit = Pattern.compile("([\u06F0-\u06F9\u0660-\u0669])+");

  //TODO Extend coverage to entire Arabic code chart
  //Obviously Buckwalter is a lossy conversion, but no assumptions should be made about
  //UTF-8 input from "the wild"
  private final Pattern utf8Diacritics = Pattern.compile("َ|ً|ُ|ٌ|ِ|ٍ|ّ|ْ|\u0670");
  private final Pattern utf8Tatweel = Pattern.compile("ـ");
  private final Pattern utf8Alef = Pattern.compile("ا|إ|أ|آ|\u0671");
  private final Pattern utf8Quran = Pattern.compile("[\u0615-\u061A\u06D6-\u06E5]");
  private final Pattern utf8ProDrop = Pattern.compile("\\[نلل\\]");

  //Patterns to fix segmentation issues observed in the ATB
  // Runs of '-' anchored at the very start or the very end of a token.
  public final Pattern segmentationMarker = Pattern.compile("^-+|-+$");
  private final Pattern morphemeBoundary = Pattern.compile("\\+");
  // Note: \d matches ASCII digits only under the default Pattern flags.
  private final Pattern hasDigit = Pattern.compile("\\d+");

  // Process the vocalized section for parsing
  private boolean useATBVocalizedSectionMapping = false;

  // Strip morpheme boundary markers in the vocalized section
  private boolean stripMorphemeMarkersInUTF8 = false;

  // Strip all morpheme and segmentation markers in UTF-8 Arabic
  private boolean stripSegmentationMarkersInUTF8 = false;

  //wsg: "LATIN" does not appear in the Bies tagset, so be sure to pass
  //in the extended POS tags during normalization
  private final String parentTagString = "PUNC LATIN -NONE-";
  private final Set<String> parentTagsToEscape;

  private final String utf8CliticString = "ل ف و ما ه ها هم هن نا كم تن تم ى ي هما ك ب م";
  // private final Set<String> utf8Clitics;
  private final Set<String> bwClitics;

  /**
   * Builds the set of parent tags whose children are never normalized and the set of
   * clitics, the latter obtained by running the UTF-8 clitic list through
   * {@link Buckwalter} and splitting the result on whitespace.
   */
  public DefaultLexicalMapper() {
    parentTagsToEscape =
        Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(parentTagString.split("\\s+"))));

    // utf8Clitics =
    // Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(utf8CliticString.split("\\s+"))));

    Buckwalter bw = new Buckwalter(true);
    String bwString = bw.apply(utf8CliticString);
    bwClitics =
        Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(bwString.split("\\s+"))));
  }

  /**
   * Normalizes a token that contains UTF-8 Arabic characters.
   *
   * @param element the trimmed token
   * @return the token unchanged if it consists solely of Latin or solely of Arabic
   *         punctuation; otherwise the token after the transformations below
   */
  private String mapUtf8(String element) {
    if (latinPunc.matcher(element).matches() || arabicPunc.matcher(element).matches()) {
      return element;
    }

    //Remove diacritics
    element = utf8Diacritics.matcher(element).replaceAll("");

    //Remove tatweel, unless the tatweel is all that is left of the token
    if (element.length() > 1) {
      element = utf8Tatweel.matcher(element).replaceAll("");
    }

    //Normalize alef
    element = utf8Alef.matcher(element).replaceAll("ا");

    //Remove characters that only appear in the Qur'an
    element = utf8Quran.matcher(element).replaceAll("");

    //Remove the pro-drop marker
    element = utf8ProDrop.matcher(element).replaceAll("");

    // For both optional steps the stripped form is kept only when it is non-empty,
    // so a token consisting of nothing but markers survives as-is.
    if (stripMorphemeMarkersInUTF8) {
      String withoutBoundaries = morphemeBoundary.matcher(element).replaceAll("");
      if (!withoutBoundaries.isEmpty()) {
        element = withoutBoundaries;
      }
    }

    if (stripSegmentationMarkersInUTF8) {
      String withoutSegMarkers = segmentationMarker.matcher(element).replaceAll("");
      if (!withoutSegMarkers.isEmpty()) {
        element = withoutSegMarkers;
      }
    }

    return element;
  }

  /**
   * Normalizes a token that contains no character matched by the UTF-8 Arabic
   * detection pattern; such tokens are handled with the Buckwalter patterns.
   *
   * @param element the trimmed token
   * @return the token unchanged if it consists solely of Latin punctuation; otherwise
   *         the token after the transformations below
   */
  private String mapBuckwalter(String element) {
    if (latinPunc.matcher(element).matches()) {
      return element;
    }

    //Remove diacritics
    element = bwDiacritics.matcher(element).replaceAll("");

    //Remove tatweel
    if (element.length() > 1) {
      element = bwTatweel.matcher(element).replaceAll("");
    }

    //Normalize alef
    element = bwAlef.matcher(element).replaceAll(bwAlefChar);

    //Remove characters that only appear in the Qur'an
    element = bwQuran.matcher(element).replaceAll("");

    //Remove the null anaphora marker
    element = bwNullAnaphoraMarker.matcher(element).replaceAll("");

    // This conditional is used for normalizing raw ATB trees
    // Morpheme boundaries are removed, and segmentation markers are retained on
    // segmented morphemes (not the tokens to which the morphemes were attached)
    if (useATBVocalizedSectionMapping && element.length() > 1) {
      element = morphemeBoundary.matcher(element).replaceAll("");

      //wsg: This is hairy due to tokens like this in the vocalized section:
      // layos-+-a
      Matcher cliticMarker = segmentationMarker.matcher(element);
      if (cliticMarker.find() && !hasDigit.matcher(element).find()) {
        String strippedElem = cliticMarker.replaceAll("");
        if (!strippedElem.isEmpty()) {
          // A known clitic keeps its marker; anything else loses it.
          element = bwClitics.contains(strippedElem) ? element : strippedElem;
        }
      }

    } else if (element.length() > 1 && !ATBTreeUtils.reservedWords.contains(element)) {
      element = segmentationMarker.matcher(element).replaceAll("");
    }

    return element;
  }

  /**
   * Applies the lexical normalization appropriate for the detected encoding.
   *
   * @param parent the tag dominating {@code element}; may be {@code null}. When it is one
   *               of {@code PUNC}, {@code LATIN} or {@code -NONE-} the element is only trimmed.
   * @param element the token to normalize; must not be {@code null}
   * @return the trimmed and, unless escaped by its parent tag, normalized token
   */
  public String map(String parent, String element) {
    String elem = element.trim();

    if (parent != null && parentTagsToEscape.contains(parent)) {
      return elem;
    }

    boolean looksLikeUtf8Arabic = utf8ArabicChart.matcher(elem).find();
    return looksLikeUtf8Arabic ? mapUtf8(elem) : mapBuckwalter(elem);
  }

  /**
   * Enables optional behavior. Recognized options are {@code ATBVocalizedSection},
   * {@code StripSegMarkersInUTF8} and {@code StripMorphMarkersInUTF8}; anything else,
   * including {@code null} entries, is ignored. Options can only be switched on.
   *
   * @param path not used by this implementation
   * @param options option names; may be {@code null}
   */
  public void setup(File path, String... options) {
    if (options == null) return;

    for (final String opt : options) {
      // A switch on a null String throws NullPointerException, so skip such entries.
      if (opt == null) continue;

      switch (opt) {
        case "ATBVocalizedSection":
          useATBVocalizedSectionMapping = true;
          break;
        case "StripSegMarkersInUTF8":
          stripSegmentationMarkersInUTF8 = true;
          break;
        case "StripMorphMarkersInUTF8":
          stripMorphemeMarkersInUTF8 = true;
          break;
        default:
          // Unrecognized options are silently ignored.
          break;
      }
    }
  }

  /**
   * Whether or not the encoding of this word can be converted to another encoding
   * from its current encoding (Buckwalter or UTF-8).
   *
   * @param parent the tag dominating {@code element}; a {@code null} parent is treated
   *               as an empty tag, consistent with {@link #map(String, String)}
   *               accepting a {@code null} parent
   * @param element the token; must not be {@code null}
   * @return {@code true} for the numeric-comma special case; otherwise {@code false} if
   *         the token contains a digit or the parent is one of {@code PUNC},
   *         {@code LATIN} or {@code -NONE-}, and {@code true} in all remaining cases
   */
  public boolean canChangeEncoding(String parent, String element) {
    parent = (parent == null) ? "" : parent.trim();
    element = element.trim();

    //Hack for LDC2008E22 idiosyncrasy
    //This is NUMERIC_COMMA in the raw trees. We allow conversion of this
    //token to UTF-8 since it would appear in this encoding in arbitrary
    //UTF-8 text input
    if (parent.contains("NUMERIC_COMMA") || (parent.contains("PUNC") && element.equals("r"))) { //Numeric comma
      return true;
    }

    boolean containsDigit = hasDigit.matcher(element).find();
    return !(containsDigit || parentTagsToEscape.contains(parent));
  }

  /**
   * Smoke test: maps one hard-coded Buckwalter token and prints the result.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    Mapper m = new DefaultLexicalMapper();

    System.out.printf("< :-> %s\n",m.map(null, "FNKqq"));
  }
}