package edu.stanford.nlp.trees.international.arabic; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.io.EncodingPrintWriter; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.StringUtils; import java.io.IOException; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.FileInputStream; import java.util.*; import java.util.Map.Entry; /** * This class contains tools for dealing with arabic text, in particular conversion to IBM normalized Arabic. * * The code was adapted to java from the perl script ar_normalize_v5.pl * * @author Alex Kleeman */ public class ArabicUtils { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ArabicUtils.class); public static Map<String,String> presToLogicalMap(){ Map<String,String> rules = Generics.newHashMap(); // PRESENTATION FORM TO LOGICAL FORM NORMALIZATION (presentation form is rarely used - but some UN documents have it). rules.put("\\ufc5e","\u0020\u064c\u0651"); // ligature shadda with dammatan isloated rules.put("\\ufc5f","\u0020\u064d\u0651"); // ligature shadda with kasratan isloated rules.put("\\ufc60","\u0020\u064e\u0651"); // ligature shadda with fatha isloated rules.put("\\ufc61","\u0020\u064f\u0651"); // ligature shadda with damma isloated rules.put("\\ufc62","\u0020\u0650\u0651"); // ligature shadda with kasra isloated // Arabic Presentation Form-B to Logical Form rules.put("\\ufe80","\u0621"); // isolated hamza rules.put("[\\ufe81\\ufe82]","\u0622"); // alef with madda rules.put("[\\ufe83\\ufe84]","\u0623"); // alef with hamza above rules.put("[\\ufe85\\ufe86]","\u0624"); // waw with hamza above rules.put("[\\ufe87\\ufe88]","\u0625"); // alef with hamza below rules.put("[\\ufe89\\ufe8a\\ufe8b\\ufe8c]","\u0626"); // yeh with hamza above rules.put("[\\ufe8d\\ufe8e]","\u0627"); // alef rules.put("[\\ufe8f\\ufe90\\ufe91\\ufe92]","\u0628"); // beh rules.put("[\\ufe93\\ufe94]","\u0629"); // teh marbuta rules.put("[\\ufe95\\ufe96\\ufe97\\ufe98]","\u062a"); // teh rules.put("[\\ufe99\\ufe9a\\ufe9b\\ufe9c]","\u062b"); // theh rules.put("[\\ufe9d\\ufe9e\\ufe9f\\ufea0]","\u062c"); // jeem rules.put("[\\ufea1\\ufea2\\ufea3\\ufea4]","\u062d"); // haa rules.put("[\\ufea5\\ufea6\\ufea7\\ufea8]","\u062e"); // khaa rules.put("[\\ufea9\\ufeaa]","\u062f"); // dal rules.put("[\\ufeab\\ufeac]","\u0630"); // dhal rules.put("[\\ufead\\ufeae]","\u0631"); // reh rules.put("[\\ufeaf\\ufeb0]","\u0632"); // zain rules.put("[\\ufeb1\\ufeb2\\ufeb3\\ufeb4]","\u0633"); // seen rules.put("[\\ufeb5\\ufeb6\\ufeb7\\ufeb8]","\u0634"); // sheen rules.put("[\\ufeb9\\ufeba\\ufebb\\ufebc]","\u0635"); // sad rules.put("[\\ufebd\\ufebe\\ufebf\\ufec0]","\u0636"); // dad rules.put("[\\ufec1\\ufec2\\ufec3\\ufec4]","\u0637"); // tah rules.put("[\\ufec5\\ufec6\\ufec7\\ufec8]","\u0638"); // zah rules.put("[\\ufec9\\ufeca\\ufecb\\ufecc]","\u0639"); // ain rules.put("[\\ufecd\\ufece\\ufecf\\ufed0]","\u063a"); // ghain rules.put("[\\ufed1\\ufed2\\ufed3\\ufed4]","\u0641"); // feh rules.put("[\\ufed5\\ufed6\\ufed7\\ufed8]","\u0642"); // qaf rules.put("[\\ufed9\\ufeda\\ufedb\\ufedc]","\u0643"); // kaf rules.put("[\\ufedd\\ufede\\ufedf\\ufee0]","\u0644"); // ghain rules.put("[\\ufee1\\ufee2\\ufee3\\ufee4]","\u0645"); // meem rules.put("[\\ufee5\\ufee6\\ufee7\\ufee8]","\u0646"); // noon rules.put("[\\ufee9\\ufeea\\ufeeb\\ufeec]","\u0647"); // heh rules.put("[\\ufeed\\ufeee]","\u0648"); // waw rules.put("[\\ufeef\\ufef0]","\u0649"); // alef maksura rules.put("[\\ufef1\\ufef2\\ufef3\\ufef4]","\u064a"); // yeh rules.put("[\\ufef5\\ufef6]","\u0644\u0622"); // ligature: lam and alef with madda above rules.put("[\\ufef7\\ufef8]","\u0644\u0623"); // ligature: lam and alef with hamza above rules.put("[\\ufef9\\ufefa]","\u0644\u0625"); // ligature: lam and alef with hamza below rules.put("[\\ufefb\\ufefc]","\u0644\u0627"); // ligature: lam and alef return rules; } public static Map<String,String> getArabicIBMNormalizerMap(){ Map<String,String> rules = Generics.newHashMap(); try{ rules.put("[\\u0622\\u0623\\u0625]","\u0627"); // hamza normalization: maddah-n-alef, hamza-on-alef, hamza-under-alef mapped to bare alef rules.put("[\\u0649]","\u064A"); // 'alif maqSuura mapped to yaa rules.put("[\\u064B\\u064C\\u064D\\u064E\\u064F\\u0650\\u0651\\u0652\\u0653\\u0670]",""); // fatHatayn, Dammatayn, kasratayn, fatHa, Damma, kasra, shaddah, sukuun, and dagger alef (delete) rules.put("\\u0640(?=\\s*\\S)",""); // tatweel, delete except when trailing rules.put("(\\S)\\u0640","$1"); // tatweel, delete if preceeded by non-white-space rules.put("[\\ufeff\\u00a0]"," "); // white space normalization // punctuation normalization rules.put("\\u060c",","); // Arabic comma rules.put("\\u061b",";"); // Arabic semicolon rules.put("\\u061f","?"); // Arabic question mark rules.put("\\u066a","%"); // Arabic percent sign rules.put("\\u066b","."); // Arabic decimal separator rules.put("\\u066c",","); // Arabic thousand separator (comma) rules.put("\\u066d","*"); // Arabic asterisk rules.put("\\u06d4","."); // Arabic full stop // Arabic/Arabic indic/eastern Arabic/ digits normalization rules.put("[\\u0660\\u06f0\\u0966]","0"); rules.put("[\\u0661\\u06f1\\u0967]","1"); rules.put("[\\u0662\\u06f2\\u0968]","2"); rules.put("[\\u0663\\u06f3\\u0969]","3"); rules.put("[\\u0664\\u06f4\\u096a]","4"); rules.put("[\\u0665\\u06f5\\u096b]","5"); rules.put("[\\u0666\\u06f6\\u096c]","6"); rules.put("[\\u0667\\u06f7\\u096d]","7"); rules.put("[\\u0668\\u06f8\\u096e]","8"); rules.put("[\\u0669\\u06f9\\u096f]","9"); // Arabic combining hamza above/below and dagger(superscript) alef rules.put("[\\u0654\\u0655\\u0670]",""); // replace yaa followed by hamza with hamza on kursi (yaa) rules.put("\\u064A\\u0621","\u0626"); // Normalization Rules Suggested by Ralf Brown (CMU): rules.put("\\u2013","-"); // EN-dash to ASCII hyphen rules.put("\\u2014","--"); // EM-dash to double ASII hyphen // code point 0x91 - latin-1 left single quote // code point 0x92 - latin-1 right single quote // code point 0x2018 = left single quote; convert to ASCII single quote // code point 0x2019 = right single quote; convert to ASCII single quote rules.put("[\\u0091\\u0092\\u2018\\u2019]","\'"); // code point 0x93 - latin-1 left double quote // code point 0x94 - latin-1 right double quote // code points 0x201C/201D = left/right double quote -> ASCII double quote rules.put("[\\u0093\\u0094\\u201C\\u201D]","\""); }catch(Exception e){ log.info("Caught exception creating Arabic normalizer map: " + e.toString() ); } return rules; } /** This will normalize a Unicode String by applying all the normalization rules from the IBM normalization and * conversion from Presentation to Logical from. * * * @param in The String to be normalized */ public static String normalize(String in) { Map<String,String> ruleMap = getArabicIBMNormalizerMap(); //Get the IBM Normalization rules ruleMap.putAll(presToLogicalMap()); // Get the presentation to logical form rules Set<Map.Entry<String, String>> rules = ruleMap.entrySet(); Iterator<Entry<String, String>> ruleIter = rules.iterator(); String out = in; //Iteratively apply each rule to the string. while(ruleIter.hasNext()){ Map.Entry<String,String> thisRule = ruleIter.next(); out = out.replaceAll(thisRule.getKey(),thisRule.getValue()); } return out; } public static void main(String[] args) throws IOException { Properties p = StringUtils.argsToProperties(args); if (p.containsKey("input")){ FileInputStream fis = new FileInputStream(p.getProperty("input")); InputStreamReader isr = new InputStreamReader(fis,"UTF-8"); BufferedReader reader = new BufferedReader(isr); String thisLine; while( (thisLine = reader.readLine()) != null){ EncodingPrintWriter.out.println(normalize(thisLine),"UTF-8"); } } } }