ArabicUtils.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.trees.international.arabic; 
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Pattern;


/**
 * This class contains tools for dealing with Arabic text, in particular conversion to IBM normalized Arabic.
 *
 * The code was adapted to Java from the Perl script ar_normalize_v5.pl.
 *
 * @author Alex Kleeman
 */
public class ArabicUtils  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicUtils.class);

  /**
   * Every normalization rule, compiled once, in the order {@link #normalize(String)} applies them:
   * first the presentation-to-logical rules, then the IBM normalization rules, each group in the
   * order in which its rules are declared. The order matters because later rules consume the
   * output of earlier ones (for example a presentation-form alef with madda is first rewritten to
   * U+0622, which the hamza normalization then folds into a bare alef).
   */
  private static final List<CompiledRule> NORMALIZATION_RULES = compileRules();


  /** A regular expression compiled ahead of time, paired with its replacement string. */
  private static final class CompiledRule {

    private final Pattern pattern;
    private final String replacement;

    private CompiledRule(String regex, String replacement) {
      this.pattern = Pattern.compile(regex);
      this.replacement = replacement;
    }

    /** Replaces every match of this rule's pattern in {@code text}. */
    private String applyTo(String text) {
      return pattern.matcher(text).replaceAll(replacement);
    }
  }


  /** Compiles both rule sets into one list: presentation forms first, IBM rules second. */
  private static List<CompiledRule> compileRules() {
    List<CompiledRule> compiled = new ArrayList<>();
    appendCompiled(presToLogicalMap(), compiled);
    appendCompiled(getArabicIBMNormalizerMap(), compiled);
    return Collections.unmodifiableList(compiled);
  }


  /** Compiles each regex/replacement pair of {@code rules}, in iteration order, onto {@code target}. */
  private static void appendCompiled(Map<String,String> rules, List<CompiledRule> target) {
    for (Entry<String,String> rule : rules.entrySet()) {
      target.add(new CompiledRule(rule.getKey(), rule.getValue()));
    }
  }


  /**
   * Builds the rules that rewrite Arabic presentation-form characters as logical-form characters.
   * Presentation form is rarely used, but some UN documents have it.
   *
   * @return a new, mutable map from regular expression to replacement string; it iterates in the
   *         order in which the rules are declared below
   */
  public static Map<String,String> presToLogicalMap(){
    // LinkedHashMap so that callers iterating the rules see them in declaration order.
    Map<String,String> rules = new LinkedHashMap<>();

    // PRESENTATION FORM TO LOGICAL FORM NORMALIZATION
    rules.put("\\ufc5e","\u0020\u064c\u0651"); // ligature shadda with dammatan isolated
    rules.put("\\ufc5f","\u0020\u064d\u0651"); // ligature shadda with kasratan isolated
    rules.put("\\ufc60","\u0020\u064e\u0651"); // ligature shadda with fatha isolated
    rules.put("\\ufc61","\u0020\u064f\u0651"); // ligature shadda with damma isolated
    rules.put("\\ufc62","\u0020\u0650\u0651"); // ligature shadda with kasra isolated
    // Arabic Presentation Form-B to Logical Form
    rules.put("\\ufe80","\u0621"); // isolated hamza
    rules.put("[\\ufe81\\ufe82]","\u0622"); // alef with madda
    rules.put("[\\ufe83\\ufe84]","\u0623"); // alef with hamza above
    rules.put("[\\ufe85\\ufe86]","\u0624"); // waw with hamza above
    rules.put("[\\ufe87\\ufe88]","\u0625"); // alef with hamza below
    rules.put("[\\ufe89\\ufe8a\\ufe8b\\ufe8c]","\u0626"); // yeh with hamza above
    rules.put("[\\ufe8d\\ufe8e]","\u0627"); // alef
    rules.put("[\\ufe8f\\ufe90\\ufe91\\ufe92]","\u0628"); // beh
    rules.put("[\\ufe93\\ufe94]","\u0629"); // teh marbuta
    rules.put("[\\ufe95\\ufe96\\ufe97\\ufe98]","\u062a"); // teh
    rules.put("[\\ufe99\\ufe9a\\ufe9b\\ufe9c]","\u062b"); // theh
    rules.put("[\\ufe9d\\ufe9e\\ufe9f\\ufea0]","\u062c"); // jeem
    rules.put("[\\ufea1\\ufea2\\ufea3\\ufea4]","\u062d"); // haa
    rules.put("[\\ufea5\\ufea6\\ufea7\\ufea8]","\u062e"); // khaa
    rules.put("[\\ufea9\\ufeaa]","\u062f"); // dal
    rules.put("[\\ufeab\\ufeac]","\u0630"); // dhal
    rules.put("[\\ufead\\ufeae]","\u0631"); // reh
    rules.put("[\\ufeaf\\ufeb0]","\u0632"); // zain
    rules.put("[\\ufeb1\\ufeb2\\ufeb3\\ufeb4]","\u0633"); // seen
    rules.put("[\\ufeb5\\ufeb6\\ufeb7\\ufeb8]","\u0634"); // sheen
    rules.put("[\\ufeb9\\ufeba\\ufebb\\ufebc]","\u0635"); // sad
    rules.put("[\\ufebd\\ufebe\\ufebf\\ufec0]","\u0636"); // dad
    rules.put("[\\ufec1\\ufec2\\ufec3\\ufec4]","\u0637"); // tah
    rules.put("[\\ufec5\\ufec6\\ufec7\\ufec8]","\u0638"); // zah
    rules.put("[\\ufec9\\ufeca\\ufecb\\ufecc]","\u0639"); // ain
    rules.put("[\\ufecd\\ufece\\ufecf\\ufed0]","\u063a"); // ghain
    rules.put("[\\ufed1\\ufed2\\ufed3\\ufed4]","\u0641"); // feh
    rules.put("[\\ufed5\\ufed6\\ufed7\\ufed8]","\u0642"); // qaf
    rules.put("[\\ufed9\\ufeda\\ufedb\\ufedc]","\u0643"); // kaf
    rules.put("[\\ufedd\\ufede\\ufedf\\ufee0]","\u0644"); // lam
    rules.put("[\\ufee1\\ufee2\\ufee3\\ufee4]","\u0645"); // meem
    rules.put("[\\ufee5\\ufee6\\ufee7\\ufee8]","\u0646"); // noon
    rules.put("[\\ufee9\\ufeea\\ufeeb\\ufeec]","\u0647"); // heh
    rules.put("[\\ufeed\\ufeee]","\u0648"); // waw
    rules.put("[\\ufeef\\ufef0]","\u0649"); // alef maksura
    rules.put("[\\ufef1\\ufef2\\ufef3\\ufef4]","\u064a"); // yeh
    rules.put("[\\ufef5\\ufef6]","\u0644\u0622");  // ligature: lam and alef with madda above
    rules.put("[\\ufef7\\ufef8]","\u0644\u0623");  // ligature: lam and alef with hamza above
    rules.put("[\\ufef9\\ufefa]","\u0644\u0625"); // ligature: lam and alef with hamza below
    rules.put("[\\ufefb\\ufefc]","\u0644\u0627"); // ligature: lam and alef

    return rules;
  }


  /**
   * Builds the IBM normalization rules for logical-form Arabic text: letter and diacritic
   * normalization, tatweel removal, white space, punctuation and digit normalization, and the
   * dash and quote rules suggested by Ralf Brown.
   *
   * @return a new, mutable map from regular expression to replacement string; it iterates in the
   *         order in which the rules are declared below
   */
  public static Map<String,String> getArabicIBMNormalizerMap(){
    // LinkedHashMap so that callers iterating the rules see them in declaration order. Some rules
    // feed later ones (alef maqsura becomes yaa before the yaa + hamza rule runs).
    Map<String,String> rules = new LinkedHashMap<>();

    rules.put("[\\u0622\\u0623\\u0625]","\u0627"); // hamza normalization: maddah-n-alef, hamza-on-alef, hamza-under-alef mapped to bare alef

    rules.put("[\\u0649]","\u064A");  // 'alif maqSuura mapped to yaa

    rules.put("[\\u064B\\u064C\\u064D\\u064E\\u064F\\u0650\\u0651\\u0652\\u0653\\u0670]","");  //  fatHatayn, Dammatayn, kasratayn, fatHa, Damma, kasra, shaddah, sukuun, maddah above, and dagger alef (delete)

    rules.put("\\u0640(?=\\s*\\S)",""); // tatweel, delete except when trailing
    rules.put("(\\S)\\u0640","$1"); // tatweel, delete if preceded by non-white-space

    rules.put("[\\ufeff\\u00a0]"," "); // white space normalization

    // punctuation normalization

    rules.put("\\u060c",","); // Arabic comma
    rules.put("\\u061b",";"); // Arabic semicolon
    rules.put("\\u061f","?"); // Arabic question mark
    rules.put("\\u066a","%"); // Arabic percent sign
    rules.put("\\u066b","."); // Arabic decimal separator
    rules.put("\\u066c",","); // Arabic thousand separator (comma)
    rules.put("\\u066d","*"); // Arabic asterisk
    rules.put("\\u06d4","."); // Arabic full stop

    // Arabic/Arabic indic/eastern Arabic/ digits normalization

    rules.put("[\\u0660\\u06f0\\u0966]","0");
    rules.put("[\\u0661\\u06f1\\u0967]","1");
    rules.put("[\\u0662\\u06f2\\u0968]","2");
    rules.put("[\\u0663\\u06f3\\u0969]","3");
    rules.put("[\\u0664\\u06f4\\u096a]","4");
    rules.put("[\\u0665\\u06f5\\u096b]","5");
    rules.put("[\\u0666\\u06f6\\u096c]","6");
    rules.put("[\\u0667\\u06f7\\u096d]","7");
    rules.put("[\\u0668\\u06f8\\u096e]","8");
    rules.put("[\\u0669\\u06f9\\u096f]","9");

    // Arabic combining hamza above/below and dagger(superscript)  alef
    rules.put("[\\u0654\\u0655\\u0670]","");

    // replace yaa followed by hamza with hamza on kursi (yaa)
    rules.put("\\u064A\\u0621","\u0626");

    // Normalization Rules Suggested by Ralf Brown (CMU):

    rules.put("\\u2013","-"); // EN-dash to ASCII hyphen
    rules.put("\\u2014","--"); // EM-dash to double ASCII hyphen

    // code point 0x91 - latin-1 left single quote
    // code point 0x92 - latin-1 right single quote
    // code point 0x2018 = left single quote; convert to ASCII single quote
    // code point 0x2019 = right single quote; convert to ASCII single quote

    rules.put("[\\u0091\\u0092\\u2018\\u2019]","'");

    // code point 0x93 - latin-1 left double quote
    // code point 0x94 - latin-1 right double quote
    // code points 0x201C/201D = left/right double quote -> ASCII double quote

    rules.put("[\\u0093\\u0094\\u201C\\u201D]","\"");

    return rules;
  }


  /** This will normalize a Unicode String by applying all the normalization rules from the IBM normalization and
   *    conversion from Presentation to Logical form.
   *
   *  The presentation-to-logical rules run first and the IBM rules second, each in declaration
   *  order, so the result does not depend on hash iteration order. The rules are compiled once
   *  when the class is loaded rather than on every call.
   *
   *  @param in The String to be normalized; must not be null
   *  @return The normalized String
   */
  public static String normalize(String in) {
    String out = in;

    // Apply each rule to the output of the previous one.
    for (CompiledRule rule : NORMALIZATION_RULES) {
      out = rule.applyTo(out);
    }

    return out;
  }


  /**
   * Normalizes a UTF-8 text file line by line and prints each normalized line as UTF-8 through
   * {@link EncodingPrintWriter}.
   *
   * @param args Command-line arguments, converted to properties; the {@code input} property names
   *             the file to normalize
   * @throws IOException If the input file cannot be opened or read
   */
  public static void main(String[] args) throws IOException {

    Properties p = StringUtils.argsToProperties(args);

    if ( ! p.containsKey("input")) {
      log.info("No input file given; expected an 'input' property naming a UTF-8 text file.");
      return;
    }

    // try-with-resources closes the reader (and the underlying file stream) even if reading fails.
    try (BufferedReader reader = new BufferedReader(
             new InputStreamReader(new FileInputStream(p.getProperty("input")), StandardCharsets.UTF_8))) {
      String thisLine;
      while ((thisLine = reader.readLine()) != null) {
        EncodingPrintWriter.out.println(normalize(thisLine),"UTF-8");
      }
    }

  }

}