TextNormalize.java example

Explorer
with-aes-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.util.HashMap;

/**
 * This class allows a caller to normalize text in various ways.
 * It will use the ICU4J library if its classes can be loaded from the
 * classpath; when they cannot, the methods that delegate to ICU4J return
 * their input unchanged.
 * 
 * @author <a href="mailto:carrier@digital-evidence.org">Brian Carrier</a>
 * @version $Revision: 1.0 $
 */
public class TextNormalize 
{
    /**
     * Maps the code of a spacing (non-combining) diacritic character to a
     * string holding its combining counterpart. Built once when the class is
     * initialized and only read afterwards, so constructing further instances
     * never mutates this shared table.
     */
    private static final HashMap<Integer, String> DIACHASH = createDiacHash();

    /** ICU4J-backed implementation, or null when ICU4J could not be loaded. */
    private ICU4JImpl icu4j = null;

    /** Encoding the text will eventually be written as; may be null. */
    private final String outputEncoding;

    /**
     * Creates a normalizer for the given output encoding and probes the
     * classpath for the ICU4J library.
     * 
     * @param encoding The Encoding that the text will eventually be written as (or null)
     */
    public TextNormalize(String encoding) 
    {
        findICU4J();
        this.outputEncoding = encoding;
    }

    /**
     * Tries to load the ICU4J classes this class relies on. On success an
     * ICU4J-backed implementation is created; otherwise {@link #icu4j} stays
     * null and callers fall back to returning their input unchanged.
     */
    private void findICU4J() 
    {
        // see if we can load the icu4j classes from the classpath
        try 
        {
            this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Bidi");
            this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Normalizer");
            icu4j = new ICU4JImpl();
        } 
        catch (ClassNotFoundException e) 
        {
            // ICU4J is optional: its absence is not an error, it only disables
            // the ICU4J-backed normalizations.
            icu4j = null;
        }
    }

    /**
     * Builds the table of non-decomposing diacritics and their related
     * combining character. These are values that the unicode spec claims
     * are equivalent but are not mapped in the form NFKC normalization method.
     * Determined by going through the Combining Diacritical Marks section of 
     * the Unicode spec and identifying which characters are not mapped to by 
     * the normalization.
     * 
     * @return a new map from spacing diacritic code to combining character string
     */
    private static HashMap<Integer, String> createDiacHash()
    {
        HashMap<Integer, String> map = new HashMap<Integer, String>();
        map.put(Integer.valueOf(0x0060), "\u0300");
        map.put(Integer.valueOf(0x02CB), "\u0300");
        map.put(Integer.valueOf(0x0027), "\u0301");
        map.put(Integer.valueOf(0x02B9), "\u0301");
        map.put(Integer.valueOf(0x02CA), "\u0301");
        map.put(Integer.valueOf(0x005e), "\u0302");
        map.put(Integer.valueOf(0x02C6), "\u0302");
        map.put(Integer.valueOf(0x007E), "\u0303");
        map.put(Integer.valueOf(0x02C9), "\u0304");
        map.put(Integer.valueOf(0x00B0), "\u030A");
        map.put(Integer.valueOf(0x02BA), "\u030B");
        map.put(Integer.valueOf(0x02C7), "\u030C");
        map.put(Integer.valueOf(0x02C8), "\u030D");
        map.put(Integer.valueOf(0x0022), "\u030E");
        map.put(Integer.valueOf(0x02BB), "\u0312");
        map.put(Integer.valueOf(0x02BC), "\u0313");
        map.put(Integer.valueOf(0x0486), "\u0313");
        map.put(Integer.valueOf(0x055A), "\u0313");
        map.put(Integer.valueOf(0x02BD), "\u0314");
        map.put(Integer.valueOf(0x0485), "\u0314");
        map.put(Integer.valueOf(0x0559), "\u0314");
        map.put(Integer.valueOf(0x02D4), "\u031D");
        map.put(Integer.valueOf(0x02D5), "\u031E");
        map.put(Integer.valueOf(0x02D6), "\u031F");
        map.put(Integer.valueOf(0x02D7), "\u0320");
        map.put(Integer.valueOf(0x02B2), "\u0321");
        map.put(Integer.valueOf(0x02CC), "\u0329");
        map.put(Integer.valueOf(0x02B7), "\u032B");
        map.put(Integer.valueOf(0x02CD), "\u0331");
        map.put(Integer.valueOf(0x005F), "\u0332");
        map.put(Integer.valueOf(0x204E), "\u0359");
        return map;
    }

    /**
     * Takes a line of text in presentation order and converts it to logical order.
     * For most text other than Arabic and Hebrew, the presentation and logical
     * orders are the same. However, for Arabic and Hebrew, they are different and
     * if the text involves both RTL and LTR text then the Unicode BIDI algorithm
     * must be used to determine how to map between them.  
     * 
     * @param str Presentation form of line to convert (i.e. left most char is first char)
     * @param isRtlDominant true if the PAGE has a dominant right to left ordering
     * @return Logical form of string (or original string if ICU4J library is not on classpath)
     */
    public String makeLineLogicalOrder(String str, boolean isRtlDominant) 
    {
        if (icu4j != null) 
        {
            return icu4j.makeLineLogicalOrder(str, isRtlDominant);
        }
        else 
        {
            return str;
        }
    }

    /**
     * Normalize the presentation forms of characters in the string.
     * For example, convert the single "fi" ligature to "f" and "i".
     * 
     * @param str String to normalize
     * @return Normalized string (or original string if ICU4J library is not on classpath)
     */
    public String normalizePres(String str) 
    {
        if (icu4j != null) 
        {
            return icu4j.normalizePres(str);
        }
        else 
        {
            return str;
        }
    }
    
    /**
     * Normalize the diacritic, for example, 
     * convert non-combining diacritic characters to their combining
     * counterparts. 
     * <p>
     * The conversion only happens when the output encoding given to the
     * constructor starts with "UTF" (ignoring case). In that case the first
     * character of the string is looked up in the built-in diacritic table;
     * on a match the result is the combining character alone, so any further
     * characters of the input are not part of the result. Without a match the
     * string is handed to ICU4J if it is available.
     * 
     * @param str String to normalize; null or empty input is returned as is
     * @return the combining form of the diacritic, the ICU4J-normalized string,
     *         or the original string when no conversion applies
     */
    public String normalizeDiac(String str)
    {
        // Nothing to look at: avoid charAt(0) failing on null or empty input.
        if (str == null || str.length() == 0)
        {
            return str;
        }

        /*
         * Unicode contains special combining forms of the diacritic characters
         * and we want to use these, but only when the output can represent them.
         */
        if (outputEncoding == null || !outputEncoding.toUpperCase().startsWith("UTF"))
        {
            return str;
        }

        // convert the characters not defined in the Unicode spec
        String combining = DIACHASH.get(Integer.valueOf((int) str.charAt(0)));
        if (combining != null)
        {
            return combining;
        }

        if (icu4j != null) 
        {
            return icu4j.normalizeDiac(str);
        }
        return str;
    }
}