/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.util; import java.util.HashMap; /** * This class allows a caller to normalize text in various ways. * It will load the ICU4J jar file if it is defined on the classpath. * * @author <a href="mailto:carrier@digital-evidence.org">Brian Carrier</a> * @version $Revision: 1.0 $ */ public class TextNormalize { private ICU4JImpl icu4j = null; private static final HashMap DIACHASH = new HashMap(); private String outputEncoding; /** * * @param encoding The Encoding that the text will eventually be written as (or null) */ public TextNormalize(String encoding) { findICU4J(); populateDiacHash(); this.outputEncoding = encoding; } private void findICU4J() { // see if we can load the icu4j classes from the classpath try { this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Bidi"); this.getClass().getClassLoader().loadClass("com.ibm.icu.text.Normalizer"); icu4j = new ICU4JImpl(); } catch (ClassNotFoundException e) { icu4j = null; } } /* * Adds non-decomposing diacritics to the hash with their related * combining character. These are values that the unicode spec claims * are equivalent but are not mapped in the form NFKC normalization method. * Determined by going through the Combining Diacritical Marks section of * the Unicode spec and identifying which characters are not mapped to by * the normalization. */ private void populateDiacHash() { DIACHASH.put(new Integer(0x0060), "\u0300"); DIACHASH.put(new Integer(0x02CB), "\u0300"); DIACHASH.put(new Integer(0x0027), "\u0301"); DIACHASH.put(new Integer(0x02B9), "\u0301"); DIACHASH.put(new Integer(0x02CA), "\u0301"); DIACHASH.put(new Integer(0x005e), "\u0302"); DIACHASH.put(new Integer(0x02C6), "\u0302"); DIACHASH.put(new Integer(0x007E), "\u0303"); DIACHASH.put(new Integer(0x02C9), "\u0304"); DIACHASH.put(new Integer(0x00B0), "\u030A"); DIACHASH.put(new Integer(0x02BA), "\u030B"); DIACHASH.put(new Integer(0x02C7), "\u030C"); DIACHASH.put(new Integer(0x02C8), "\u030D"); DIACHASH.put(new Integer(0x0022), "\u030E"); DIACHASH.put(new Integer(0x02BB), "\u0312"); DIACHASH.put(new Integer(0x02BC), "\u0313"); DIACHASH.put(new Integer(0x0486), "\u0313"); DIACHASH.put(new Integer(0x055A), "\u0313"); DIACHASH.put(new Integer(0x02BD), "\u0314"); DIACHASH.put(new Integer(0x0485), "\u0314"); DIACHASH.put(new Integer(0x0559), "\u0314"); DIACHASH.put(new Integer(0x02D4), "\u031D"); DIACHASH.put(new Integer(0x02D5), "\u031E"); DIACHASH.put(new Integer(0x02D6), "\u031F"); DIACHASH.put(new Integer(0x02D7), "\u0320"); DIACHASH.put(new Integer(0x02B2), "\u0321"); DIACHASH.put(new Integer(0x02CC), "\u0329"); DIACHASH.put(new Integer(0x02B7), "\u032B"); DIACHASH.put(new Integer(0x02CD), "\u0331"); DIACHASH.put(new Integer(0x005F), "\u0332"); DIACHASH.put(new Integer(0x204E), "\u0359"); } /** * Takes a line of text in presentation order and converts it to logical order. * For most text other than Arabic and Hebrew, the presentation and logical * orders are the same. However, for Arabic and Hebrew, they are different and * if the text involves both RTL and LTR text then the Unicode BIDI algorithm * must be used to determine how to map between them. * * @param str Presentation form of line to convert (i.e. left most char is first char) * @param isRtlDominant true if the PAGE has a dominant right to left ordering * @return Logical form of string (or original string if ICU4J library is not on classpath) */ public String makeLineLogicalOrder(String str, boolean isRtlDominant) { if (icu4j != null) { return icu4j.makeLineLogicalOrder(str, isRtlDominant); } else { return str; } } /** * Normalize the presentation forms of characters in the string. * For example, convert the single "fi" ligature to "f" and "i". * * @param str String to normalize * @return Normalized string (or original string if ICU4J library is not on classpath) */ public String normalizePres(String str) { if (icu4j != null) { return icu4j.normalizePres(str); } else { return str; } } /** * Normalize the diacritic, for example, * convert non-combining diacritic characters to their combining * counterparts. * * @param str String to normalize * @return Normalized string (or original string if ICU4J library is not on classpath) */ public String normalizeDiac(String str) { /* * Unicode contains special combining forms of the diacritic characters * and we want to use these. */ if(outputEncoding != null && outputEncoding.toUpperCase().startsWith("UTF")) { Integer c = new Integer(str.charAt(0)); // convert the characters not defined in the Unicode spec if(DIACHASH.containsKey(c)) { return (String)DIACHASH.get(c); } else if (icu4j != null) { return icu4j.normalizeDiac(str); } else { return str; } } else { return str; } } }