// DevanagariScriptProcessor.java example
//
// Explorer
// fop-master
// - fop-trunk
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id$ */

package org.apache.fop.complexscripts.scripts;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.fop.complexscripts.util.CharAssociation;
import org.apache.fop.complexscripts.util.GlyphSequence;

// CSOFF: LineLengthCheck

/**
 * <p>The <code>DevanagariScriptProcessor</code> class implements a script processor for
 * performing glyph substitution and positioning operations on content associated with the Devanagari script.</p>
 *
 * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
 */
public class DevanagariScriptProcessor extends IndicScriptProcessor {

    /** Logging instance for this class; no code visible in this file references it. */
    private static final Log log = LogFactory.getLog(DevanagariScriptProcessor.class);

    /**
     * Construct a Devanagari script processor. The constructor is package-private.
     * @param script the script identifier, passed unchanged to the superclass constructor
     */
    DevanagariScriptProcessor(String script) {
        super(script);
    }

    /**
     * Obtain the syllabizer implementation class for this script processor.
     * @return the class object of the nested <code>DevanagariSyllabizer</code>
     */
    @Override
    protected Class<? extends DevanagariSyllabizer> getSyllabizerClass() {
        return DevanagariSyllabizer.class;
    }

    /**
     * Find the rightmost glyph whose associated characters include a pre-base matra.
     * @param gs glyph sequence to search
     * @return index of the last glyph containing a pre-base matra, or -1 if there is none
     */
    @Override
    protected int findPreBaseMatra(GlyphSequence gs) {
        // walk from the final glyph back towards the first and stop at the first hit
        int k = gs.getGlyphCount();
        while (k-- > 0) {
            if (containsPreBaseMatra(gs, k)) {
                return k;
            }
        }
        return -1;
    }

    /**
     * Find the leftmost pre-base matra target, scanning leftwards from the glyph that precedes
     * <code>source</code>. The first consonant glyph encountered becomes the candidate; each half
     * consonant encountered after that moves the candidate further left; the scan ends at the
     * first non-half consonant seen once a candidate exists.
     * @param gs glyph sequence to search
     * @param source glyph index at which to start; values beyond the glyph count are clamped to it
     * @return index of the target glyph, or -1 if no consonant glyph precedes <code>source</code>
     */
    @Override
    protected int findPreBaseMatraTarget(GlyphSequence gs, int source) {
        int target = -1;
        int k = Math.min(source, gs.getGlyphCount());
        while (k-- > 0) {
            if (!containsConsonant(gs, k)) {
                continue;                   // non-consonant glyphs never affect the target
            }
            if (!containsHalfConsonant(gs, k) && (target != -1)) {
                break;                      // a full consonant left of an existing candidate ends the scan
            }
            target = k;
        }
        return target;
    }

    /**
     * Determine if any character associated with glyph <code>k</code> is a pre-base matra.
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if a character in the glyph's association interval satisfies {@link #isPreM}
     */
    private static boolean containsPreBaseMatra(GlyphSequence gs, int k) {
        final CharAssociation assoc = gs.getAssociation(k);
        final int[] chars = gs.getCharacterArray(false);
        int pos = assoc.getStart();
        final int limit = assoc.getEnd();
        while (pos < limit) {
            if (isPreM(chars[pos])) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Determine if any character associated with glyph <code>k</code> is a consonant.
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if a character in the glyph's association interval satisfies {@link #isC}
     */
    private static boolean containsConsonant(GlyphSequence gs, int k) {
        final CharAssociation assoc = gs.getAssociation(k);
        final int[] chars = gs.getCharacterArray(false);
        int pos = assoc.getStart();
        final int limit = assoc.getEnd();
        while (pos < limit) {
            if (isC(chars[pos])) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Determine if glyph <code>k</code> is marked as a half consonant, i.e., if its character
     * association carries a true "half" predication.
     * @param gs glyph sequence
     * @param k glyph index
     * @return the value of the "half" predication, or false if it is absent
     */
    private static boolean containsHalfConsonant(GlyphSequence gs, int k) {
        final CharAssociation assoc = gs.getAssociation(k);
        final Boolean flag = (Boolean) assoc.getPredication("half");
        if (flag == null) {
            return false;
        }
        return flag.booleanValue();
    }

    /**
     * Find the leftmost glyph that is marked as a reph.
     * @param gs glyph sequence to search
     * @return index of the first glyph for which {@link #containsReph} holds, or -1 if there is none
     */
    @Override
    protected int findReph(GlyphSequence gs) {
        final int count = gs.getGlyphCount();
        int i = 0;
        // advance past every glyph that is not a reph
        while ((i < count) && !containsReph(gs, i)) {
            i++;
        }
        return (i < count) ? i : -1;
    }

    /**
     * Find the glyph position to which a reph should be moved.
     * <p>Two candidates are computed: the first is the position just after the first non-half
     * consonant glyph other than <code>source</code>; the second, searched from the first candidate
     * (or from 0 if there is none), is the position just after the last non-pre-base matra glyph,
     * or the position of the first glyph containing a tone, accent or other mark, at which the
     * search stops. The second candidate takes precedence over the first.</p>
     * @param gs glyph sequence to search
     * @param source glyph index of the reph
     * @return the second candidate if found, otherwise the first candidate if found, otherwise <code>source</code>
     */
    @Override
    protected int findRephTarget(GlyphSequence gs, int source) {
        final int count = gs.getGlyphCount();

        // first candidate: just after the first full (non-half) consonant that is not the reph itself
        int afterBase = -1;
        for (int i = 0; (i < count) && (afterBase < 0); i++) {
            if (i == source) {
                continue;
            }
            if (containsConsonant(gs, i) && !containsHalfConsonant(gs, i)) {
                afterBase = i + 1;
            }
        }

        // second candidate: just after the last non-pre-base matra, or at the first other mark
        int afterMatra = -1;
        int i = Math.max(afterBase, 0);
        while (i < count) {
            if (containsMatra(gs, i) && !containsPreBaseMatra(gs, i)) {
                afterMatra = i + 1;
            } else if (containsOtherMark(gs, i)) {
                afterMatra = i;
                break;
            }
            i++;
        }

        if (afterMatra >= 0) {
            return afterMatra;
        }
        return (afterBase >= 0) ? afterBase : source;
    }

    /**
     * Determine if glyph <code>k</code> is marked as a reph, i.e., if its character association
     * carries a true "rphf" predication.
     * @param gs glyph sequence
     * @param k glyph index
     * @return the value of the "rphf" predication, or false if it is absent
     */
    private static boolean containsReph(GlyphSequence gs, int k) {
        final CharAssociation assoc = gs.getAssociation(k);
        final Boolean flag = (Boolean) assoc.getPredication("rphf");
        if (flag == null) {
            return false;
        }
        return flag.booleanValue();
    }

    /**
     * Determine if any character associated with glyph <code>k</code> is a matra (vowel sign).
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if a character in the glyph's association interval satisfies {@link #isM}
     */
    private static boolean containsMatra(GlyphSequence gs, int k) {
        final CharAssociation assoc = gs.getAssociation(k);
        final int[] chars = gs.getCharacterArray(false);
        int pos = assoc.getStart();
        final int limit = assoc.getEnd();
        while (pos < limit) {
            if (isM(chars[pos])) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Determine if any character associated with glyph <code>k</code> is a tone mark
     * (e.g., udatta, anudatta), an accent mark (e.g., acute, grave), or another sign
     * (e.g., candrabindu, anusvara, visarga).
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if a character in the glyph's association interval has type C_T, C_A or C_O
     */
    private static boolean containsOtherMark(GlyphSequence gs, int k) {
        final CharAssociation assoc = gs.getAssociation(k);
        final int[] chars = gs.getCharacterArray(false);
        int pos = assoc.getStart();
        final int limit = assoc.getEnd();
        while (pos < limit) {
            final int type = typeOf(chars[pos]);
            if ((type == C_T) || (type == C_A) || (type == C_O)) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Syllabizer for Devanagari text. A syllable starts at a consonant and spans zero or more
     * dead consonants followed by at most one live consonant (<code>D* L?</code>), at least one
     * of which must be present.
     */
    private static class DevanagariSyllabizer extends DefaultSyllabizer {
        /**
         * @param script script identifier, passed unchanged to the superclass
         * @param language language identifier, passed unchanged to the superclass
         */
        DevanagariSyllabizer(String script, String language) {
            super(script, language);
        }
        /**
         * Find the start of the next syllable in <code>[s,e)</code>. Grammar: <code>| C ...</code>
         * @param ca character array
         * @param s index at which to start searching (inclusive)
         * @param e index at which to stop searching (exclusive)
         * @return index of the first consonant at or after <code>s</code>, <code>e</code> if the range
         * contains no consonant, or -1 if <code>s</code> is negative or the range is empty
         */
        @Override
        protected int findStartOfSyllable(int[] ca, int s, int e) {
            if ((s < 0) || (s >= e)) {
                return -1;
            }
            // skip everything that is not a consonant
            while ((s < e) && !isC(ca[s])) {
                s++;
            }
            return s;
        }
        /**
         * Find the end of the syllable that starts at <code>s</code>. Grammar: <code>D* L? | ...</code>
         * @param ca character array
         * @param s index of the start of the syllable (inclusive)
         * @param e index at which to stop searching (exclusive)
         * @return index just past the last character of the syllable, or -1 if <code>s</code> is negative,
         * the range is empty, or neither a dead nor a live consonant starts at <code>s</code>
         */
        @Override
        protected int findEndOfSyllable(int[] ca, int s, int e) {
            if ((s < 0) || (s >= e)) {
                return -1;
            }
            final int start = s;
            int next;
            // consume dead consonants
            while ((next = isDeadConsonant(ca, s, e)) > s) {
                s = next;
            }
            // consume zero or one live consonant
            if ((next = isLiveConsonant(ca, s, e)) > s) {
                s = next;
            }
            // every successful match strictly advances s, so no advance means nothing matched
            return (s > start) ? s : -1;
        }
        /**
         * Match a dead consonant at <code>s</code>. Grammar: <code>D := ( C N? H )?</code>
         * @param ca character array
         * @param s index at which to match (inclusive)
         * @param e index at which to stop matching (exclusive)
         * @return index just past the halant if a consonant, an optional nukta and a halant are
         * found in sequence starting at <code>s</code>, otherwise -1
         */
        private int isDeadConsonant(int[] ca, int s, int e) {
            if (s < 0) {
                return -1;
            }
            int i = s;
            // C
            if ((i >= e) || !isC(ca[i])) {
                return -1;
            }
            i++;
            // N? (indexed by the running position, like every other step)
            if ((i < e) && isN(ca[i])) {
                i++;
            }
            // H
            if ((i < e) && isH(ca[i])) {
                return i + 1;
            }
            return -1;
        }
        /**
         * Match a live consonant (or independent vowel) at <code>s</code>.
         * Grammar: <code>L := ( (C|V) N? X* )?</code>; where X = ( MATRA | ACCENT MARK | TONE MARK | OTHER MARK )
         * @param ca character array
         * @param s index at which to match (inclusive)
         * @param e index at which to stop matching (exclusive)
         * @return index just past the last matched character, or -1 if no consonant or vowel is
         * found at <code>s</code>, or if the match has no X and is immediately followed by a halant
         */
        private int isLiveConsonant(int[] ca, int s, int e) {
            if ((s < 0) || (s >= e)) {
                return -1;
            }
            // C|V
            final int first = ca[s];
            if (!isC(first) && !isV(first)) {
                return -1;
            }
            int i = s + 1;
            // N?
            if ((i < e) && isN(ca[i])) {
                i++;
            }
            // X*
            int nx = 0;
            while ((i < e) && isX(ca[i])) {
                i++;
                nx++;
            }
            // if no X but has H, then ignore C|V
            if ((nx == 0) && (i < e) && isH(ca[i])) {
                return -1;
            }
            return i;
        }
    }

    // devanagari character types
    // Each CCA entry is a short: the bits selected by C_M_TYPE hold one of the type codes
    // C_U..C_O, and the bits selected by C_M_FLAGS hold zero or more of the flags C_N, C_R, C_PRE.
    static final short C_U          = 0;            // unassigned
    static final short C_C          = 1;            // consonant
    static final short C_V          = 2;            // vowel
    static final short C_M          = 3;            // vowel sign (matra)
    static final short C_S          = 4;            // symbol or sign
    static final short C_T          = 5;            // tone mark
    static final short C_A          = 6;            // accent mark
    static final short C_P          = 7;            // punctuation
    static final short C_D          = 8;            // digit
    static final short C_H          = 9;            // halant (virama)
    static final short C_O          = 10;           // other signs
    static final short C_N          = 0x0100;       // nukta(ized)
    static final short C_R          = 0x0200;       // reph(ized)
    static final short C_PRE        = 0x0400;       // pre-base
    static final short C_M_TYPE     = 0x00FF;       // type mask
    static final short C_M_FLAGS    = 0x7F00;       // flag mask
    // devanagari block range
    // CCA is indexed by (code point - CCA_START); the range is half-open, [CCA_START, CCA_END)
    static final int CCA_START       =  0x0900;      // first code point mapped by cca
    static final int CCA_END         =  0x0980;      // last code point + 1 mapped by cca
    // devanagari character type lookups
    // One entry per code point in [CCA_START, CCA_END); each entry is a type code optionally
    // or'ed with flags. The trailing comments give the code point and its name.
    static final short[] CCA = {
        C_O,                        // 0x0900       // INVERTED CANDRABINDU
        C_O,                        // 0x0901       // CANDRABINDU
        C_O,                        // 0x0902       // ANUSVARA
        C_O,                        // 0x0903       // VISARGA
        C_V,                        // 0x0904       // SHORT A
        C_V,                        // 0x0905       // A
        C_V,                        // 0x0906       // AA
        C_V,                        // 0x0907       // I
        C_V,                        // 0x0908       // II
        C_V,                        // 0x0909       // U
        C_V,                        // 0x090A       // UU
        C_V,                        // 0x090B       // VOCALIC R
        C_V,                        // 0x090C       // VOCALIC L
        C_V,                        // 0x090D       // CANDRA E
        C_V,                        // 0x090E       // SHORT E
        C_V,                        // 0x090F       // E
        C_V,                        // 0x0910       // AI
        C_V,                        // 0x0911       // CANDRA O
        C_V,                        // 0x0912       // SHORT O
        C_V,                        // 0x0913       // O
        C_V,                        // 0x0914       // AU
        C_C,                        // 0x0915       // KA
        C_C,                        // 0x0916       // KHA
        C_C,                        // 0x0917       // GA
        C_C,                        // 0x0918       // GHA
        C_C,                        // 0x0919       // NGA
        C_C,                        // 0x091A       // CA
        C_C,                        // 0x091B       // CHA
        C_C,                        // 0x091C       // JA
        C_C,                        // 0x091D       // JHA
        C_C,                        // 0x091E       // NYA
        C_C,                        // 0x091F       // TTA
        C_C,                        // 0x0920       // TTHA
        C_C,                        // 0x0921       // DDA
        C_C,                        // 0x0922       // DDHA
        C_C,                        // 0x0923       // NNA
        C_C,                        // 0x0924       // TA
        C_C,                        // 0x0925       // THA
        C_C,                        // 0x0926       // DA
        C_C,                        // 0x0927       // DHA
        C_C,                        // 0x0928       // NA
        C_C,                        // 0x0929       // NNNA
        C_C,                        // 0x092A       // PA
        C_C,                        // 0x092B       // PHA
        C_C,                        // 0x092C       // BA
        C_C,                        // 0x092D       // BHA
        C_C,                        // 0x092E       // MA
        C_C,                        // 0x092F       // YA
        C_C | C_R,                  // 0x0930       // RA
        C_C | C_R | C_N,            // 0x0931       // RRA          = 0930+093C
        C_C,                        // 0x0932       // LA
        C_C,                        // 0x0933       // LLA
        C_C,                        // 0x0934       // LLLA
        C_C,                        // 0x0935       // VA
        C_C,                        // 0x0936       // SHA
        C_C,                        // 0x0937       // SSA
        C_C,                        // 0x0938       // SA
        C_C,                        // 0x0939       // HA
        C_M,                        // 0x093A       // OE (KASHMIRI)
        C_M,                        // 0x093B       // OOE (KASHMIRI)
        C_N,                        // 0x093C       // NUKTA (flag only: type bits are 0, so typeOf() reports C_U)
        C_S,                        // 0x093D       // AVAGRAHA
        C_M,                        // 0x093E       // AA
        C_M | C_PRE,                // 0x093F       // I
        C_M,                        // 0x0940       // II
        C_M,                        // 0x0941       // U
        C_M,                        // 0x0942       // UU
        C_M,                        // 0x0943       // VOCALIC R
        C_M,                        // 0x0944       // VOCALIC RR
        C_M,                        // 0x0945       // CANDRA E
        C_M,                        // 0x0946       // SHORT E
        C_M,                        // 0x0947       // E
        C_M,                        // 0x0948       // AI
        C_M,                        // 0x0949       // CANDRA O
        C_M,                        // 0x094A       // SHORT O
        C_M,                        // 0x094B       // O
        C_M,                        // 0x094C       // AU
        C_H,                        // 0x094D       // VIRAMA (HALANT)
        C_M,                        // 0x094E       // PRISHTHAMATRA E (NOTE(review): presumably a left-side sign, yet not flagged C_PRE - confirm)
        C_M,                        // 0x094F       // AW
        C_S,                        // 0x0950       // OM
        C_T,                        // 0x0951       // UDATTA
        C_T,                        // 0x0952       // ANUDATTA
        C_A,                        // 0x0953       // GRAVE
        C_A,                        // 0x0954       // ACUTE
        C_M,                        // 0x0955       // CANDRA LONG E
        C_M,                        // 0x0956       // UE
        C_M,                        // 0x0957       // UUE
        C_C | C_N,                  // 0x0958       // QA
        C_C | C_N,                  // 0x0959       // KHHA
        C_C | C_N,                  // 0x095A       // GHHA
        C_C | C_N,                  // 0x095B       // ZA
        C_C | C_N,                  // 0x095C       // DDDHA
        C_C | C_N,                  // 0x095D       // RHA
        C_C | C_N,                  // 0x095E       // FA
        C_C | C_N,                  // 0x095F       // YYA
        C_V,                        // 0x0960       // VOCALIC RR
        C_V,                        // 0x0961       // VOCALIC LL
        C_M,                        // 0x0962       // VOCALIC L
        C_M,                        // 0x0963       // VOCALIC LL
        C_P,                        // 0x0964       // DANDA
        C_P,                        // 0x0965       // DOUBLE DANDA
        C_D,                        // 0x0966       // ZERO
        C_D,                        // 0x0967       // ONE
        C_D,                        // 0x0968       // TWO
        C_D,                        // 0x0969       // THREE
        C_D,                        // 0x096A       // FOUR
        C_D,                        // 0x096B       // FIVE
        C_D,                        // 0x096C       // SIX
        C_D,                        // 0x096D       // SEVEN
        C_D,                        // 0x096E       // EIGHT
        C_D,                        // 0x096F       // NINE
        C_S,                        // 0x0970       // ABBREVIATION SIGN
        C_S,                        // 0x0971       // HIGH SPACING DOT
        C_V,                        // 0x0972       // CANDRA A (MARATHI)
        C_V,                        // 0x0973       // OE (KASHMIRI)
        C_V,                        // 0x0974       // OOE (KASHMIRI)
        C_V,                        // 0x0975       // AW (KASHMIRI)
        C_V,                        // 0x0976       // UE (KASHMIRI)
        C_V,                        // 0x0977       // UUE (KASHMIRI)
        C_U,                        // 0x0978       // UNASSIGNED (NOTE(review): later Unicode versions appear to assign MARWARI DDA here - confirm)
        C_C,                        // 0x0979       // ZHA
        C_C,                        // 0x097A       // HEAVY YA
        C_C,                        // 0x097B       // GGA (SINDHI)
        C_C,                        // 0x097C       // JJA (SINDHI)
        C_C,                        // 0x097D       // GLOTTAL STOP (LIMBU)
        C_C,                        // 0x097E       // DDDA (SINDHI)
        C_C                         // 0x097F       // BBA (SINDHI)
    };
    /**
     * Obtain the character type of a code point, with flag bits masked off.
     * @param c a code point
     * @return one of the C_U..C_O type codes; C_U if <code>c</code> is outside [CCA_START, CCA_END)
     */
    static int typeOf(int c) {
        if ((c < CCA_START) || (c >= CCA_END)) {
            return C_U;
        }
        return CCA[c - CCA_START] & C_M_TYPE;
    }
    /**
     * Determine if a code point has a specific character type.
     * @param c a code point
     * @param t a type code, compared against the result of {@link #typeOf}
     * @return true if the type of <code>c</code> equals <code>t</code>
     */
    static boolean isType(int c, int t) {
        return typeOf(c) == t;
    }
    /**
     * Determine if the table entry of a code point has all bits of <code>f</code> set.
     * @param c a code point
     * @param f one or more bits to test
     * @return true if <code>c</code> is in [CCA_START, CCA_END) and every bit of <code>f</code> is set in its entry
     */
    static boolean hasFlag(int c, int f) {
        if ((c < CCA_START) || (c >= CCA_END)) {
            return false;
        }
        final int entry = CCA[c - CCA_START];
        return (entry & f) == f;
    }
    /** @return true if <code>c</code> has type consonant (C_C) */
    static boolean isC(int c) {
        return typeOf(c) == C_C;
    }
    /** @return true if <code>c</code> is a consonant that carries the reph flag (C_R) */
    static boolean isR(int c) {
        return isC(c) && hasR(c);
    }
    /** @return true if <code>c</code> has type vowel (C_V) */
    static boolean isV(int c) {
        return typeOf(c) == C_V;
    }
    /** @return true if <code>c</code> is the nukta sign, 0x093C */
    static boolean isN(int c) {
        return 0x093C == c;
    }
    /** @return true if <code>c</code> is the virama (halant), 0x094D */
    static boolean isH(int c) {
        return 0x094D == c;
    }
    /** @return true if <code>c</code> has type vowel sign (C_M) */
    static boolean isM(int c) {
        return typeOf(c) == C_M;
    }
    /** @return true if <code>c</code> is a vowel sign that carries the pre-base flag (C_PRE) */
    static boolean isPreM(int c) {
        return isM(c) && hasFlag(c, C_PRE);
    }
    /**
     * Determine if a code point is a syllable-extending mark: a matra (combining vowel),
     * an accent mark, a tone mark, or another (modifying) mark.
     * @param c a code point
     * @return true if the type of <code>c</code> is C_M, C_A, C_T or C_O
     */
    static boolean isX(int c) {
        final int type = typeOf(c);
        return (type == C_M) || (type == C_A) || (type == C_T) || (type == C_O);
    }
    /** @return true if the table entry of <code>c</code> carries the reph flag (C_R) */
    static boolean hasR(int c) {
        return hasFlag(c, C_R);
    }
    /** @return true if the table entry of <code>c</code> carries the nukta flag (C_N) */
    static boolean hasN(int c) {
        return hasFlag(c, C_N);
    }

}