// TamilScriptProcessor.java example
//
// Explorer
// fop-master
// - fop-trunk
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id$ */

package org.apache.fop.complexscripts.scripts;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.fop.complexscripts.util.CharAssociation;
import org.apache.fop.complexscripts.util.GlyphSequence;

// CSOFF: LineLengthCheck

/**
 * <p>The <code>TamilScriptProcessor</code> class implements a script processor for
 * performing glyph substitution and positioning operations on content associated with the Tamil script.</p>
 *
 * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
 */
public class TamilScriptProcessor extends IndicScriptProcessor {

    /** Logging instance for this class; no code within this class body currently references it. */
    private static final Log log = LogFactory.getLog(TamilScriptProcessor.class);

    /**
     * Instantiate a Tamil script processor; the script argument is passed
     * unchanged to the superclass constructor.
     * @param script the script identifier handed to the superclass
     */
    TamilScriptProcessor(String script) {
        super(script);
    }

    /**
     * Obtain the syllabizer implementation class used by this processor.
     * @return the class object of the nested {@code TamilSyllabizer}
     */
    @Override
    protected Class<? extends TamilSyllabizer> getSyllabizerClass() {
        return TamilSyllabizer.class;
    }

    /**
     * Locate the rightmost glyph whose associated characters include a pre-base matra.
     * @param gs glyph sequence to search
     * @return index of the rightmost matching glyph, or -1 if no glyph matches
     */
    @Override
    protected int findPreBaseMatra(GlyphSequence gs) {
        // walk from the last glyph towards the first so the first hit is the rightmost one
        for (int glyph = gs.getGlyphCount() - 1; glyph >= 0; glyph--) {
            if (containsPreBaseMatra(gs, glyph)) {
                return glyph;
            }
        }
        return -1;
    }

    /**
     * Locate the leftmost glyph that a pre-base matra may be moved in front of,
     * scanning leftwards from the glyph immediately before <code>source</code>.
     *
     * <p>The first consonant glyph encountered becomes the target. The target then keeps
     * moving left across consonant glyphs flagged as half forms, and the scan stops at
     * the next consonant glyph that is not a half form. Glyphs without a consonant are
     * skipped.</p>
     *
     * @param gs glyph sequence to search
     * @param source index of the pre-base matra glyph; values beyond the glyph count are clamped to it
     * @return index of the target glyph, or -1 if no consonant glyph precedes <code>source</code>
     */
    @Override
    protected int findPreBaseMatraTarget(GlyphSequence gs, int source) {
        int target = -1;
        for (int glyph = Math.min(source, gs.getGlyphCount()) - 1; glyph >= 0; glyph--) {
            if (!containsConsonant(gs, glyph)) {
                continue;
            }
            // a half form always extends the target; a full consonant only does so if it is the first one seen
            if (containsHalfConsonant(gs, glyph) || (target == -1)) {
                target = glyph;
            } else {
                break;
            }
        }
        return target;
    }

    /**
     * Test whether any character associated with glyph <code>k</code> is a pre-base matra.
     * The association's start (inclusive) and end (exclusive) are used as indices into
     * the sequence's character array.
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if at least one associated character satisfies {@code isPreM}
     */
    private static boolean containsPreBaseMatra(GlyphSequence gs, int k) {
        CharAssociation assoc = gs.getAssociation(k);
        int[] chars = gs.getCharacterArray(false);
        final int first = assoc.getStart();
        final int limit = assoc.getEnd();
        int pos = first;
        while (pos < limit) {
            if (isPreM(chars[pos])) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Test whether any character associated with glyph <code>k</code> is a consonant.
     * The association's start (inclusive) and end (exclusive) are used as indices into
     * the sequence's character array.
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if at least one associated character satisfies {@code isC}
     */
    private static boolean containsConsonant(GlyphSequence gs, int k) {
        CharAssociation assoc = gs.getAssociation(k);
        int[] chars = gs.getCharacterArray(false);
        final int first = assoc.getStart();
        final int limit = assoc.getEnd();
        int pos = first;
        while (pos < limit) {
            if (isC(chars[pos])) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Test whether the association of glyph <code>k</code> carries a true "half" predication.
     * @param gs glyph sequence
     * @param k glyph index
     * @return the predication's value, or false when the predication is absent
     */
    private static boolean containsHalfConsonant(GlyphSequence gs, int k) {
        Boolean flag = (Boolean) gs.getAssociation(k).getPredication("half");
        if (flag == null) {
            return false;
        }
        return flag.booleanValue();
    }

    /**
     * Locate the leftmost glyph whose association carries a true "rphf" predication.
     * @param gs glyph sequence to search
     * @return index of the first such glyph, or -1 if there is none
     */
    @Override
    protected int findReph(GlyphSequence gs) {
        final int count = gs.getGlyphCount();
        int glyph = 0;
        // advance past every glyph that is not a reph
        while ((glyph < count) && !containsReph(gs, glyph)) {
            glyph++;
        }
        return (glyph < count) ? glyph : -1;
    }

    /**
     * Determine the glyph position to which a reph glyph should be moved.
     *
     * <p>Two candidates are computed. The first is the position just after the first
     * glyph, other than <code>source</code>, that contains a consonant and is not a half
     * form. The second is found by scanning from the first candidate (or from 0 if there
     * is none): every glyph containing a matra that is not pre-base moves the candidate
     * to just after that glyph, and the first glyph containing a tone, accent or other
     * mark sets the candidate to that glyph's own position and ends the scan.</p>
     *
     * @param gs glyph sequence to search
     * @param source index of the reph glyph
     * @return the second candidate if found, otherwise the first candidate if found,
     * otherwise <code>source</code>
     */
    @Override
    protected int findRephTarget(GlyphSequence gs, int source) {
        final int count = gs.getGlyphCount();
        // first candidate: just after the first full consonant that is not the reph itself
        int afterBase = -1;
        for (int glyph = 0; (glyph < count) && (afterBase < 0); glyph++) {
            if ((glyph != source) && containsConsonant(gs, glyph) && !containsHalfConsonant(gs, glyph)) {
                afterBase = glyph + 1;
            }
        }
        // second candidate: after the last non-pre-base matra, or before the first other mark
        int afterMarks = -1;
        for (int glyph = Math.max(afterBase, 0); glyph < count; glyph++) {
            if (containsMatra(gs, glyph) && !containsPreBaseMatra(gs, glyph)) {
                afterMarks = glyph + 1;
            } else if (containsOtherMark(gs, glyph)) {
                afterMarks = glyph;
                break;
            }
        }
        return (afterMarks >= 0) ? afterMarks : ((afterBase >= 0) ? afterBase : source);
    }

    /**
     * Test whether the association of glyph <code>k</code> carries a true "rphf" predication.
     * @param gs glyph sequence
     * @param k glyph index
     * @return the predication's value, or false when the predication is absent
     */
    private static boolean containsReph(GlyphSequence gs, int k) {
        Boolean flag = (Boolean) gs.getAssociation(k).getPredication("rphf");
        if (flag == null) {
            return false;
        }
        return flag.booleanValue();
    }

    /**
     * Test whether any character associated with glyph <code>k</code> is a matra (vowel sign).
     * The association's start (inclusive) and end (exclusive) are used as indices into
     * the sequence's character array.
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if at least one associated character satisfies {@code isM}
     */
    private static boolean containsMatra(GlyphSequence gs, int k) {
        CharAssociation assoc = gs.getAssociation(k);
        int[] chars = gs.getCharacterArray(false);
        final int first = assoc.getStart();
        final int limit = assoc.getEnd();
        int pos = first;
        while (pos < limit) {
            if (isM(chars[pos])) {
                return true;
            }
            pos++;
        }
        return false;
    }

    /**
     * Test whether any character associated with glyph <code>k</code> is a tone mark,
     * an accent mark, or an "other" sign (types {@code C_T}, {@code C_A}, {@code C_O}).
     * @param gs glyph sequence
     * @param k glyph index
     * @return true if at least one associated character has one of those types
     */
    private static boolean containsOtherMark(GlyphSequence gs, int k) {
        CharAssociation assoc = gs.getAssociation(k);
        int[] chars = gs.getCharacterArray(false);
        final int first = assoc.getStart();
        final int limit = assoc.getEnd();
        for (int pos = first; pos < limit; pos++) {
            final int type = typeOf(chars[pos]);
            // tone (e.g., udatta, anudatta), accent (e.g., acute, grave),
            // other (e.g., candrabindu, anusvara, visarga, etc)
            if ((type == C_T) || (type == C_A) || (type == C_O)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Syllabizer for Tamil content. A syllable is taken to start at a consonant and to
     * consist of zero or more dead consonants followed by at most one live consonant,
     * i.e. <code>D* L?</code>, where
     * <pre>
     * D := C N? H
     * L := (C|V) N? X*      X = MATRA | ACCENT MARK | TONE MARK | OTHER MARK
     * </pre>
     * and C, V, N, H, X are decided by the enclosing class's {@code isC}, {@code isV},
     * {@code isN}, {@code isH} and {@code isX} predicates.
     */
    private static class TamilSyllabizer extends DefaultSyllabizer {

        /**
         * Instantiate a syllabizer; both arguments are passed unchanged to the superclass.
         * @param script script identifier
         * @param language language identifier
         */
        TamilSyllabizer(String script, String language) {
            super(script, language);
        }

        /**
         * Find where the next syllable starts: <code>| C ...</code>.
         * @param ca character array
         * @param s index at which to start looking
         * @param e index one past the last character to consider
         * @return index of the first consonant in <code>[s,e)</code>, or <code>e</code> if
         * the range holds no consonant, or -1 if <code>s</code> is negative or not less than <code>e</code>
         */
        @Override
        protected int findStartOfSyllable(int[] ca, int s, int e) {
            if ((s < 0) || (s >= e)) {
                return -1;
            }
            int pos = s;
            while ((pos < e) && !isC(ca[pos])) {
                pos++;
            }
            return pos;
        }

        /**
         * Find where the syllable beginning at <code>s</code> ends: <code>D* L? | ...</code>.
         * @param ca character array
         * @param s index of the first character of the syllable
         * @param e index one past the last character to consider
         * @return index one past the last character consumed, or -1 if nothing was consumed
         * or if <code>s</code> is negative or not less than <code>e</code>
         */
        @Override
        protected int findEndOfSyllable(int[] ca, int s, int e) {
            if ((s < 0) || (s >= e)) {
                return -1;
            }
            int pos = s;
            int next;
            // consume dead consonants
            while ((next = isDeadConsonant(ca, pos, e)) > pos) {
                pos = next;
            }
            // consume zero or one live consonant
            if ((next = isLiveConsonant(ca, pos, e)) > pos) {
                pos = next;
            }
            // every successful match strictly advances pos, so pos > s means something was consumed
            return (pos > s) ? pos : -1;
        }

        /**
         * Match one dead consonant, <code>D := C N? H</code>, at <code>s</code>.
         * @param ca character array
         * @param s index at which the match must begin
         * @param e index one past the last character to consider
         * @return index one past the halant that closes the match, or -1 if there is no match
         */
        private int isDeadConsonant(int[] ca, int s, int e) {
            // C (mandatory)
            if ((s < 0) || (s >= e) || !isC(ca[s])) {
                return -1;
            }
            int pos = s + 1;
            // N? -- read at the running position, like every other step of the match
            if ((pos < e) && isN(ca[pos])) {
                pos++;
            }
            // H (mandatory)
            if ((pos < e) && isH(ca[pos])) {
                return pos + 1;
            }
            return -1;
        }

        /**
         * Match one live consonant or independent vowel with its trailing marks,
         * <code>L := (C|V) N? X*</code>, at <code>s</code>.
         * @param ca character array
         * @param s index at which the match must begin
         * @param e index one past the last character to consider
         * @return index one past the last character matched, or -1 if there is no match
         */
        private int isLiveConsonant(int[] ca, int s, int e) {
            if ((s < 0) || (s >= e)) {
                return -1;
            }
            // C|V (mandatory)
            final int base = ca[s];
            if (!isC(base) && !isV(base)) {
                return -1;
            }
            int pos = s + 1;
            // N?
            if ((pos < e) && isN(ca[pos])) {
                pos++;
            }
            // X*
            int marks = 0;
            while ((pos < e) && isX(ca[pos])) {
                pos++;
                marks++;
            }
            // if no X but a halant follows, the base is not live, so reject the match
            if ((marks == 0) && (pos < e) && isH(ca[pos])) {
                return -1;
            }
            return pos;
        }
    }

    // tamil character types
    //
    // Each CCA entry is a short: the bits selected by C_M_TYPE hold exactly one of the
    // type codes C_U .. C_O below, and the higher bits hold zero or more of the flags
    // C_N, C_R, C_PRE, C_POST. typeOf() masks an entry with C_M_TYPE; hasFlag() tests
    // the flag bits directly. C_M_FLAGS is declared but not referenced in this class.
    static final short C_U          = 0;                        // unassigned
    static final short C_C          = 1;                        // consonant
    static final short C_V          = 2;                        // vowel
    static final short C_M          = 3;                        // vowel sign (matra)
    static final short C_S          = 4;                        // symbol or sign
    static final short C_T          = 5;                        // tone mark
    static final short C_A          = 6;                        // accent mark
    static final short C_P          = 7;                        // punctuation
    static final short C_D          = 8;                        // digit
    static final short C_H          = 9;                        // halant (virama)
    static final short C_O          = 10;                       // other signs
    static final short C_N          = 0x0100;                   // nukta(ized)
    static final short C_R          = 0x0200;                   // reph(ized)
    static final short C_PRE        = 0x0400;                   // pre-base
    static final short C_POST       = 0x1000;                   // post-base
    static final short C_WRAP       = C_PRE | C_POST;           // wrap (two part) vowel
    static final short C_M_TYPE     = 0x00FF;                   // type mask
    static final short C_M_FLAGS    = 0x7F00;                   // flag mask
    // tamil block range: CCA covers the half-open code point interval [CCA_START, CCA_END)
    static final int CCA_START       =  0x0B80;                 // first code point mapped by cca
    static final int CCA_END         =  0x0C00;                 // last code point + 1 mapped by cca
    // tamil character type lookups
    //
    // One entry per code point in [CCA_START, CCA_END), indexed by (code point - CCA_START).
    // Each row's first comment gives the code point, the second the character name
    // (empty where the entry is C_U, i.e. unassigned).
    static final short[] CCA = {
        C_U,                        // 0x0B80                   //
        C_U,                        // 0x0B81                   //
        C_O,                        // 0x0B82                   // ANUSVARA
        C_O,                        // 0x0B83                   // VISARGA
        C_U,                        // 0x0B84                   //
        C_V,                        // 0x0B85                   // A
        C_V,                        // 0x0B86                   // AA
        C_V,                        // 0x0B87                   // I
        C_V,                        // 0x0B88                   // II
        C_V,                        // 0x0B89                   // U
        C_V,                        // 0x0B8A                   // UU
        C_U,                        // 0x0B8B                   //
        C_U,                        // 0x0B8C                   //
        C_U,                        // 0x0B8D                   //
        C_V,                        // 0x0B8E                   // E
        C_V,                        // 0x0B8F                   // EE
        C_V,                        // 0x0B90                   // AI
        C_U,                        // 0x0B91                   //
        C_V,                        // 0x0B92                   // O
        C_V,                        // 0x0B93                   // OO
        C_V,                        // 0x0B94                   // AU
        C_C,                        // 0x0B95                   // KA
        C_U,                        // 0x0B96                   //
        C_U,                        // 0x0B97                   //
        C_U,                        // 0x0B98                   //
        C_C,                        // 0x0B99                   // NGA
        C_C,                        // 0x0B9A                   // CA
        C_U,                        // 0x0B9B                   //
        C_C,                        // 0x0B9C                   // JA
        C_U,                        // 0x0B9D                   //
        C_C,                        // 0x0B9E                   // NYA
        C_C,                        // 0x0B9F                   // TTA
        C_U,                        // 0x0BA0                   //
        C_U,                        // 0x0BA1                   //
        C_U,                        // 0x0BA2                   //
        C_C,                        // 0x0BA3                   // NNA
        C_C,                        // 0x0BA4                   // TA
        C_U,                        // 0x0BA5                   //
        C_U,                        // 0x0BA6                   //
        C_U,                        // 0x0BA7                   //
        C_C,                        // 0x0BA8                   // NA
        C_C,                        // 0x0BA9                   // NNNA
        C_C,                        // 0x0BAA                   // PA
        C_U,                        // 0x0BAB                   //
        C_U,                        // 0x0BAC                   //
        C_U,                        // 0x0BAD                   //
        C_C,                        // 0x0BAE                   // MA
        C_C,                        // 0x0BAF                   // YA
        C_C | C_R,                  // 0x0BB0                   // RA
        C_C | C_R,                  // 0x0BB1                   // RRA
        C_C,                        // 0x0BB2                   // LA
        C_C,                        // 0x0BB3                   // LLA
        C_C,                        // 0x0BB4                   // LLLA
        C_C,                        // 0x0BB5                   // VA
        C_C,                        // 0x0BB6                   // SHA
        C_C,                        // 0x0BB7                   // SSA
        C_C,                        // 0x0BB8                   // SA
        C_C,                        // 0x0BB9                   // HA
        C_U,                        // 0x0BBA                   //
        C_U,                        // 0x0BBB                   //
        C_U,                        // 0x0BBC                   //
        C_U,                        // 0x0BBD                   //
        C_M,                        // 0x0BBE                   // AA
        C_M,                        // 0x0BBF                   // I
        C_M,                        // 0x0BC0                   // II
        C_M,                        // 0x0BC1                   // U
        C_M,                        // 0x0BC2                   // UU
        C_U,                        // 0x0BC3                   //
        C_U,                        // 0x0BC4                   //
        C_U,                        // 0x0BC5                   //
        C_M | C_PRE,                // 0x0BC6                   // E
        C_M | C_PRE,                // 0x0BC7                   // EE
        C_M | C_PRE,                // 0x0BC8                   // AI
        C_U,                        // 0x0BC9                   //
        C_M | C_WRAP,               // 0x0BCA                   // O
        C_M | C_WRAP,               // 0x0BCB                   // OO
        C_M | C_WRAP,               // 0x0BCC                   // AU
        C_H,                        // 0x0BCD                   // VIRAMA (HALANT)
        C_U,                        // 0x0BCE                   //
        C_U,                        // 0x0BCF                   //
        C_S,                        // 0x0BD0                   // OM
        C_U,                        // 0x0BD1                   //
        C_U,                        // 0x0BD2                   //
        C_U,                        // 0x0BD3                   //
        C_U,                        // 0x0BD4                   //
        C_U,                        // 0x0BD5                   //
        C_U,                        // 0x0BD6                   //
        C_M,                        // 0x0BD7                   // AU LENGTH MARK
        C_U,                        // 0x0BD8                   //
        C_U,                        // 0x0BD9                   //
        C_U,                        // 0x0BDA                   //
        C_U,                        // 0x0BDB                   //
        C_U,                        // 0x0BDC                   //
        C_U,                        // 0x0BDD                   //
        C_U,                        // 0x0BDE                   //
        C_U,                        // 0x0BDF                   //
        C_U,                        // 0x0BE0                   //
        C_U,                        // 0x0BE1                   //
        C_U,                        // 0x0BE2                   //
        C_U,                        // 0x0BE3                   //
        C_U,                        // 0x0BE4                   //
        C_U,                        // 0x0BE5                   //
        C_D,                        // 0x0BE6                   // ZERO
        C_D,                        // 0x0BE7                   // ONE
        C_D,                        // 0x0BE8                   // TWO
        C_D,                        // 0x0BE9                   // THREE
        C_D,                        // 0x0BEA                   // FOUR
        C_D,                        // 0x0BEB                   // FIVE
        C_D,                        // 0x0BEC                   // SIX
        C_D,                        // 0x0BED                   // SEVEN
        C_D,                        // 0x0BEE                   // EIGHT
        C_D,                        // 0x0BEF                   // NINE
        C_S,                        // 0x0BF0                   // TEN
        C_S,                        // 0x0BF1                   // ONE HUNDRED
        C_S,                        // 0x0BF2                   // ONE THOUSAND
        C_S,                        // 0x0BF3                   // DAY SIGN (naal)
        C_S,                        // 0x0BF4                   // MONTH SIGN (maatham)
        C_S,                        // 0x0BF5                   // YEAR SIGN (varudam)
        C_S,                        // 0x0BF6                   // DEBIT SIGN (patru)
        C_S,                        // 0x0BF7                   // CREDIT SIGN (varavu)
        C_S,                        // 0x0BF8                   // AS ABOVE SIGN (merpadi)
        C_S,                        // 0x0BF9                   // RUPEE SIGN (rupai)
        C_S,                        // 0x0BFA                   // NUMBER SIGN (enn)
        C_U,                        // 0x0BFB                   //
        C_U,                        // 0x0BFC                   //
        C_U,                        // 0x0BFD                   //
        C_U,                        // 0x0BFE                   //
        C_U                         // 0x0BFF                   //
    };
    /**
     * Look up the character type of a code point.
     * @param c code point
     * @return the type code stored in {@code CCA} for <code>c</code> (flag bits masked
     * off), or {@code C_U} when <code>c</code> lies outside the mapped range
     */
    static int typeOf(int c) {
        if ((c < CCA_START) || (c >= CCA_END)) {
            return C_U;
        }
        return CCA[c - CCA_START] & C_M_TYPE;
    }
    /**
     * Test whether a code point has a given character type.
     * @param c code point
     * @param t type code to compare against
     * @return true if {@code typeOf(c)} equals <code>t</code>
     */
    static boolean isType(int c, int t) {
        final int actual = typeOf(c);
        return actual == t;
    }
    /**
     * Test whether the {@code CCA} entry of a code point has all bits of <code>f</code> set.
     * @param c code point
     * @param f flag bits to test for
     * @return true if <code>c</code> lies in the mapped range and its entry contains
     * every bit of <code>f</code>; false for code points outside the mapped range
     */
    static boolean hasFlag(int c, int f) {
        final boolean mapped = (c >= CCA_START) && (c < CCA_END);
        return mapped && ((CCA[c - CCA_START] & f) == f);
    }
    /**
     * Test whether a code point is a consonant.
     * @param c code point
     * @return true if the type of <code>c</code> is {@code C_C}
     */
    static boolean isC(int c) {
        return typeOf(c) == C_C;
    }
    /**
     * Test whether a code point is a consonant carrying the reph flag.
     * @param c code point
     * @return true if the type of <code>c</code> is {@code C_C} and its entry has {@code C_R} set
     */
    static boolean isR(int c) {
        return isC(c) && hasFlag(c, C_R);
    }
    /**
     * Test whether a code point is an independent vowel.
     * @param c code point
     * @return true if the type of <code>c</code> is {@code C_V}
     */
    static boolean isV(int c) {
        return typeOf(c) == C_V;
    }
    /**
     * Test whether a code point is the nukta sign recognized by this processor.
     * @param c code point
     * @return true if <code>c</code> is 0x093C
     */
    static boolean isN(int c) {
        // NOTE(review): 0x093C lies outside the range [CCA_START, CCA_END) mapped by this
        // class, and no CCA entry is marked as a nukta; presumably this value was carried
        // over from another script's processor -- confirm whether it is intended here.
        final int nukta = 0x093C;
        return c == nukta;
    }
    /**
     * Test whether a code point is the Tamil halant (virama).
     *
     * <p>The code point tested is 0x0BCD, the entry that the {@code CCA} table classifies
     * as {@code C_H}. A value outside the Tamil block can never match the virama that
     * actually occurs in Tamil text, which would stop the syllabizer from recognizing
     * dead consonants.</p>
     *
     * @param c code point
     * @return true if <code>c</code> is 0x0BCD (TAMIL SIGN VIRAMA)
     */
    static boolean isH(int c) {
        return c == 0x0BCD;
    }
    /**
     * Test whether a code point is a vowel sign (matra).
     * @param c code point
     * @return true if the type of <code>c</code> is {@code C_M}
     */
    static boolean isM(int c) {
        return typeOf(c) == C_M;
    }
    /**
     * Test whether a code point is a matra with a pre-base component. Because
     * {@code C_WRAP} includes the {@code C_PRE} bit, two-part (wrap) matras qualify too.
     * @param c code point
     * @return true if the type of <code>c</code> is {@code C_M} and its entry has {@code C_PRE} set
     */
    static boolean isPreM(int c) {
        return isM(c) && hasFlag(c, C_PRE);
    }
    /**
     * Test whether a code point is one of the marks that may trail a live consonant:
     * a matra (combining vowel), an accent mark, a tone mark, or an other (modifying) mark.
     * @param c code point
     * @return true if the type of <code>c</code> is {@code C_M}, {@code C_A}, {@code C_T} or {@code C_O}
     */
    static boolean isX(int c) {
        final int type = typeOf(c);
        return (type == C_M) || (type == C_A) || (type == C_T) || (type == C_O);
    }
    /**
     * Test whether the {@code CCA} entry of a code point has the reph flag set.
     * @param c code point
     * @return true if the entry for <code>c</code> has {@code C_R} set
     */
    static boolean hasR(int c) {
        return hasFlag(c, C_R);
    }
    /**
     * Test whether the {@code CCA} entry of a code point has the nukta flag set.
     * No entry of the table in this class sets {@code C_N}, so with the table as
     * written this returns false for every code point.
     * @param c code point
     * @return true if the entry for <code>c</code> has {@code C_N} set
     */
    static boolean hasN(int c) {
        return hasFlag(c, C_N);
    }

}