/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* $Id$ */ package org.apache.fop.complexscripts.scripts; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.fop.complexscripts.util.CharAssociation; import org.apache.fop.complexscripts.util.GlyphSequence; // CSOFF: LineLengthCheck /** * <p>The <code>GurmukhiScriptProcessor</code> class implements a script processor for * performing glyph substitution and positioning operations on content associated with the Gurmukhi script.</p> * * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p> */ public class GurmukhiScriptProcessor extends IndicScriptProcessor { /** logging instance */ private static final Log log = LogFactory.getLog(GurmukhiScriptProcessor.class); GurmukhiScriptProcessor(String script) { super(script); } @Override protected Class<? extends GurmukhiSyllabizer> getSyllabizerClass() { return GurmukhiSyllabizer.class; } @Override // find rightmost pre-base matra protected int findPreBaseMatra(GlyphSequence gs) { int ng = gs.getGlyphCount(); int lk = -1; for (int i = ng; i > 0; i--) { int k = i - 1; if (containsPreBaseMatra(gs, k)) { lk = k; break; } } return lk; } @Override // find leftmost pre-base matra target, starting from source protected int findPreBaseMatraTarget(GlyphSequence gs, int source) { int ng = gs.getGlyphCount(); int lk = -1; for (int i = (source < ng) ? source : ng; i > 0; i--) { int k = i - 1; if (containsConsonant(gs, k)) { if (containsHalfConsonant(gs, k)) { lk = k; } else if (lk == -1) { lk = k; } else { break; } } } return lk; } private static boolean containsPreBaseMatra(GlyphSequence gs, int k) { CharAssociation a = gs.getAssociation(k); int[] ca = gs.getCharacterArray(false); for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { if (isPreM(ca [ i ])) { return true; } } return false; } private static boolean containsConsonant(GlyphSequence gs, int k) { CharAssociation a = gs.getAssociation(k); int[] ca = gs.getCharacterArray(false); for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { if (isC(ca [ i ])) { return true; } } return false; } private static boolean containsHalfConsonant(GlyphSequence gs, int k) { Boolean half = (Boolean) gs.getAssociation(k) .getPredication("half"); return (half != null) ? half : false; } @Override protected int findReph(GlyphSequence gs) { int ng = gs.getGlyphCount(); int li = -1; for (int i = 0; i < ng; i++) { if (containsReph(gs, i)) { li = i; break; } } return li; } @Override protected int findRephTarget(GlyphSequence gs, int source) { int ng = gs.getGlyphCount(); int c1 = -1; int c2 = -1; // first candidate target is after first non-half consonant for (int i = 0; i < ng; i++) { if ((i != source) && containsConsonant(gs, i)) { if (!containsHalfConsonant(gs, i)) { c1 = i + 1; break; } } } // second candidate target is after last non-prebase matra after first candidate or before first syllable or vedic mark for (int i = (c1 >= 0) ? c1 : 0; i < ng; i++) { if (containsMatra(gs, i) && !containsPreBaseMatra(gs, i)) { c2 = i + 1; } else if (containsOtherMark(gs, i)) { c2 = i; break; } } if (c2 >= 0) { return c2; } else if (c1 >= 0) { return c1; } else { return source; } } private static boolean containsReph(GlyphSequence gs, int k) { Boolean rphf = (Boolean) gs.getAssociation(k) .getPredication("rphf"); return (rphf != null) ? rphf : false; } private static boolean containsMatra(GlyphSequence gs, int k) { CharAssociation a = gs.getAssociation(k); int[] ca = gs.getCharacterArray(false); for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { if (isM(ca [ i ])) { return true; } } return false; } private static boolean containsOtherMark(GlyphSequence gs, int k) { CharAssociation a = gs.getAssociation(k); int[] ca = gs.getCharacterArray(false); for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { switch (typeOf(ca [ i ])) { case C_T: // tone (e.g., udatta, anudatta) case C_A: // accent (e.g., acute, grave) case C_O: // other (e.g., candrabindu, anusvara, visarga, etc) return true; default: break; } } return false; } private static class GurmukhiSyllabizer extends DefaultSyllabizer { GurmukhiSyllabizer(String script, String language) { super(script, language); } @Override // | C ... protected int findStartOfSyllable(int[] ca, int s, int e) { if ((s < 0) || (s >= e)) { return -1; } else { while (s < e) { int c = ca [ s ]; if (isC(c)) { break; } else { s++; } } return s; } } @Override // D* L? | ... protected int findEndOfSyllable(int[] ca, int s, int e) { if ((s < 0) || (s >= e)) { return -1; } else { int nd = 0; int nl = 0; int i; // consume dead consonants while ((i = isDeadConsonant(ca, s, e)) > s) { s = i; nd++; } // consume zero or one live consonant if ((i = isLiveConsonant(ca, s, e)) > s) { s = i; nl++; } return ((nd > 0) || (nl > 0)) ? s : -1; } } // D := ( C N? H )? private int isDeadConsonant(int[] ca, int s, int e) { if (s < 0) { return -1; } else { int c; int i = 0; int nc = 0; int nh = 0; do { // C if ((s + i) < e) { c = ca [ s + i ]; if (isC(c)) { i++; nc++; } else { break; } } // N? if ((s + i) < e) { c = ca [ s + 1 ]; if (isN(c)) { i++; } } // H if ((s + i) < e) { c = ca [ s + i ]; if (isH(c)) { i++; nh++; } else { break; } } } while (false); return (nc > 0) && (nh > 0) ? s + i : -1; } } // L := ( (C|V) N? X* )?; where X = ( MATRA | ACCENT MARK | TONE MARK | OTHER MARK ) private int isLiveConsonant(int[] ca, int s, int e) { if (s < 0) { return -1; } else { int c; int i = 0; int nc = 0; int nv = 0; int nx = 0; do { // C if ((s + i) < e) { c = ca [ s + i ]; if (isC(c)) { i++; nc++; } else if (isV(c)) { i++; nv++; } else { break; } } // N? if ((s + i) < e) { c = ca [ s + i ]; if (isN(c)) { i++; } } // X* while ((s + i) < e) { c = ca [ s + i ]; if (isX(c)) { i++; nx++; } else { break; } } } while (false); // if no X but has H, then ignore C|I if (nx == 0) { if ((s + i) < e) { c = ca [ s + i ]; if (isH(c)) { if (nc > 0) { nc--; } else if (nv > 0) { nv--; } } } } return ((nc > 0) || (nv > 0)) ? s + i : -1; } } } // gurmukhi character types static final short C_U = 0; // unassigned static final short C_C = 1; // consonant static final short C_V = 2; // vowel static final short C_M = 3; // vowel sign (matra) static final short C_S = 4; // symbol or sign static final short C_T = 5; // tone mark static final short C_A = 6; // accent mark static final short C_P = 7; // punctuation static final short C_D = 8; // digit static final short C_H = 9; // halant (virama) static final short C_O = 10; // other signs static final short C_N = 0x0100; // nukta(ized) static final short C_R = 0x0200; // reph(ized) static final short C_PRE = 0x0400; // pre-base static final short C_M_TYPE = 0x00FF; // type mask static final short C_M_FLAGS = 0x7F00; // flag mask // gurmukhi block range static final int CCA_START = 0x0A00; // first code point mapped by cca static final int CCA_END = 0x0A80; // last code point + 1 mapped by cca // gurmukhi character type lookups static final short[] CCA = { C_U, // 0x0A00 // UNASSIGNED C_O, // 0x0A01 // ADAK BINDI C_O, // 0x0A02 // BINDI C_O, // 0x0A03 // VISARGA C_U, // 0x0A04 // UNASSIGNED C_V, // 0x0A05 // A C_V, // 0x0A06 // AA C_V, // 0x0A07 // I C_V, // 0x0A08 // II C_V, // 0x0A09 // U C_V, // 0x0A0A // UU C_U, // 0x0A0B // UNASSIGNED C_U, // 0x0A0C // UNASSIGNED C_U, // 0x0A0D // UNASSIGNED C_U, // 0x0A0E // UNASSIGNED C_V, // 0x0A0F // E C_V, // 0x0A10 // AI C_U, // 0x0A11 // UNASSIGNED C_U, // 0x0A12 // UNASSIGNED C_V, // 0x0A13 // O C_V, // 0x0A14 // AU C_C, // 0x0A15 // KA C_C, // 0x0A16 // KHA C_C, // 0x0A17 // GA C_C, // 0x0A18 // GHA C_C, // 0x0A19 // NGA C_C, // 0x0A1A // CA C_C, // 0x0A1B // CHA C_C, // 0x0A1C // JA C_C, // 0x0A1D // JHA C_C, // 0x0A1E // NYA C_C, // 0x0A1F // TTA C_C, // 0x0A20 // TTHA C_C, // 0x0A21 // DDA C_C, // 0x0A22 // DDHA C_C, // 0x0A23 // NNA C_C, // 0x0A24 // TA C_C, // 0x0A25 // THA C_C, // 0x0A26 // DA C_C, // 0x0A27 // DHA C_C, // 0x0A28 // NA C_U, // 0x0A29 // UNASSIGNED C_C, // 0x0A2A // PA C_C, // 0x0A2B // PHA C_C, // 0x0A2C // BA C_C, // 0x0A2D // BHA C_C, // 0x0A2E // MA C_C, // 0x0A2F // YA C_C | C_R, // 0x0A30 // RA C_U, // 0x0A31 // UNASSIGNED C_C, // 0x0A32 // LA C_C, // 0x0A33 // LLA C_U, // 0x0A34 // UNASSIGNED C_C, // 0x0A35 // VA C_C, // 0x0A36 // SHA C_U, // 0x0A37 // UNASSIGNED C_C, // 0x0A38 // SA C_C, // 0x0A39 // HA C_U, // 0x0A3A // UNASSIGNED C_U, // 0x0A3B // UNASSIGNED C_N, // 0x0A3C // NUKTA C_U, // 0x0A3D // UNASSIGNED C_M, // 0x0A3E // AA C_M | C_PRE, // 0x0A3F // I C_M, // 0x0A40 // II C_M, // 0x0A41 // U C_M, // 0x0A42 // UU C_U, // 0x0A43 // UNASSIGNED C_U, // 0x0A44 // UNASSIGNED C_U, // 0x0A45 // UNASSIGNED C_U, // 0x0A46 // UNASSIGNED C_M, // 0x0A47 // EE C_M, // 0x0A48 // AI C_U, // 0x0A49 // UNASSIGNED C_U, // 0x0A4A // UNASSIGNED C_M, // 0x0A4B // OO C_M, // 0x0A4C // AU C_H, // 0x0A4D // VIRAMA (HALANT) C_U, // 0x0A4E // UNASSIGNED C_U, // 0x0A4F // UNASSIGNED C_U, // 0x0A50 // UNASSIGNED C_T, // 0x0A51 // UDATTA C_U, // 0x0A52 // UNASSIGNED C_U, // 0x0A53 // UNASSIGNED C_U, // 0x0A54 // UNASSIGNED C_U, // 0x0A55 // UNASSIGNED C_U, // 0x0A56 // UNASSIGNED C_U, // 0x0A57 // UNASSIGNED C_U, // 0x0A58 // UNASSIGNED C_C | C_N, // 0x0A59 // KHHA C_C | C_N, // 0x0A5A // GHHA C_C | C_N, // 0x0A5B // ZA C_C | C_N, // 0x0A5C // RRA C_U, // 0x0A5D // UNASSIGNED C_C | C_N, // 0x0A5E // FA C_U, // 0x0A5F // UNASSIGNED C_U, // 0x0A60 // UNASSIGNED C_U, // 0x0A61 // UNASSIGNED C_U, // 0x0A62 // UNASSIGNED C_U, // 0x0A63 // UNASSIGNED C_U, // 0x0A64 // UNASSIGNED C_U, // 0x0A65 // UNASSIGNED C_D, // 0x0A66 // ZERO C_D, // 0x0A67 // ONE C_D, // 0x0A68 // TWO C_D, // 0x0A69 // THREE C_D, // 0x0A6A // FOUR C_D, // 0x0A6B // FIVE C_D, // 0x0A6C // SIX C_D, // 0x0A6D // SEVEN C_D, // 0x0A6E // EIGHT C_D, // 0x0A6F // NINE C_O, // 0x0A70 // TIPPI C_O, // 0x0A71 // ADDAK C_V, // 0x0A72 // IRI C_V, // 0x0A73 // URA C_S, // 0x0A74 // EK ONKAR C_O, // 0x0A75 // YAKASH C_U, // 0x0A76 // UNASSIGNED C_U, // 0x0A77 // UNASSIGNED C_U, // 0x0A78 // UNASSIGNED C_U, // 0x0A79 // UNASSIGNED C_U, // 0x0A7A // UNASSIGNED C_U, // 0x0A7B // UNASSIGNED C_U, // 0x0A7C // UNASSIGNED C_U, // 0x0A7D // UNASSIGNED C_U, // 0x0A7E // UNASSIGNED C_U // 0x0A7F // UNASSIGNED }; static int typeOf(int c) { if ((c >= CCA_START) && (c < CCA_END)) { return CCA [ c - CCA_START ] & C_M_TYPE; } else { return C_U; } } static boolean isType(int c, int t) { return typeOf(c) == t; } static boolean hasFlag(int c, int f) { if ((c >= CCA_START) && (c < CCA_END)) { return (CCA [ c - CCA_START ] & f) == f; } else { return false; } } static boolean isC(int c) { return isType(c, C_C); } static boolean isR(int c) { return isType(c, C_C) && hasR(c); } static boolean isV(int c) { return isType(c, C_V); } static boolean isN(int c) { return c == 0x0A3C; } static boolean isH(int c) { return c == 0x0A4D; } static boolean isM(int c) { return isType(c, C_M); } static boolean isPreM(int c) { return isType(c, C_M) && hasFlag(c, C_PRE); } static boolean isX(int c) { switch (typeOf(c)) { case C_M: // matra (combining vowel) case C_A: // accent mark case C_T: // tone mark case C_O: // other (modifying) mark return true; default: return false; } } static boolean hasR(int c) { return hasFlag(c, C_R); } static boolean hasN(int c) { return hasFlag(c, C_N); } }