package com.tyndalehouse.step.core.utils.language; import java.util.ArrayList; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.tyndalehouse.step.core.utils.language.hebrew.ConsonantType; import com.tyndalehouse.step.core.utils.language.hebrew.HebrewLetter; import com.tyndalehouse.step.core.utils.language.hebrew.HebrewLetterType; import com.tyndalehouse.step.core.utils.language.hebrew.SoundingType; import com.tyndalehouse.step.core.utils.language.hebrew.VowelLengthType; import com.tyndalehouse.step.core.utils.language.hebrew.VowelStressType; import com.tyndalehouse.step.core.utils.language.transliteration.StringToStringRule; import com.tyndalehouse.step.core.utils.language.transliteration.TransliterationRule; /** * Utilities for doing Hebrew transliteration * * @author chrisburrell */ public final class HebrewUtils { private static final Logger LOGGER = LoggerFactory.getLogger(HebrewUtils.class); public static final char HYPHEN = '.'; public static final char MAQAF_HYPHEN = '-'; private static transient List<TransliterationRule> transliterationRules; private static final char CLOSED_QUOTE = '\u2019'; private static final char OPEN_QUOTE = '\u2018'; private static final char K_WITH_LINE = '\u1e35'; private static final char T_WITH_DOT = '\u1e6d'; private static final char H_WITH_DOT = '\u1e25'; private static final char B_WITH_LINE = '\u1E07'; private static final char QAMATS_QATAN = 0x5C7; private static final char SHEVA = 0x05B0; private static final char HATAF_SEGOL = 0x5B1; private static final char HATAF_PATAH = 0x5B2; private static final char HATAF_QAMATS = 0x5B3; private static final char HIRIQ = 0x5B4; private static final char TSERE = 0x5B5; private static final char SEGOL = 0x5B6; private static final char PATAH = 0x5B7; private static final char QAMATS = 0x5B8; private static final char HOLAM = 0x5B9; private static final char HOLAM_HASER = 0x5BA; private static final char QUBUTS = 0x5BB; private static final char DAGESH = 0x5BC; private static final char METEG = 0x05BD; private static final char SHIN_DOT = 0x05C1; private static final int ETNAHTA = 0x0591; private static final char GERESH = 0x059C; private static final char GERESH_MUQDAM = 0x059D; private static final int ZINOR = 0x05AE; private static final int DAGESH_GAP = 0xFB44 - 0x05e3; private static final int ALEPH = 0x05D0; private static final char ALEPH_LAMED = 0xFB4F; private static final char BET = 0x5D1; private static final char GIMEL = 0x5D2; private static final char DALET = 0x5D3; private static final int HE = 0x5D4; private static final char VAV = 0x5D5; private static final int ZAYIN = 0x5D6; private static final char HET = 0x5D7; private static final int TET = 0x5D8; private static final char YOD = 0x5D9; private static final char FINAL_KAF = 0x5DA; private static final char KAF = 0x5DB; private static final char LAMED = 0x5DC; private static final int FINAL_MEM = 0x5DD; private static final int MEM = 0x5DE; private static final int FINAL_NUN = 0x5DF; private static final int NUN = 0x5E0; private static final int SAMEKH = 0x5E1; private static final int AYIN = 0x5E2; private static final int FINAL_PE = 0x5E3; private static final char PE = 0x5E4; private static final int FINAL_TSADI = 0x5E5; private static final int TSADI = 0x5E6; private static final char QOF = 0x5E7; private static final char RESH = 0x5E8; private static final int SIN = 0x5E9; private static final char TAV = 0x5EA; private static final char MAQAF = 0x05BE; private static final char HEBREW_COMBINED_RANGE_START = 0xFB1D; /** * prevent instantiation */ private HebrewUtils() { // do nothing } /** * @param rawForm the raw form of the word * @return true if it is hebrew text */ public static boolean isHebrewText(final String rawForm) { final int firstCharacter = rawForm.charAt(0); return isHebrewCharacter(firstCharacter); } /** * @param firstCharacter the character that we are testing * @return true to indicate we are dealing with the Hebrew set of characters */ public static boolean isHebrewCharacter(final int firstCharacter) { return (firstCharacter > 0x590 && firstCharacter < 0x600) || (firstCharacter > 0xFB10 && firstCharacter < 0xFB50); } /** * @param word text with pointing * @return text without pointing */ public static String unPoint(final String word) { return unPoint(word, true); } /** * @param word text with pointing * @param unpointVowels true to indicate we also want to exclude vowels * @return text without pointing */ public static String unPoint(final String word, boolean unpointVowels) { char endChar = unpointVowels ? ALEPH : SHEVA; final StringBuilder sb = new StringBuilder(word); int i = 0; while (i < sb.length()) { final char currentChar = sb.charAt(i); //ignore characters outside of the Hebrew character set if(currentChar < ETNAHTA || currentChar > ALEPH_LAMED) { i++; } else if (currentChar < endChar) { sb.deleteCharAt(i); } else if (currentChar >= HEBREW_COMBINED_RANGE_START && currentChar < ALEPH_LAMED) { sb.setCharAt(i, (char) (currentChar - DAGESH_GAP)); i++; } else { i++; } } return sb.toString(); } /** * Cleans up a String so that it can be indexed properly * * @param stepTransliteration the transliteration to be cleaned up * @return the new transliteration */ public static String removeHebrewTranslitMarkUpForIndexing(final String stepTransliteration) { final StringBuilder sb = new StringBuilder(stepTransliteration); // also remove double letters... char lastLetter = 0x0; for (int ii = 0; ii < sb.length(); ) { final char currentLetter = sb.charAt(ii); switch (currentLetter) { case '.': case '-': case '\'': case '*': case CLOSED_QUOTE: case OPEN_QUOTE: sb.deleteCharAt(ii); continue; case K_WITH_LINE: sb.setCharAt(ii, 'k'); break; case T_WITH_DOT: sb.setCharAt(ii, 't'); break; case H_WITH_DOT: sb.setCharAt(ii, 'h'); break; case B_WITH_LINE: sb.setCharAt(ii, 'b'); break; case 'é': sb.setCharAt(ii, 'e'); break; default: break; } if (currentLetter == lastLetter) { sb.deleteCharAt(ii); continue; } lastLetter = currentLetter; ii++; } return sb.toString(); } /** * @return gives the hebrew list of transliteration rules */ public static List<TransliterationRule> getTransliterationRules() { ensureTransliterationRules(); return transliterationRules; } /** * creates the transliteration rules lazily, on first time */ private static void ensureTransliterationRules() { if (transliterationRules != null) { return; } createTransliterationRules(); } /** * creates the rules, this is synchronized so that no-two threads are creating it at any point of time */ private static synchronized void createTransliterationRules() { // check again if it has been initialized, as we may be coming second if (transliterationRules == null) { final List<TransliterationRule> rules = new ArrayList<TransliterationRule>(); rules.add(new StringToStringRule("b", new String[]{"v"})); rules.add(new StringToStringRule("v", new String[]{"b", "w"})); rules.add(new StringToStringRule("w", new String[]{"v" })); rules.add(new StringToStringRule("h", new String[]{"ch", "chch"})); rules.add(new StringToStringRule("ch", new String[]{"h", "chch"})); rules.add(new StringToStringRule("j", new String[]{"y"})); rules.add(new StringToStringRule("x", new String[]{"h", "ch", "chch" })); rules.add(new StringToStringRule("+", new String[]{"t"})); rules.add(new StringToStringRule("$", new String[]{"s"})); rules.add(new StringToStringRule("s", new String[]{"sh", "ts", "shsh", "tsts"})); rules.add(new StringToStringRule("sh", new String[]{"shsh"})); rules.add(new StringToStringRule("gh", new String[]{"g"})); rules.add(new StringToStringRule("kh", new String[]{"k"})); rules.add(new StringToStringRule("k", new String[]{"kh" })); rules.add(new StringToStringRule("dh", new String[]{"d"})); rules.add(new StringToStringRule("th", new String[]{"t"})); rules.add(new StringToStringRule("ph", new String[]{"p" })); rules.add(new StringToStringRule("p", new String[]{"ph"})); rules.add(new StringToStringRule("tz", new String[]{"ts", "tsts" })); rules.add(new StringToStringRule("y", new String[]{""})); rules.add(new StringToStringRule("a", new String[]{""})); rules.add(new StringToStringRule("e", new String[]{""})); rules.add(new StringToStringRule("é", new String[]{"e"})); transliterationRules = rules; } } /** * vowel-based hebrew transliteration * * @param inputString the input string * @return the transliteration */ public static String transliterateHebrew(final String inputString) { final HebrewLetter[] letters = new HebrewLetter[inputString.length()]; final char[] input = inputString.toCharArray(); try { // iterate through looking for Yods for (int ii = 0; ii < input.length; ii++) { if (isHebrewConsonant(input[ii])) { processHebrewConsonant(letters, input, ii); processForteDagesh(input, ii, letters); } else if (isHebrewVowel(input[ii])) { processHebrewVowel(letters, input, ii); } else if (input[ii] != DAGESH) { processNonDagesh(letters, input, ii); } else { // dagesh letters[ii] = new HebrewLetter(input[ii]); } } boolean stressedWord = firstPass(letters, input); secondPass(letters, input); thirdPass(letters, input); String transliteration = transliterate(input, letters, stressedWord); if (LOGGER.isTraceEnabled()) { outputAnalysis(letters, inputString, transliteration); } return transliteration; // CHECKSTYLE:OFF } catch (final RuntimeException ex) { // output the error analysis LOGGER.error("=================================================================="); // output the letters first for (int ii = 0; ii < input.length; ii++) { final HebrewLetter hl = letters[ii]; LOGGER.error("[{}]: c:[{}]", ii, input[ii]); if (hl != null) { LOGGER.error( "char=[0x{}]\tletter=[{}]\tconsonant=[{}]\tvLength[{}]\tvStress[{}]\tsounding[{}]", Integer.toString(hl.getC(), 16), hl.getHebrewLetterType(), hl.getConsonantType(), hl.getVowelLengthType(), hl.getVowelStressType(), hl.getSoundingType()); } } LOGGER.error("Error occured during Hebrew transliteration. Analysis is above", ex); throw ex; } // CHECKSTYLE:ON } /** * Marks Alephs & Ayins as silent * * @param letters our current set of letters * @param input the input set of letters */ private static void thirdPass(final HebrewLetter[] letters, final char[] input) { for (int ii = 0; ii < letters.length; ii++) { if (input[ii] == AYIN || input[ii] == ALEPH) { //look for vowels until next consonant for (int jj = ii + 1; untilEndOfWord(letters, jj); jj++) { if (letters[jj].isConsonant()) { letters[ii].setSoundingType(SoundingType.SILENT); break; } else if (letters[jj].isVowel()) { letters[ii].setSoundingType(SoundingType.SOUNDING); break; } } } } } /** * Processes a hebrew vowel * * @param letters the input, strongly typed * @param input the input as a string * @param position our current position */ private static void processHebrewVowel(final HebrewLetter[] letters, final char[] input, final int position) { // a vowel or other pointing final HebrewLetter letter = new HebrewLetter(input[position]); letter.setHebrewLetterType(HebrewLetterType.VOWEL); if (isAny(input[position], SHEVA, HATAF_SEGOL, HATAF_PATAH, HATAF_QAMATS)) { letter.setVowelLengthType(VowelLengthType.VERY_SHORT); } else if (isAny(input[position], TSERE, QAMATS, HOLAM_HASER, HOLAM)) { letter.setVowelLengthType(VowelLengthType.LONG); } else if (input[position] == HIRIQ && hasAnyPointing(input, position, true, METEG) || hasAnyPointing(input, position, false, METEG)) { letter.setVowelLengthType(VowelLengthType.LONG); } else { letter.setVowelLengthType(VowelLengthType.SHORT); } letters[position] = letter; } /** * Pre-parse processing of a Non-Dagesh character. This gets called if input[ii] character is neither a * vowel, nor a consonant, nor a dagesh form * * @param letters the set of letters found so far. 0->ii-1 have already been processed. ii-end are yet * to be processed * @param input the input, ii indicating how far through we are * @param ii ii the index of how far through we are */ private static void processNonDagesh(final HebrewLetter[] letters, final char[] input, final int ii) { if(input[ii] >= ETNAHTA && input[ii] <= ZINOR || input[ii] == METEG) { // accents final HebrewLetter letter = new HebrewLetter(input[ii]); letters[ii] = letter; letter.setHebrewLetterType(HebrewLetterType.ACCENT); } else { letters[ii] = new HebrewLetter(input[ii]); } } /** * Pre-parse processing of a Hebrew consonant, processes the input[ii] character * * @param letters the set of letters found so far. 0->ii-1 have already been processed. ii-end are yet * to be processed * @param input the input, ii indicating how far through we are * @param ii ii the index of how far through we are */ private static void processHebrewConsonant(final HebrewLetter[] letters, final char[] input, final int ii) { // CHECKSTYLE:OFF if (ii >= 2 && processYod(input, letters, ii)) { // do nothing } else if (processVav(input, letters, ii)) { // do nothing } else { final HebrewLetter letter = new HebrewLetter(input[ii]); letter.setHebrewLetterType(HebrewLetterType.CONSONANT); letters[ii] = letter; if (input[ii] == SIN && hasAnyPointing(input, ii, true, SHIN_DOT)) { letter.setIsShin(true); } } // CHECKSTYLE:ON } /** * Outputs the analysis at trace level * * @param letters the list of letters * @param inputString the string to be transliterated * @param transliteration the transliteration of these letters */ private static void outputAnalysis(final HebrewLetter[] letters, final String inputString, final String transliteration) { LOGGER.trace("**********************************"); LOGGER.trace("ANALYSIS FOR: [{}] => [{}]", inputString, transliteration); for (final HebrewLetter hl : letters) { LOGGER.trace( "char=[{}],xchar=[0x{}]\tletter=[{}]\tconsonant=[{}]\tvLength[{}]\tvStress[{}]\tsounding[{}]", hl.getC(), Integer.toString(hl.getC(), 16), hl.getHebrewLetterType(), hl.getConsonantType(), hl.getVowelLengthType(), hl.getVowelStressType(), hl.getSoundingType()); } LOGGER.trace("**********************************"); } /** * Stresses vowels and corrects vavs with dagesh iterates through all vowels and stresses them * * @param letters the set of strongly typed letters * @param input the actual characters * @return word has a stress */ private static boolean firstPass(final HebrewLetter[] letters, final char[] input) { boolean hasStress = false; for (int ii = 0; ii < letters.length; ii++) { //ignore the SHIN DOT if (letters[ii].getC() == SHIN_DOT) { continue; } if (HebrewLetterType.ACCENT.equals(letters[ii].getHebrewLetterType())) { if (isNotGeresh(input, ii) || previousConsonant(letters, ii) != 0) { final HebrewLetter letter = getCloseVowel(letters, ii); letter.setVowelStressType(VowelStressType.STRESSED); hasStress = true; } } else if (letters[ii].getC() == VAV && hasCloseDagesh(input, ii)) { letters[ii].setVowelLengthType(VowelLengthType.SHORT); } } return hasStress; } /** * @param input our input * @param position the current position * @return true if the letter is not a GERESH MUQDAM and not a GERESH character */ private static boolean isNotGeresh(final char[] input, final int position) { return input[position] != GERESH_MUQDAM && input[position] != GERESH; } /** * Looking for the first previous consonant * * @param letters the set of strongly typed letters * @param currentPosition the actual position in the cahracter * @return position of previous consonant */ private static int previousConsonant(final HebrewLetter[] letters, final int currentPosition) { for (int ii = currentPosition - 1; ii >= 0; ii--) { if (letters[ii].isConsonant()) { return ii; } } return -1; } /** * Marking silent shevas * * @param letters the set of strongly typed letters * @param input the actual characters */ private static void secondPass(final HebrewLetter[] letters, final char[] input) { int previousConsonantPosition = -1; int currentConsonantPosition = -1; for (int ii = 0; ii < letters.length; ii++) { if (letters[ii].isConsonant()) { previousConsonantPosition = currentConsonantPosition; currentConsonantPosition = ii; } else if (letters[ii].getC() == SHEVA) { if (!isLastHebrewConsonantInWordWithoutVowel(letters, ii) && (isStartOfWord(letters, currentConsonantPosition) || !letters[currentConsonantPosition].hasNoDagesh() || isAfterLongUnstressedVowel(letters, currentConsonantPosition) || hasAnyPointing(input, previousConsonantPosition, true, SHEVA)) && !(isAfterShortUnstressedVowel(letters, currentConsonantPosition) && letters[ii].hasNoDagesh())) { letters[ii].setSoundingType(SoundingType.SOUNDING); } else { letters[ii].setSoundingType(SoundingType.SILENT); } } } } /** * Transliterates letters one by one * * * * @param letters the list of letters * @param stressedWord true to indicate the word has at least one stress @return the transliterated string */ private static String transliterate(char[] input, final HebrewLetter[] letters, final boolean stressedWord) { final StringBuilder output = new StringBuilder(letters.length + 16); for (int ii = 0; ii < letters.length; ii++) { transliterate(letters, ii, output, stressedWord); } doEndings(input, output); return output.toString(); } /** * Transliterates the given input. This method creates the actual transliteration. * * @param letter the array of letters identified so far, after the parsing has occurred. * @param output current output * @param current the current position in 'letter' to be processed * @param hasStress true to indicate a word has a stress */ public static void transliterate(final HebrewLetter[] letter, final int current, final StringBuilder output, boolean hasStress) { final HebrewLetter currentLetter = letter[current]; final char c = currentLetter.getC(); // if (currentLetter.isStressed()) { // output.append('*'); // } // hyphenating vowels hyphenateSyllables(letter, current, output, hasStress); final int sizeBeforeAppending = output.length(); mapHebrewLetterToTransliteratedLetter(current, output, currentLetter, c); doubleHyphenateIfApplicable(output, letter, current, sizeBeforeAppending); } /** * Given the current input, examines the letter and outputs the relevant character to the StringBuilder * output. * * @param current the current position * @param output the rendered output * @param currentLetter the current letter * @param c the char value that we are examining */ // CHECKSTYLE:OFF private static void mapHebrewLetterToTransliteratedLetter(final int current, final StringBuilder output, final HebrewLetter currentLetter, final char c) { switch (c) { // consonants case ALEPH: // output.append(CLOSED_QUOTE); break; case BET: if (currentLetter.hasNoDagesh() && current != 0) { output.append('v'); } else { output.append('b'); } break; case GIMEL: output.append('g'); break; case DALET: output.append('d'); break; case HE: output.append('h'); break; case VAV: if (currentLetter.isVowel()) { if (currentLetter.isShureq()) { output.append('u'); } } else { output.append('v'); } break; case ZAYIN: output.append('z'); break; case HET: output.append("ch"); break; case TET: output.append('t'); break; case YOD: if (currentLetter.isConsonant()) { output.append('y'); } break; case FINAL_KAF: case KAF: if (currentLetter.hasNoDagesh() && current != 0) { output.append("kh"); } else { output.append('k'); } break; case LAMED: output.append('l'); break; case FINAL_MEM: case MEM: output.append('m'); break; case FINAL_NUN: case NUN: output.append('n'); break; case SAMEKH: output.append('s'); break; case AYIN: // output.append(OPEN_QUOTE); break; case FINAL_PE: case PE: if (currentLetter.hasNoDagesh() && current != 0) { output.append('p'); output.append('h'); } else { output.append('p'); } break; case FINAL_TSADI: case TSADI: output.append('t'); output.append('s'); break; case QOF: output.append('q'); break; case RESH: output.append('r'); break; case SIN: output.append('s'); if (currentLetter.isShin()) { output.append('h'); } break; case TAV: output.append('t'); break; // vowels case SHEVA: if (!currentLetter.isSilent()) { output.append('e'); } break; case HATAF_SEGOL: output.append('e'); break; case HATAF_PATAH: output.append('a'); break; case HATAF_QAMATS: output.append('o'); break; case HIRIQ: output.append('i'); break; case TSERE: output.append('e'); break; case SEGOL: output.append('e'); break; case PATAH: output.append('a'); break; case QAMATS: output.append('a'); break; case HOLAM: final int length = output.length(); if (length > 0 && output.charAt(length - 1) == 'w') { output.insert(length - 1, 'o'); } else { output.append('o'); } break; case HOLAM_HASER: output.append('o'); break; case QUBUTS: output.append('u'); break; case QAMATS_QATAN: output.append('o'); break; default: break; } } // CHECKSTYLE:ON /** * Adds the hyphens in for doubled letters * * @param output the output rendered so far * @param letters the letters under examination * @param currentPosition position * @param sizeBeforeAppending the size of the output, prior to process the currentLetter. */ private static void doubleHyphenateIfApplicable(final StringBuilder output, HebrewLetter[] letters, int currentPosition, final int sizeBeforeAppending) { final HebrewLetter currentLetter = letters[currentPosition]; // doubling and hyphenating if (currentLetter.isDoubled() && !isStartOfWord(letters, currentPosition)) { output.append(HYPHEN); // copy to the end, and discount the already added - final int endOfDoubleLetter = output.length() - 1; for (int ii = sizeBeforeAppending; ii < endOfDoubleLetter; ii++) { output.append(output.charAt(ii)); } } } /** * Marks the syllables * * @param letters set of letters * @param current the current position * @param output the current output * @param hasStress true to indicate a word has a stress */ private static void hyphenateSyllables(final HebrewLetter[] letters, final int current, final StringBuilder output, boolean hasStress) { if (letters[current].getC() == MAQAF) { output.append(MAQAF_HYPHEN); return; } if (letters[current].getC() == ' ') { output.append(' '); return; } //if previous was a maqaf, then we're not going to hyphenate if (current - 1 >= 0 && (letters[current - 1].getC() == MAQAF || letters[current - 1].getC() == ' ')) { return; } if (isStartOfWord(letters, current) || !letters[current].isConsonant() || isLastHebrewConsonantInWordWithoutVowel(letters, current)) { return; } //if the previous output was a syllable marker, then we're not going to do anything if (output.length() > 0 && ( output.charAt(output.length() - 1) == HYPHEN || output.charAt(output.length() - 1) == MAQAF_HYPHEN || output.charAt(output.length() - 1) == ' ') ) { //then don't output return; } // look for vowels boolean foundLongVowel = false; boolean foundStressedVowel = false; for (int ii = current - 1; ii > 0 && !letters[ii].isConsonant(); ii--) { if (letters[ii].isVowel()) { if (letters[ii].getC() == HATAF_PATAH || letters[ii].getC() == HATAF_QAMATS || letters[ii].getC() == HATAF_SEGOL || letters[ii].getC() == SHEVA) { output.append(HYPHEN); return; } if (letters[ii].isLong()) { foundLongVowel = true; } if (letters[ii].isStressed()) { foundStressedVowel = true; } if (hasStress && foundLongVowel && !foundStressedVowel) { output.append(HYPHEN); return; } } } if (letters[current].isDoubled()) { return; } for (int ii = current + 1; untilEndOfWord(letters, ii) && !letters[ii].isConsonant(); ii++) { if (letters[ii].getC() == SHEVA && letters[ii].isSilent()) { return; } } if (letters[current].isConsonant() && letters[current].isSilent()) { return; } output.append(HYPHEN); } /** * Checks whether the letter is the last Hebrew letter in a word, doesn't check past a MAQAF * * @param letters the set of letters * @param position our current position * @return true if it last without vowel, false if it's last with vowel OR not last consonant */ private static boolean isLastHebrewConsonantInWordWithoutVowel(final HebrewLetter[] letters, final int position) { final boolean isLastHebrewConsonant = isLastHebrewConsonantInWord(letters, position); if (isLastHebrewConsonant) { for (int ii = position + 1; untilEndOfWord(letters, ii); ii++) { if (letters[ii].isVowel() && !letters[ii].isSilent()) { return false; } } return true; } return false; } /** * Looks until the end of the word, and stops early if it hits a MAQAF (hebrew hyphen). * * @param letters the hebrew letters * @param ii the current position in the (usually) loop * @return true if we should continue */ private static boolean untilEndOfWord(final HebrewLetter[] letters, final int ii) { return isNotMaqafOrSpacing(letters, ii); } /** * @param letters out hebrew letters * @param ii current position * @return true if ii is 0 or precedecing character is spacing/maqaf etc. */ private static boolean isStartOfWord(final HebrewLetter[] letters, final int ii) { return ii == 0 || !isNotMaqafOrSpacing(letters, ii - 1); } /** * @param letters the current set of letters * @param ii our current position * @return true if it is not a maqaf or spacing */ private static boolean isNotMaqafOrSpacing(final HebrewLetter[] letters, final int ii) { return ii < letters.length && letters[ii].getC() != MAQAF && letters[ii].getC() != ' '; } /** * Checks whether it is the last hebrew consonant up to a MAQAF * * @param letters the set of letters * @param position our current position * @return true if no other consonants are found after the position */ private static boolean isLastHebrewConsonantInWord(final HebrewLetter[] letters, final int position) { boolean vowelReturnsConsonant = false; for (int ii = position + 1; untilEndOfWord(letters, ii); ii++) { if (vowelReturnsConsonant && letters[ii].isVowel()) { return false; } if (letters[ii].isConsonant()) { //check if it is an Aleph or a AYIN - if so we continue looking unless we hit a vowel if (letters[ii].getC() != AYIN && letters[ii].getC() != ALEPH) { //no aleph/ayin, so definitely not the last consonant return false; } //if we encounter a vowel, then we're going to return false vowelReturnsConsonant = true; } } return true; } /** * Swaps letters round if they finish in a particular order: * <p/> * <pre> * ha => ah, * cha = ach, * (a => a(, * </pre> * * @param letters * @param output the output which may need letters swapped */ private static void doEndings(final char[] letters, final StringBuilder output) { //find last consonant int lastConsonant = getLastConsonantPosition(letters); if(lastConsonant == -1 || (letters[lastConsonant] != HET && letters[lastConsonant] != HE)) { return; } //we've got a he or a het if(!hasAnyPointing(letters, lastConsonant, true, PATAH)) { return; } // check last character if a final int last = output.length() - 1; final int secondLast = last - 1; if (secondLast < 0) { return; } // ends with a if (output.charAt(last) == 'a') { // ends with ha final char secondChar = output.charAt(secondLast); if (output.charAt(secondLast) == 'h' || secondChar == H_WITH_DOT) { if (secondLast > 0 && output.charAt(secondLast - 1) == 'c') { output.setCharAt(last, 'h'); output.setCharAt(secondLast, 'c'); output.setCharAt(secondLast - 1, 'a'); } else { // ends only with ha output.setCharAt(secondLast, 'a'); output.setCharAt(last, secondChar); } } else if (secondChar == OPEN_QUOTE) { // ends with (a output.setCharAt(last, 'a'); output.setCharAt(secondLast, OPEN_QUOTE); } } } /** * Gets the last consonant in the Hebrew word * @param letters the letters in the word * @return the index of the last consonant */ private static int getLastConsonantPosition(final char[] letters) { for(int ii = letters.length - 1; ii >= 0; ii--) { if(isHebrewConsonant(letters[ii])) { return ii; } } return -1; } /** * looks for the previous letter and works out whether it is long and unstressed * * @param letters the set of letters * @param consonantPosition the current position * @return true if after a long unstressed vowel */ private static boolean isAfterLongUnstressedVowel(final HebrewLetter[] letters, final int consonantPosition) { return isAfterAnUnstressedVowel(letters, consonantPosition, true); } /** * looks for the previous letter and works out whether it is short and unstressed * * @param letters the set of letters * @param consonantPosition the current position * @return true if after a long unstressed vowel */ private static boolean isAfterShortUnstressedVowel(final HebrewLetter[] letters, final int consonantPosition) { return isAfterAnUnstressedVowel(letters, consonantPosition, false); } /** * looks for the previous letter and works out whether it is long and unstressed * * @param letters the set of letters * @param consonantPosition the current position * @return true if after a long unstressed vowel */ private static boolean isAfterAnUnstressedVowel(final HebrewLetter[] letters, final int consonantPosition, boolean lookingForLong) { // look for first letter we have int ii = consonantPosition - 1; while (ii >= 0 && !letters[ii].isConsonant()) { boolean isCorrectLength = lookingForLong ? letters[ii].isLong() : letters[ii].getVowelLengthType() == VowelLengthType.SHORT; boolean isSheva = letters[ii].getC() == SHEVA; if ((letters[ii].isVowel() && isCorrectLength && !letters[ii].isStressed())) { return true; } ii--; } return false; } /** * @param input input string * @param currentPosition the current position * @return True if the glyph contains a DAGESH after the VAV or other consonant - only looks forward */ private static boolean hasCloseDagesh(final char[] input, final int currentPosition) { return hasAnyPointing(input, currentPosition, true, DAGESH); } /** * Looks backwards to the consonant, then forwards to the beginning of the next consonant, then works * backwards until it hits a vowel * * @param letters the set of letters * @param currentPosition out current position * @return the closest vowel found in the sequence of hebrew letters */ private static HebrewLetter getCloseVowel(final HebrewLetter[] letters, final int currentPosition) { HebrewLetter vowel = getCloseVowel(letters, currentPosition, false); if (vowel != null) { return vowel; } vowel = getCloseVowel(letters, currentPosition, true); if (vowel != null) { return vowel; } for (int ii = currentPosition; ii >= 0; ii--) { if (letters[ii].isVowel()) { return letters[ii]; } } return vowel; } /** * Returns the closest vowel, looking forwards or backwards depending on the parameters passed in * * @param letters the input * @param currentPosition our current position in the input * @param forwards true for looking ahead, false for looking backwards * @return the Hebrew vowel, or null if not found */ private static HebrewLetter getCloseVowel(final HebrewLetter[] letters, final int currentPosition, final boolean forwards) { final int increment = forwards ? 1 : -1; for (int ii = currentPosition + increment; ii > 0 && untilEndOfWord(letters, ii); ii = ii + increment) { if (letters[ii].isVowel()) { return letters[ii]; } else if (letters[ii].isConsonant()) { break; } } // not found return null; } /** * @param c the character * @return true to indicate a vowel */ private static boolean isHebrewVowel(final char c) { return c >= SHEVA && c <= QAMATS_QATAN && c != DAGESH && c != SHIN_DOT; } /** * Dagesh processing for length of vowels * * @param input input string * @param currentPosition the current position * @param letters the set of letters */ private static void processForteDagesh(final char[] input, final int currentPosition, final HebrewLetter[] letters) { if (!HebrewLetterType.CONSONANT.equals(letters[currentPosition].getHebrewLetterType())) { return; } if (!hasAnyPointing(input, currentPosition, true, DAGESH)) { letters[currentPosition].setConsonantType(ConsonantType.NO_DAGESH); return; } // first character is always single if (isStartOfWord(letters, currentPosition) || isLastLetterInWord(input, currentPosition)) { letters[0].setConsonantType(ConsonantType.SINGLE); return; } final char consonant = input[currentPosition]; if (isAny(consonant, BET, GIMEL, DALET, KAF, PE, TAV) && hasAnyPointing(input, currentPosition, false, SHEVA, HATAF_SEGOL, HATAF_PATAH, HATAF_QAMATS)) { // not dagesh forte if any of those letters letters[currentPosition].setConsonantType(ConsonantType.SINGLE); return; } letters[currentPosition].setConsonantType(ConsonantType.DOUBLE); } /** * @param input input string * @param currentPosition the current position * @return true to indicate a letter */ private static boolean isLastLetterInWord(final char[] input, final int currentPosition) { for (int ii = currentPosition + 1; ii < input.length && input[ii] != MAQAF && input[ii] != ' '; ii++) { if (isHebrewConsonant(input[ii])) { //aleph or AYIN with no vowels if (input[ii] == ALEPH || input[ii] == AYIN) { return !hasAnyPointing(input, currentPosition, true, QAMATS_QATAN, SHEVA, HATAF_SEGOL, HATAF_PATAH, HATAF_QAMATS, HIRIQ, TSERE, SEGOL, PATAH, QAMATS, HOLAM, HOLAM_HASER, QUBUTS); } return false; } } return true; } /** * @param charAt our current char * @return true if it is a consonant */ private static boolean isHebrewConsonant(final char charAt) { return charAt >= ALEPH && charAt <= TAV; } /** * checks if consonant is contained in consonants * * @param letter the one we are looking for * @param matchingLetters the possibilities * @return true if found in the list of consonants provided */ private static boolean isAny(final char letter, final char... matchingLetters) { for (int ii = 0; ii < matchingLetters.length; ii++) { if (letter == matchingLetters[ii]) { return true; } } return false; } /** * @param inputString input string * @param letters the letters found in the word so far * @param currentPosition the current position * @return found a vowel */ private static boolean processVav(final char[] inputString, final HebrewLetter[] letters, final int currentPosition) { final boolean isVav = inputString[currentPosition] == VAV; if (isVav) { if (isVavConsonant(inputString, currentPosition, letters)) { final HebrewLetter letter = new HebrewLetter(VAV); letter.setHebrewLetterType(HebrewLetterType.CONSONANT); letters[currentPosition] = letter; } else { final HebrewLetter letter = new HebrewLetter(VAV); letter.setHebrewLetterType(HebrewLetterType.VOWEL); if (hasAnyPointing(inputString, currentPosition, true, DAGESH)) { letter.setShureq(true); } // next consonant has a dagesh? final int position = nextHebrewConsonant(inputString, currentPosition); if (position != -1 && hasAnyPointing(inputString, position, true, DAGESH)) { letter.setVowelLengthType(VowelLengthType.SHORT); } else { letter.setVowelLengthType(VowelLengthType.LONG); } letters[currentPosition] = letter; } return true; } return false; } /** * Finds where the next Hebrew consonant is * * @param inputString the input * @param currentPosition our current position in the input * @return the position of the next hebrew consonant */ private static int nextHebrewConsonant(final char[] inputString, final int currentPosition) { int ii = currentPosition + 1; while (ii < inputString.length && !isHebrewConsonant(inputString[ii])) { ii++; } return ii == inputString.length ? -1 : ii; } /** * searches for any letters provided * * @param inputString the input string * @param position the current position in the string * @param after true to indicate to look after, false for before * @param otherMarks the unicode characters we are look for * @return true if all marks were matched */ public static boolean hasAnyPointing(final char[] inputString, final int position, final boolean after, final char... otherMarks) { return hasPointing(inputString, position, after, false, false, otherMarks); } /** * searches for all letters provided * * @param inputString the input string * @param position the current position in the string * @param after true to indicate to look after, false for before * @param otherMarks the unicode characters we are look for * @return true if all marks were matched */ public static boolean hasAllPointing(final char[] inputString, final int position, final boolean after, final char... otherMarks) { return hasPointing(inputString, position, after, true, false, otherMarks); } /** * searches for all letters provided * * @param inputString the input string * @param position the current position in the string * @param after true to indicate to look after, false for before * @param otherMarks the unicode characters we are look for * @return true if all marks were matched */ public static boolean hasAllPointingIncludingVav(final char[] inputString, final int position, final boolean after, final char... otherMarks) { return hasPointing(inputString, position, after, true, true, otherMarks); } /** * searches for all letters provided * * @param inputString the input string * @param position the current position in the string * @param after true to indicate to look after, false for before * @param hasAllLetters true to include all letters * @param includeVav true to include searching passed the Vav letter * @param otherMarks the unicode characters we are look for * @return true if all marks were matched */ public static boolean hasPointing(final char[] inputString, final int position, final boolean after, final boolean hasAllLetters, final boolean includeVav, final char... otherMarks) { final boolean[] foundAll = new boolean[otherMarks.length]; final int increment = after ? 1 : -1; for (int ii = position + increment; ii < inputString.length; ii = ii + increment) { final char newChar = inputString[ii]; if (newChar >= ALEPH || (includeVav && newChar != VAV)) { break; } for (int jj = 0; jj < otherMarks.length; jj++) { if (newChar == otherMarks[jj]) { // has any letters? and we found one, so no need to go further if (!hasAllLetters) { return true; } foundAll[jj] = true; } } } return areAllTrue(foundAll); } /** * @param inputString the input string * @param currentPosition the current position * @param letters * @return true if vav is a consonant */ private static boolean isVavConsonant(final char[] inputString, final int currentPosition, final HebrewLetter[] letters) { final boolean hasDagesh = hasAnyPointing(inputString, currentPosition, true, DAGESH); if (isStartOfWord(letters, currentPosition)) { return !hasDagesh; } if (isLastLetterInWord(inputString, currentPosition)) { return !hasDagesh && !hasAnyPointing(inputString, currentPosition, true, HOLAM); } if (hasDagesh) { if (hasAnyPointing(inputString, currentPosition, true, HIRIQ, TSERE, SEGOL, SHEVA, PATAH, QAMATS, QUBUTS, QAMATS_QATAN)) { return true; } if (hasAllPointingIncludingVav(inputString, currentPosition, true, VAV, HOLAM)) { return true; } return false; } if (hasAllPointing(inputString, currentPosition, true, QAMATS, HOLAM)) { return true; } //if we follow a vowel, then we want to be a consonant return isFollowingVowel(currentPosition, letters); } /** * True to indicate we are following a vowel - stops at the first consonant/start of word * * @param currentPosition the current position * @param letters the letters * @return true if following a vowel */ private static boolean isFollowingVowel(final int currentPosition, final HebrewLetter[] letters) { for (int ii = currentPosition - 1; ii > 0 && !letters[ii].isConsonant() && !isStartOfWord(letters, ii); ii--) { if (letters[ii].isVowel()) { return true; } } return false; } /** * If it's a yod, adds the letter to the array * * @param inputString the input string * @param letters the set of letters * @param currentPosition our current position in the input string * @return true if a yod was found */ private static boolean processYod(final char[] inputString, final HebrewLetter[] letters, final int currentPosition) { final boolean isYod = inputString[currentPosition] == YOD; if (isYod) { final HebrewLetter letter = new HebrewLetter(YOD); if (isYodVowel(inputString, currentPosition, letters)) { letter.setHebrewLetterType(HebrewLetterType.VOWEL); letter.setVowelLengthType(VowelLengthType.LONG); letters[currentPosition] = letter; return true; } else { letter.setHebrewLetterType(HebrewLetterType.CONSONANT); letters[currentPosition] = letter; return true; } } return false; } /** * @param inputString the input string * @param currentPosition the current position in the string * @param letters * @return true if yod is a vowel */ private static boolean isYodVowel(final char[] inputString, final int currentPosition, final HebrewLetter[] letters) { return hasAnyPointing(inputString, currentPosition, false, HIRIQ, TSERE, SEGOL, QAMATS, HOLAM_HASER) && !hasAnyPointing(inputString, currentPosition, true, QAMATS_QATAN, SHEVA, HATAF_SEGOL, HATAF_PATAH, HATAF_QAMATS, HIRIQ, TSERE, SEGOL, PATAH, QAMATS, HOLAM, HOLAM_HASER, QUBUTS, DAGESH); } /** * True if all are true * * @param foundAll the list of boolean flags * @return true if all booleans passed are true */ private static boolean areAllTrue(final boolean[] foundAll) { for (int ii = 0; ii < foundAll.length; ii++) { if (!foundAll[ii]) { return false; } } return true; } }