package com.tyndalehouse.step.core.utils.language;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import com.tyndalehouse.step.core.utils.language.transliteration.StringToStringRule;
import com.tyndalehouse.step.core.utils.language.transliteration.TransliterationRule;
/**
* Utilities for handling Greek text and its transliteration
*
* @author chrisburrell
*
*/
public final class GreekUtils {
    /** Lower bound (U+0342, combining Greek perispomeni) of the range accepted by {@link #isGreekText(String)}. */
    private static final int PERISPOMENI = 0x0342;
    /** Upper bound (U+03F3, Greek letter yot) of the range accepted by {@link #isGreekText(String)}. */
    private static final int YOT = 0x03F3;
    /** Combining breathing mark (U+0314) that {@link #transliterateGreek(String)} renders as a leading 'h'. */
    private static final int ROUGH_BREATHING = 0x0314;
    /** The '*' marker that Beta Code uses to flag a capital letter. */
    private static final Pattern BETA_UPPER_CASE_SYMBOLS = Pattern.compile("[*]");
    /** Beta Code marker characters stripped by {@link #toBetaUnaccented(String)}. */
    private static final Pattern BETA_ACCENTS = Pattern.compile("[()/=+|&'*\\\\]");
    /** Runs of combining diacritical marks and of characters in U+2E00..U+2E3B, compiled once. */
    private static final Pattern DIACRITICS = Pattern
            .compile("[\\p{InCombiningDiacriticalMarks}\u2e00-\u2E3B]+");
    /** Leading marker that the STEP transliteration uses for a breathing. */
    private static final String GREEK_BREATHING = "h";
    /**
     * Lazily created rules. Declared volatile because the first null check in
     * {@link #ensureTransliterationRules()} happens outside the lock, so the fully populated list must be
     * safely published to other threads.
     */
    private static volatile List<TransliterationRule> transliterationRules;

    /** prevent instantiation */
    private GreekUtils() {
        // do nothing
    }

    /**
     * Tests whether a word looks like Greek by examining its first character once diacritics are removed.
     *
     * @param form the word
     * @return true if the first character of the normalized form without the diacritics is between
     *         PERISPOMENI and YOT, the unicode range for Greek; false for a null word or a word with no
     *         characters left after removing the diacritics
     */
    public static boolean isGreekText(final String form) {
        if (form == null) {
            return false;
        }

        final String unaccented = unAccent(form);
        if (unaccented.length() == 0) {
            // nothing left to look at, so it cannot be classified as Greek
            return false;
        }

        final int firstProper = unaccented.charAt(0);
        return firstProper >= PERISPOMENI && firstProper <= YOT;
    }

    /**
     * Removes the leading breathing marker from a transliteration so that it can be indexed.
     *
     * @param stepTransliteration the step transliteration
     * @return the transliteration without the leading 'h', or unchanged if it does not start with one
     */
    public static String removeGreekTranslitMarkUpForIndexing(final String stepTransliteration) {
        if (stepTransliteration.startsWith(GREEK_BREATHING)) {
            return stepTransliteration.substring(1);
        }
        return stepTransliteration;
    }

    /**
     * Decomposes the word (NFD) and removes every combining diacritical mark, as well as any character in
     * the range U+2E00..U+2E3B.
     *
     * @param word a word with accents
     * @return a word without accents
     */
    public static String unAccent(final String word) {
        final String decomposed = Normalizer.normalize(word, Normalizer.Form.NFD);
        return DIACRITICS.matcher(decomposed).replaceAll("");
    }

    /**
     * Removes the '*' capital markers. Assumes a lower case version of beta, since this is what we search
     * on; the case of the characters themselves is not changed.
     *
     * @param beta the input string, with '*'
     * @return a version without '*', or null if the input was null
     */
    public static String toBetaLowercase(final String beta) {
        if (beta == null) {
            return null;
        }
        return BETA_UPPER_CASE_SYMBOLS.matcher(beta).replaceAll("");
    }

    /**
     * Gets rid of breathing and upper case symbols. Does not change the case of the characters
     *
     * @param beta the input string with breathing and * for capitals
     * @return a version without breathing or *, or null if the input was null
     */
    public static String toBetaUnaccented(final String beta) {
        if (beta == null) {
            return null;
        }
        return BETA_ACCENTS.matcher(beta).replaceAll("");
    }

    /**
     * Performs a greek transliteration on a normalised string.
     *
     * The buffer is rewritten in place: everything before {@code position} is already transliterated,
     * everything from {@code position} onwards is still the original input. Only lower case Greek letters,
     * spaces and the breathing mark U+0314 are recognised; any other character is dropped.
     *
     * @param normalized the normalised string
     * @return the equivalent transliteration
     */
    // CHECKSTYLE:OFF
    public static String transliterateGreek(final String normalized) {
        final StringBuilder sb = new StringBuilder(normalized);
        int position = 0;
        while (position < sb.length()) {
            switch (sb.charAt(position)) {
                case 'α':
                    sb.setCharAt(position++, 'a');
                    break;
                case 'β':
                    sb.setCharAt(position++, 'b');
                    break;
                case 'γ':
                    // gamma before gamma, kappa or chi is rendered as 'n', and the pair is consumed together
                    if (position + 1 < sb.length()) {
                        switch (sb.charAt(position + 1)) {
                            case 'γ':
                                sb.setCharAt(position++, 'n');
                                sb.setCharAt(position++, 'g');
                                break;
                            case 'κ':
                                sb.setCharAt(position++, 'n');
                                sb.setCharAt(position++, 'k');
                                break;
                            case 'χ':
                                sb.setCharAt(position++, 'n');
                                sb.setCharAt(position++, 'c');
                                sb.insert(position++, 'h');
                                break;
                            default:
                                sb.setCharAt(position++, 'g');
                                break;
                        }
                    } else {
                        sb.setCharAt(position++, 'g');
                    }
                    break;
                case 'δ':
                    sb.setCharAt(position++, 'd');
                    break;
                case 'ε':
                    sb.setCharAt(position++, 'e');
                    break;
                case 'ζ':
                    sb.setCharAt(position++, 'z');
                    break;
                case 'η':
                    sb.setCharAt(position++, '\u0113');
                    break;
                case 'θ':
                    // one letter becomes two: overwrite it, then insert the second character
                    sb.setCharAt(position++, 't');
                    sb.insert(position++, 'h');
                    break;
                case 'ι':
                    sb.setCharAt(position++, 'i');
                    break;
                case 'κ':
                    sb.setCharAt(position++, 'k');
                    break;
                case 'λ':
                    sb.setCharAt(position++, 'l');
                    break;
                case 'μ':
                    sb.setCharAt(position++, 'm');
                    break;
                case 'ν':
                    sb.setCharAt(position++, 'n');
                    break;
                case 'ξ':
                    sb.setCharAt(position++, 'x');
                    break;
                case 'ο':
                    sb.setCharAt(position++, 'o');
                    break;
                case 'π':
                    sb.setCharAt(position++, 'p');
                    break;
                case 'ρ':
                    sb.setCharAt(position++, 'r');
                    break;
                case 'ς':
                    sb.setCharAt(position++, 's');
                    break;
                case 'σ':
                    sb.setCharAt(position++, 's');
                    break;
                case 'τ':
                    sb.setCharAt(position++, 't');
                    break;
                case 'υ':
                    sb.setCharAt(position++, 'u');
                    break;
                case 'φ':
                    sb.setCharAt(position++, 'f');
                    break;
                case 'χ':
                    sb.setCharAt(position++, 'c');
                    sb.insert(position++, 'h');
                    break;
                case 'ψ':
                    sb.setCharAt(position++, 'p');
                    sb.insert(position++, 's');
                    break;
                case 'ω':
                    sb.setCharAt(position++, '\u014d');
                    break;
                // leave spaces in, but should never be hit
                case ' ':
                    position++;
                    break;
                // breathing character
                case ROUGH_BREATHING: {
                    // The breathing becomes an 'h' at the very front of the output, unless the output
                    // starts (or is about to start) with rho. The length check guards against a breathing
                    // mark that is the only character left in the buffer.
                    final boolean rhoComesNext = position == 0 && sb.length() > 1 && sb.charAt(1) == 'ρ';
                    final boolean outputStartsWithR = position > 0 && sb.charAt(0) == 'r';

                    sb.deleteCharAt(position);
                    if (!rhoComesNext && !outputStartsWithR) {
                        // one character removed at position, one added at the front: the next
                        // unprocessed character now sits at position + 1
                        sb.insert(0, 'h');
                        position++;
                    }
                    break;
                }
                default:
                    // remove character since not recognised
                    sb.deleteCharAt(position);
                    break;
            }
        }
        return sb.toString();
    }

    // CHECKSTYLE:ON

    /**
     * Returns the shared list of Greek transliteration rules, creating it on first use. The same list
     * instance is handed to every caller.
     *
     * @return gives the greek list of transliteration rules
     */
    public static List<TransliterationRule> getTransliterationRules() {
        ensureTransliterationRules();
        return transliterationRules;
    }

    /**
     * creates the transliteration rules lazily, on first time
     */
    private static void ensureTransliterationRules() {
        if (transliterationRules != null) {
            return;
        }
        createTransliterationRules();
    }

    /**
     * creates the rules, this is synchronized so that no-two threads are creating it at any point of time
     */
    private static synchronized void createTransliterationRules() {
        // check again if it has been initialized, as we may be coming second
        if (transliterationRules == null) {
            // build into a local list and publish it only once complete, so that readers never see a
            // partially filled list
            final List<TransliterationRule> rules = new ArrayList<TransliterationRule>();
            rules.add(new StringToStringRule("gg", new String[] { "ng" }));
            rules.add(new StringToStringRule("gk", new String[] { "nk" }));
            rules.add(new StringToStringRule("gch", new String[] { "nch" }));
            rules.add(new StringToStringRule("q", new String[] { "th" }));
            rules.add(new StringToStringRule("c", new String[] { "x" }));
            rules.add(new StringToStringRule("x", new String[] { "ch" }));
            rules.add(new StringToStringRule("ph", new String[] { "f" }));
            rules.add(new StringToStringRule("y", new String[] { "ps" }));
            rules.add(new StringToStringRule("ow", new String[] { "\u014d" }));
            rules.add(new StringToStringRule("w", new String[] { "\u014d" }));
            rules.add(new StringToStringRule("oo", new String[] { "\u014d" }));
            rules.add(new StringToStringRule("o", new String[] { "\u014d" }));
            rules.add(new StringToStringRule("mb", new String[] { "mp" }));
            rules.add(new StringToStringRule("nd", new String[] { "nt" }));
            rules.add(new StringToStringRule("rh", new String[] { "r" }));
            rules.add(new StringToStringRule("e", new String[] { "\u0113" }));
            rules.add(new StringToStringRule("é", new String[] { "\u0113" }));
            rules.add(new StringToStringRule("h", new String[] { "\u0113" }));
            transliterationRules = rules;
        }
    }
}