/******************************************************************************* * Copyright (c) 2012, Directors of the Tyndale STEP Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com) * nor the names of its contributors may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ package com.tyndalehouse.step.core.utils; import com.tyndalehouse.step.core.utils.language.GreekUtils; import com.tyndalehouse.step.core.utils.language.HebrewUtils; import com.tyndalehouse.step.core.utils.language.transliteration.TransliterationOption; import com.tyndalehouse.step.core.utils.language.transliteration.TransliterationRule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.text.Normalizer; import java.text.Normalizer.Form; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Locale; import static com.tyndalehouse.step.core.utils.StringUtils.isBlank; import static com.tyndalehouse.step.core.utils.StringUtils.isEmpty; import static com.tyndalehouse.step.core.utils.language.GreekUtils.removeGreekTranslitMarkUpForIndexing; import static com.tyndalehouse.step.core.utils.language.HebrewUtils.removeHebrewTranslitMarkUpForIndexing; /** * A collection of utility methods enabling us to convert Strings, references one way or another. * * @author chrisburrell */ public final class StringConversionUtils { private static final Logger LOGGER = LoggerFactory.getLogger(StringConversionUtils.class); private static final char KEY_SEPARATOR = ':'; private static final String STRONG_PREFIX = "strong:"; private static final String UPPER_STRONG_PREFIX = "STRONG:"; private static final int STRONG_PREFIX_LENGTH = STRONG_PREFIX.length(); private static final int LANGUAGE_INDICATOR = STRONG_PREFIX_LENGTH; private static final int MAX_TRANSLITERATIONS = 512; /** * hiding implementation */ private StringConversionUtils() { // hiding implementation } /** * @param strongNumber the strong number to consider, whether to display or not * @return true if not G3588 and not null/blank. To be extended later to include other words */ public boolean isDisplayableStrongNumber(final String strongNumber) { if (isBlank(strongNumber)) { return false; } return !"G3588".equalsIgnoreCase(strongNumber); } /** * Not all bibles encode strong numbers as strong:[HG]\d+ unfortunately, so instead we cope for strong: and * strong:H. * <p/> * In essence we chop off any of the following prefixes: strong:G, strong:H, strong:, H, G. We don't use a regular * expression, since this will be much quicker * * @param strong strong key * @return the key containing just the digits */ public static String getStrongKey(final String strong) { if (strong.startsWith(STRONG_PREFIX)) { final char c = strong.charAt(LANGUAGE_INDICATOR); if (c == 'H' || c == 'G') { return strong.substring(LANGUAGE_INDICATOR + 1); } return strong.substring(LANGUAGE_INDICATOR); } final char c = strong.charAt(0); if (c == 'H' || c == 'G') { return strong.substring(1); } // perhaps some passages encode just the number return strong; } /** * in this case, we assume that a key starts shortly after the last ':' with a number * * @param potentialKey a key that can potentially be shortened * @return the shortened key */ public static String getAnyKey(final String potentialKey) { return getAnyKey(potentialKey, true); } /** * Strips off strong: if present, to yield Gxxxx - Assumes strong prefix is upperCase, i.e. STRONG: * * @param key key to change * @return the key without the prefix */ public static String getStrongLanguageSpecificKey(final String key) { if (key.startsWith(UPPER_STRONG_PREFIX)) { return key.substring(STRONG_PREFIX_LENGTH); } return key; } /** * pads the strong number according to its size, to an optional letter followed by 4 digits * * @param key the key to the strong number * @return the strong number, padded */ public static String getStrongPaddedKey(final String key) { if (StringUtils.isBlank(key)) { return ""; } final StringBuilder sb = new StringBuilder(key.length()); final String[] split = key.toUpperCase(Locale.ENGLISH).split(" "); for (final String s : split) { final String strongNumber = getStrongLanguageSpecificKey(s); if (strongNumber == null) { continue; } final int length = strongNumber.length(); if (sb.length() > 0) { // add a space separator sb.append(' '); } // check we have G or H final char firstChar = strongNumber.charAt(0); if (firstChar == 'G' || firstChar == 'H') { padPrefixedStrongNumber(sb, strongNumber, length, firstChar); } else { padNonPrefixedStrongNumber(sb, strongNumber, length); } } return sb.toString().trim(); } /** * Pads any strong number that is not prefixed by a letter such as G or H * * @param sb the output buffer * @param strongNumber the strong number itself * @param length the length of the strong number */ private static void padNonPrefixedStrongNumber(final StringBuilder sb, final String strongNumber, final int length) { // we only have the numbers so do our best for (int ii = length; ii < 4; ii++) { sb.append('0'); } sb.append(strongNumber); fixAugmentedSuffix(sb); } private static void fixAugmentedSuffix(StringBuilder sb) { //if it's an augmented strong, we need to lower case the last letter, so final int lastCharPosition = sb.length() - 1; final char lastChar = sb.charAt(lastCharPosition); if (Character.isAlphabetic(lastChar)) { sb.setCharAt(lastCharPosition, Character.toLowerCase(lastChar)); } } /** * @param strongNumber a strong number from length 2 (including prefix) to 6. * @return the right padded version for it. */ public static String padPrefixedStrongNumber(final String strongNumber) { final StringBuilder b = new StringBuilder(strongNumber.length()); padPrefixedStrongNumber(b, strongNumber, strongNumber.length(), strongNumber.charAt(0)); return b.toString(); } /** * Pads the given prefixed number, from say G12 to G0012 * * @param sb the string to build up * @param suffixedStrongNumber the strong number * @param length the length of the string * @param firstChar the first character, i.e. either G or H */ private static void padPrefixedStrongNumber(final StringBuilder sb, final String suffixedStrongNumber, final int suffixedLength, final char firstChar) { String strongNumber; boolean suffix = false; int length = 0; final char lastChar = suffixedStrongNumber.charAt(suffixedStrongNumber.length() - 1); if (Character.isAlphabetic(lastChar)) { strongNumber = suffixedStrongNumber.substring(0, suffixedStrongNumber.length() - 1); suffix = true; length = suffixedLength - 1; } else { strongNumber = suffixedStrongNumber; length = suffixedLength; } switch (length) { case 1: sb.append(strongNumber); break; case 2: sb.append(firstChar); sb.append('0'); sb.append('0'); sb.append('0'); sb.append(strongNumber.charAt(1)); break; case 3: sb.append(firstChar); sb.append('0'); sb.append('0'); sb.append(strongNumber.charAt(1)); sb.append(strongNumber.charAt(2)); break; case 4: sb.append(firstChar); sb.append('0'); sb.append(strongNumber.charAt(1)); sb.append(strongNumber.charAt(2)); sb.append(strongNumber.charAt(3)); break; case 6: if (strongNumber.charAt(1) == '0') { sb.append(firstChar); sb.append(strongNumber.charAt(2)); sb.append(strongNumber.charAt(3)); sb.append(strongNumber.charAt(4)); sb.append(strongNumber.charAt(5)); break; } sb.append(strongNumber); break; default: sb.append(strongNumber); break; } if (suffix) { sb.append(Character.toLowerCase(lastChar)); } } /** * in this case, we assume that a key starts shortly after the last ':' with a number * * @param potentialKey a key that can potentially be shortened * @param trimInitial trim initial character after ':' * @return the shortened key */ public static String getAnyKey(final String potentialKey, final boolean trimInitial) { LOGGER.trace("Looking for key [{}] with trimInitial [{}]", potentialKey, trimInitial); // find first colon and start afterwards, -1 yields 0, which is the beginning of the string // so we can work with that. int start = potentialKey.lastIndexOf(KEY_SEPARATOR) + 1; // start at the first char after the colon // int start = lastColon + 1; if (trimInitial) { final char protocol = potentialKey.charAt(start); if (protocol == 'G' || protocol == 'H') { start++; } // finally, we may have 0s: while (start < potentialKey.length() && potentialKey.charAt(start) == '0') { start++; } } return potentialKey.substring(start); } /** * Takes accents and other punctuation off the word - less performant * * @param word the word to be processed * @return the unaccented form */ public static String unAccent(final String word) { return unAccent(unAccent(word, true), false); } /** * takes accents and other punctuation off the word * * @param word the word to be processed * @param isGreek true for greek, false for hebrew * @return the unaccented form */ public static String unAccent(final String word, final boolean isGreek) { return unAccent(word, isGreek, true); } /** * takes accents and other punctuation off the word * * @param word the word to be processed * @param isGreek true for greek, false for hebrew * @param unpointHebrewVowels true to remove Hebrew vowels * @return the unaccented form */ public static String unAccent(final String word, final boolean isGreek, boolean unpointHebrewVowels) { if (isGreek) { return GreekUtils.unAccent(word); } return HebrewUtils.unPoint(word, unpointHebrewVowels); } /** * Takes accents and other punctuation off the word - less performant * * @param word the word to be processed * @return the unaccented form */ public static String unAccentLeavingVowels(final String word) { return unAccentHebrewLeavingVowels(unAccent(word, true)); } /** * takes accents and other punctuation off the word * * @param word the word to be processed * @return the unaccented form */ public static String unAccentHebrewLeavingVowels(final String word) { return HebrewUtils.unPoint(word, false); } /** * Removes the starting H, if present (for greek transliterations only at present time) * * @param stepTransliteration the transliteration * @param isGreek true if greek * @return the transliteration adapted for unaccented texts) */ public static String adaptForTransliterationForIndexing(final String stepTransliteration, final boolean isGreek) { if (isGreek) { return GreekUtils.removeGreekTranslitMarkUpForIndexing(stepTransliteration); } // otherwise hebrew, so run the pattern to remove everything... return HebrewUtils.removeHebrewTranslitMarkUpForIndexing(stepTransliteration); } /** * Removes the starting H, if present (for greek transliterations only at present time), removes other symbols such * as letters with lines or dots, etc. Then runs a set of rules on both transliterations. See TYNSTEP-374 for the * rule definitions. * * @param stepTransliteration the transliteration * @param isGreek true if greek * @return the transliteration adapted for unaccented texts) */ public static List<TransliterationOption> adaptTransliterationForQuerying( final String stepTransliteration, final boolean isGreek) { if (isGreek) { return trimmedTranslits(multiplyTranslitOptions(removeGreekTranslitMarkUpForIndexing(stepTransliteration), GreekUtils.getTransliterationRules())); } // otherwise hebrew, so run the pattern to remove everything... return trimmedTranslits(multiplyTranslitOptions(removeHebrewTranslitMarkUpForIndexing(stepTransliteration), HebrewUtils.getTransliterationRules())); } private static List<TransliterationOption> trimmedTranslits(final List<TransliterationOption> transliterationRules) { return transliterationRules.subList(0, Math.min(transliterationRules.size(), MAX_TRANSLITERATIONS)); } /** * @param baseString a transliteration without any mark-up * @param transliterationRules the rules to apply * @return all possible transliterations */ public static List<TransliterationOption> multiplyTranslitOptions(final String baseString, final List<TransliterationRule> transliterationRules) { // it is important to remember that we strip out special characters here, so ensure that the rules // below do not conflict with the stripping of the mark-up // go letter by letter and apply rules // run a rule, and that gives me, a new set of prefixes, keep on running rules iterating through final StringBuilder base = new StringBuilder(); final List<TransliterationOption> options = new ArrayList<TransliterationOption>(); options.add(new TransliterationOption(0, base)); final char[] baseChars = baseString.toCharArray(); for (int ii = 0; ii < baseChars.length; ii++) { for (final TransliterationRule r : transliterationRules) { r.expand(options, baseChars, ii); } // update all options that are still on our current position, to bump them up for (final TransliterationOption leftBehind : options) { if (leftBehind.getNextValidPosition() == ii) { leftBehind.getOption().append(baseChars[ii]); leftBehind.setNextValidPosition(ii + 1); } } } //trim the empty options off for (Iterator<TransliterationOption> iterator = options.iterator(); iterator.hasNext(); ) { TransliterationOption option = iterator.next(); if (option.getOption().length() == 0) { iterator.remove(); } } return options; } /** * @param rawForm raw form of the word * @return the transliteration of the word given */ public static String transliterate(final String rawForm) { // decompose characters from breathing and accents and store in StringBuilder if (rawForm == null || rawForm.length() == 0) { return ""; } if (HebrewUtils.isHebrewText(rawForm)) { return HebrewUtils.transliterateHebrew(rawForm); } // then assume Greek return GreekUtils.transliterateGreek(Normalizer.normalize(rawForm.toLowerCase(Locale.ENGLISH), Form.NFD)); } /** * Starts with punctuation. * * @param s the s * @return true, if the first character is a punctuation character */ public static boolean startsWithPunctuation(final String s) { if (isEmpty(s)) { return false; } final char c = s.charAt(0); return isPunctuation(c); } private static boolean isPunctuation(final char c) { switch (c) { case ',': case '.': case '?': case '/': case ';': case ':': case '\'': case '!': return true; default: return false; } } }