StringConversionUtils.java example

Explorer
step-master
/*******************************************************************************

 * Copyright (c) 2012, Directors of the Tyndale STEP Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 * Redistributions of source code must retain the above copyright 
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright 
 * notice, this list of conditions and the following disclaimer in 
 * the documentation and/or other materials provided with the 
 * distribution.
 * Neither the name of the Tyndale House, Cambridge (www.TyndaleHouse.com)  
 * nor the names of its contributors may be used to endorse or promote 
 * products derived from this software without specific prior written 
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 
 * THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************/
package com.tyndalehouse.step.core.utils;

import com.tyndalehouse.step.core.utils.language.GreekUtils;
import com.tyndalehouse.step.core.utils.language.HebrewUtils;
import com.tyndalehouse.step.core.utils.language.transliteration.TransliterationOption;
import com.tyndalehouse.step.core.utils.language.transliteration.TransliterationRule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import static com.tyndalehouse.step.core.utils.StringUtils.isBlank;
import static com.tyndalehouse.step.core.utils.StringUtils.isEmpty;
import static com.tyndalehouse.step.core.utils.language.GreekUtils.removeGreekTranslitMarkUpForIndexing;
import static com.tyndalehouse.step.core.utils.language.HebrewUtils.removeHebrewTranslitMarkUpForIndexing;

/**
 * A collection of static utility methods enabling us to convert Strings and references one way or another:
 * stripping and padding Strong numbers, shortening keys, removing accents/pointing and building
 * transliterations.
 *
 * @author chrisburrell
 */
public final class StringConversionUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(StringConversionUtils.class);
    private static final char KEY_SEPARATOR = ':';
    private static final String STRONG_PREFIX = "strong:";
    private static final String UPPER_STRONG_PREFIX = "STRONG:";
    private static final int STRONG_PREFIX_LENGTH = STRONG_PREFIX.length();
    private static final int LANGUAGE_INDICATOR = STRONG_PREFIX_LENGTH;
    private static final int MAX_TRANSLITERATIONS = 512;
    /** The number of digits a padded strong number is made of. */
    private static final int PADDED_DIGITS = 4;

    /**
     * hiding implementation
     */
    private StringConversionUtils() {
        // hiding implementation
    }

    /**
     * Static, like every other member, because this class can never be instantiated.
     *
     * @param strongNumber the strong number to consider, whether to display or not
     * @return true if not G3588 (compared case-insensitively) and not null/blank. To be extended later to
     * include other words
     */
    public static boolean isDisplayableStrongNumber(final String strongNumber) {
        if (isBlank(strongNumber)) {
            return false;
        }

        return !"G3588".equalsIgnoreCase(strongNumber);
    }

    /**
     * Not all bibles encode strong numbers as strong:[HG]\d+ unfortunately, so instead we cope for strong: and
     * strong:H.
     * <p>
     * In essence we chop off any of the following prefixes: strong:G, strong:H, strong:, H, G. We don't use a regular
     * expression, since this will be much quicker. Only the lower-case "strong:" prefix and the upper-case
     * letters G and H are recognised.
     *
     * @param strong strong key, must not be null
     * @return the key without its prefix; an empty string if nothing follows the prefix or the key is empty
     */
    public static String getStrongKey(final String strong) {
        if (strong.startsWith(STRONG_PREFIX)) {
            // a bare "strong:" has no language indicator to look at
            if (strong.length() > LANGUAGE_INDICATOR && isLanguageIndicator(strong.charAt(LANGUAGE_INDICATOR))) {
                return strong.substring(LANGUAGE_INDICATOR + 1);
            }
            return strong.substring(LANGUAGE_INDICATOR);
        }

        if (!strong.isEmpty() && isLanguageIndicator(strong.charAt(0))) {
            return strong.substring(1);
        }

        // perhaps some passages encode just the number
        return strong;
    }

    /**
     * @param c the character to test
     * @return true if the character is one of the (upper case) language letters, G or H
     */
    private static boolean isLanguageIndicator(final char c) {
        return c == 'G' || c == 'H';
    }

    /**
     * in this case, we assume that a key starts shortly after the last ':' with a number. Equivalent to
     * calling {@link #getAnyKey(String, boolean)} with trimInitial set to true.
     *
     * @param potentialKey a key that can potentially be shortened
     * @return the shortened key
     */
    public static String getAnyKey(final String potentialKey) {
        return getAnyKey(potentialKey, true);
    }

    /**
     * Strips off strong: if present, to yield Gxxxx - Assumes strong prefix is upperCase, i.e. STRONG:
     *
     * @param key key to change, must not be null
     * @return the key without the prefix, or the key itself if it does not start with STRONG:
     */
    public static String getStrongLanguageSpecificKey(final String key) {
        if (key.startsWith(UPPER_STRONG_PREFIX)) {
            return key.substring(STRONG_PREFIX_LENGTH);
        }
        return key;
    }

    /**
     * pads the strong number according to its size, to an optional letter followed by 4 digits. The input is
     * upper-cased and split on single spaces; each non-empty token is padded individually and the results are
     * joined back with single spaces. A trailing letter (augmented strong number) comes out in lower case.
     *
     * @param key the key to the strong number, or several keys separated by spaces
     * @return the strong number(s), padded; an empty string if the key is null or blank
     */
    public static String getStrongPaddedKey(final String key) {
        if (isBlank(key)) {
            return "";
        }

        final StringBuilder sb = new StringBuilder(key.length());
        final String[] split = key.toUpperCase(Locale.ENGLISH).split(" ");
        for (final String s : split) {
            final String strongNumber = getStrongLanguageSpecificKey(s);

            // leading or repeated spaces, or a bare "STRONG:" prefix, leave us with nothing to pad
            if (strongNumber.isEmpty()) {
                continue;
            }

            final int length = strongNumber.length();
            if (sb.length() > 0) {
                // add a space separator
                sb.append(' ');
            }

            // check we have G or H
            final char firstChar = strongNumber.charAt(0);
            if (isLanguageIndicator(firstChar)) {
                padPrefixedStrongNumber(sb, strongNumber, length, firstChar);
            } else {
                padNonPrefixedStrongNumber(sb, strongNumber, length);
            }
        }

        return sb.toString().trim();
    }

    /**
     * Pads any strong number that is not prefixed by a letter such as G or H. A trailing letter on a number
     * longer than one character is treated as an augmented suffix: it is not counted towards the 4 digits
     * (mirroring the prefixed case, e.g. 12A becomes 0012a) and is lower-cased.
     *
     * @param sb           the output buffer
     * @param strongNumber the strong number itself, not empty
     * @param length       the length of the strong number
     */
    private static void padNonPrefixedStrongNumber(final StringBuilder sb, final String strongNumber,
                                                   final int length) {
        final boolean augmented = length > 1 && Character.isAlphabetic(strongNumber.charAt(length - 1));
        final int digits = augmented ? length - 1 : length;

        // we only have the numbers so do our best
        for (int ii = digits; ii < PADDED_DIGITS; ii++) {
            sb.append('0');
        }
        sb.append(strongNumber);
        fixAugmentedSuffix(sb);
    }

    /**
     * Lower-cases the last character of the buffer if it is a letter; does nothing on an empty buffer.
     *
     * @param sb the buffer to amend in place
     */
    private static void fixAugmentedSuffix(final StringBuilder sb) {
        //if it's an augmented strong, we need to lower case the last letter, so
        final int lastCharPosition = sb.length() - 1;
        if (lastCharPosition < 0) {
            return;
        }

        final char lastChar = sb.charAt(lastCharPosition);
        if (Character.isAlphabetic(lastChar)) {
            sb.setCharAt(lastCharPosition, Character.toLowerCase(lastChar));
        }
    }

    /**
     * @param strongNumber a strong number from length 2 (including prefix) to 6, must not be null
     * @return the padded version for it (e.g. G12 gives G0012); an empty string if the input is empty
     */
    public static String padPrefixedStrongNumber(final String strongNumber) {
        if (strongNumber.isEmpty()) {
            return "";
        }

        final StringBuilder b = new StringBuilder(strongNumber.length());
        padPrefixedStrongNumber(b, strongNumber, strongNumber.length(), strongNumber.charAt(0));
        return b.toString();
    }

    /**
     * Pads the given prefixed number, from say G12 to G0012. A trailing letter is treated as an augmented
     * suffix: it is set aside while padding and re-appended in lower case. A 6-character number whose first
     * digit is 0 (say G01234) loses that 0; any other length is copied through unchanged.
     *
     * @param sb                   the string to build up
     * @param suffixedStrongNumber the strong number, possibly ending with a suffix letter, not empty
     * @param suffixedLength       the length of suffixedStrongNumber
     * @param firstChar            the first character, i.e. either G or H
     */
    private static void padPrefixedStrongNumber(final StringBuilder sb, final String suffixedStrongNumber,
                                                final int suffixedLength, final char firstChar) {
        final char lastChar = suffixedStrongNumber.charAt(suffixedStrongNumber.length() - 1);

        // a lone letter is the prefix itself, never a suffix
        final boolean suffix = suffixedLength > 1 && Character.isAlphabetic(lastChar);
        final String strongNumber;
        final int length;
        if (suffix) {
            strongNumber = suffixedStrongNumber.substring(0, suffixedStrongNumber.length() - 1);
            length = suffixedLength - 1;
        } else {
            strongNumber = suffixedStrongNumber;
            length = suffixedLength;
        }

        if (length >= 2 && length <= PADDED_DIGITS) {
            // prefix + 1 to 3 digits: left-pad the digits with zeros up to 4
            sb.append(firstChar);
            for (int ii = length; ii <= PADDED_DIGITS; ii++) {
                sb.append('0');
            }
            sb.append(strongNumber, 1, length);
        } else if (length == 6 && strongNumber.charAt(1) == '0') {
            // prefix + 5 digits with a leading 0: drop that 0
            sb.append(firstChar);
            sb.append(strongNumber, 2, 6);
        } else {
            // prefix only, already 4 digits, or an unexpected length: leave as is
            sb.append(strongNumber);
        }

        if (suffix) {
            sb.append(Character.toLowerCase(lastChar));
        }
    }

    /**
     * in this case, we assume that a key starts shortly after the last ':' with a number. When trimming, an
     * initial G or H and then any leading 0s are skipped.
     *
     * @param potentialKey a key that can potentially be shortened, must not be null
     * @param trimInitial  trim initial character and leading 0s after ':'
     * @return the shortened key, possibly empty if nothing is left after the last ':'
     */
    public static String getAnyKey(final String potentialKey, final boolean trimInitial) {
        LOGGER.trace("Looking for key [{}] with trimInitial [{}]", potentialKey, trimInitial);

        // find last colon and start afterwards, -1 yields 0, which is the beginning of the string
        // so we can work with that.
        final int keyLength = potentialKey.length();
        int start = potentialKey.lastIndexOf(KEY_SEPARATOR) + 1;

        if (trimInitial) {
            // the key may be empty or end with the colon, in which case there is no character to examine
            if (start < keyLength && isLanguageIndicator(potentialKey.charAt(start))) {
                start++;
            }

            // finally, we may have 0s:
            while (start < keyLength && potentialKey.charAt(start) == '0') {
                start++;
            }
        }

        return potentialKey.substring(start);
    }

    /**
     * Takes accents and other punctuation off the word - less performant, since the word is run through the
     * Greek processing first and the result through the Hebrew processing.
     *
     * @param word the word to be processed
     * @return the unaccented form
     */
    public static String unAccent(final String word) {
        return unAccent(unAccent(word, true), false);
    }

    /**
     * takes accents and other punctuation off the word; for Hebrew, vowels are removed as well
     *
     * @param word    the word to be processed
     * @param isGreek true for greek, false for hebrew
     * @return the unaccented form
     */
    public static String unAccent(final String word, final boolean isGreek) {
        return unAccent(word, isGreek, true);
    }


    /**
     * takes accents and other punctuation off the word, delegating to {@link GreekUtils#unAccent} or
     * {@link HebrewUtils#unPoint}
     *
     * @param word                the word to be processed
     * @param isGreek             true for greek, false for hebrew
     * @param unpointHebrewVowels true to remove Hebrew vowels, ignored for greek
     * @return the unaccented form
     */
    public static String unAccent(final String word, final boolean isGreek, boolean unpointHebrewVowels) {
        if (isGreek) {
            return GreekUtils.unAccent(word);
        }
        return HebrewUtils.unPoint(word, unpointHebrewVowels);
    }

    /**
     * Takes accents and other punctuation off the word - less performant, since the word is run through the
     * Greek processing first and the result through the Hebrew processing, which is asked to keep vowels.
     *
     * @param word the word to be processed
     * @return the unaccented form
     */
    public static String unAccentLeavingVowels(final String word) {
        return unAccentHebrewLeavingVowels(unAccent(word, true));
    }

    /**
     * takes accents and other punctuation off the Hebrew word, asking {@link HebrewUtils#unPoint} not to remove
     * the vowels
     *
     * @param word the word to be processed
     * @return the unaccented form
     */
    public static String unAccentHebrewLeavingVowels(final String word) {
        return HebrewUtils.unPoint(word, false);
    }

    /**
     * Removes the starting H, if present (for greek transliterations only at present time)
     *
     * @param stepTransliteration the transliteration
     * @param isGreek             true if greek, false for hebrew
     * @return the transliteration adapted for unaccented texts
     */
    public static String adaptForTransliterationForIndexing(final String stepTransliteration,
                                                            final boolean isGreek) {
        if (isGreek) {
            return GreekUtils.removeGreekTranslitMarkUpForIndexing(stepTransliteration);
        }

        // otherwise hebrew, so run the pattern to remove everything...
        return HebrewUtils.removeHebrewTranslitMarkUpForIndexing(stepTransliteration);
    }

    /**
     * Removes the starting H, if present (for greek transliterations only at present time), removes other symbols such
     * as letters with lines or dots, etc. Then runs a set of rules on both transliterations. See TYNSTEP-374 for the
     * rule definitions. At most {@value #MAX_TRANSLITERATIONS} options are returned.
     *
     * @param stepTransliteration the transliteration
     * @param isGreek             true if greek, false for hebrew
     * @return the possible transliterations, adapted for unaccented texts
     */
    public static List<TransliterationOption> adaptTransliterationForQuerying(
            final String stepTransliteration, final boolean isGreek) {
        if (isGreek) {
            return trimmedTranslits(multiplyTranslitOptions(removeGreekTranslitMarkUpForIndexing(stepTransliteration),
                    GreekUtils.getTransliterationRules()));
        }

        // otherwise hebrew, so run the pattern to remove everything...
        return trimmedTranslits(multiplyTranslitOptions(removeHebrewTranslitMarkUpForIndexing(stepTransliteration),
                HebrewUtils.getTransliterationRules()));
    }

    /**
     * Caps the number of options.
     *
     * @param options the transliteration options generated so far
     * @return a view ({@link List#subList}) of the first {@value #MAX_TRANSLITERATIONS} options at most
     */
    private static List<TransliterationOption> trimmedTranslits(final List<TransliterationOption> options) {
        return options.subList(0, Math.min(options.size(), MAX_TRANSLITERATIONS));
    }

    /**
     * Walks through the base string character by character, letting every rule expand the list of options at
     * each position. Options that no rule has moved past the current position get the current character
     * appended as is. Options that end up empty are dropped.
     *
     * @param baseString           a transliteration without any mark-up, must not be null
     * @param transliterationRules the rules to apply
     * @return all possible transliterations
     */
    public static List<TransliterationOption> multiplyTranslitOptions(final String baseString,
                                                                      final List<TransliterationRule> transliterationRules) {
        // it is important to remember that we strip out special characters here, so ensure that the rules
        // below do not conflict with the stripping of the mark-up

        // go letter by letter and apply rules
        // run a rule, and that gives me, a new set of prefixes, keep on running rules iterating through
        final StringBuilder base = new StringBuilder();

        final List<TransliterationOption> options = new ArrayList<TransliterationOption>();
        options.add(new TransliterationOption(0, base));

        final char[] baseChars = baseString.toCharArray();
        for (int ii = 0; ii < baseChars.length; ii++) {
            for (final TransliterationRule r : transliterationRules) {
                r.expand(options, baseChars, ii);
            }

            // update all options that are still on our current position, to bump them up
            for (final TransliterationOption leftBehind : options) {
                if (leftBehind.getNextValidPosition() == ii) {
                    leftBehind.getOption().append(baseChars[ii]);
                    leftBehind.setNextValidPosition(ii + 1);
                }
            }
        }

        //trim the empty options off, through the iterator so that we can remove while iterating
        for (Iterator<TransliterationOption> iterator = options.iterator(); iterator.hasNext(); ) {
            final TransliterationOption option = iterator.next();
            if (option.getOption().length() == 0) {
                iterator.remove();
            }
        }

        return options;
    }

    /**
     * Transliterates the word as Hebrew if {@link HebrewUtils#isHebrewText} says so, and as Greek otherwise.
     *
     * @param rawForm raw form of the word
     * @return the transliteration of the word given, or an empty string if the word is null or empty
     */
    public static String transliterate(final String rawForm) {
        if (rawForm == null || rawForm.length() == 0) {
            return "";
        }

        if (HebrewUtils.isHebrewText(rawForm)) {
            return HebrewUtils.transliterateHebrew(rawForm);
        }

        // then assume Greek: lower case, then decompose characters from breathing and accents (NFD)
        return GreekUtils.transliterateGreek(Normalizer.normalize(rawForm.toLowerCase(Locale.ENGLISH),
                Form.NFD));
    }

    /**
     * Starts with punctuation.
     *
     * @param s the string to examine
     * @return true, if the first character is a punctuation character; false if the string is empty
     */
    public static boolean startsWithPunctuation(final String s) {
        if (isEmpty(s)) {
            return false;
        }

        final char c = s.charAt(0);
        return isPunctuation(c);
    }

    /**
     * @param c the character to test
     * @return true if the character is one of , . ? / ; : ' !
     */
    private static boolean isPunctuation(final char c) {
        switch (c) {
            case ',':
            case '.':
            case '?':
            case '/':
            case ';':
            case ':':
            case '\'':
            case '!':
                return true;
            default:
                return false;
        }
    }

}