// LatinPlaceName.java example
//
// Explorer
// activityinfo-master
package org.activityinfo.core.shared.importing.match.names;

/**
 * Holds a normalized, parsed Latin-script place name as a flat array of
 * characters together with the offsets at which each part (word or number)
 * begins.
 *
 * <p>An instance is a reusable buffer: each call to {@link #set(String)}
 * discards the previous contents and parses the new input in place.
 * Input that exceeds {@link #MAX_LENGTH} normalized characters or
 * {@link #MAX_PARTS} parts is truncated at the limit rather than overflowing
 * the internal arrays.
 */
class LatinPlaceName {

    /** Maximum number of parts retained from a single name. */
    public static final int MAX_PARTS = 15;

    /** Maximum number of normalized characters retained from a single name. */
    public static final int MAX_LENGTH = 100;

    // Character classes used while splitting the input into parts
    private static final int LETTER_CLASS = 0;
    private static final int DIGIT_CLASS = 1;
    private static final int OTHER = -1;

    public static final String APOSTROPHE = "'";


    /**
     * Normalized characters, without any separating spaces.
     * Only the first {@code numChars} entries are meaningful; anything
     * beyond that may be left over from a previous call to {@code set}.
     */
    char[] chars = new char[MAX_LENGTH];

    /**
     * The offsets of the parts within {@code chars}. Entry {@code numParts}
     * holds the end offset of the last part.
     */
    private int[] partOffsets = new int[MAX_PARTS+1];

    /**
     * The number of chars in the normalized string
     */
    private int numChars;

    /**
     * The number of parts found
     */
    private int numParts;


    private final LatinCharacterNormalizer characterNormalizer;

    LatinPlaceName() {
        this.characterNormalizer = LatinCharacterNormalizer.get();
    }

    /**
     * Replaces the contents of this buffer with the normalized, parsed form
     * of {@code input}.
     *
     * <p>Each input character is passed through the character normalizer and
     * classified as letter, digit or other. Letters and digits are stored;
     * everything else only acts as a separator between parts.
     *
     * @param input the raw name; {@code null} is treated like the empty string
     */
    public void set(String input) {
        numParts = 0;
        numChars = 0;

        int currentClass = OTHER;
        int inputLength = (input == null) ? 0 : input.length();

        for(int i=0;i<inputLength;++i) {
            String ch = characterNormalizer.normalizeCharacter(input.substring(i, i + 1));

            // Defensive: if the normalizer yields nothing for this character
            // there is nothing to classify or store, so skip it without
            // affecting the current part.
            if(ch == null || ch.length() == 0) {
                continue;
            }

            // For words like N'Goutjina we just drop the apostrophe
            if(ch.equals(APOSTROPHE) && currentClass == LETTER_CLASS) {
                continue;
            }

            int characterClass = classify(ch);

            // Stop before overflowing the character buffer. Checking here,
            // ahead of the part bookkeeping, guarantees that we never
            // register a part that ends up with zero characters.
            if(characterClass != OTHER && numChars >= MAX_LENGTH) {
                break;
            }

            if(isBreak(currentClass, characterClass)) {
                // Stop before overflowing the offset table; the slot at
                // index MAX_PARTS is reserved for the final end offset.
                if(numParts >= MAX_PARTS) {
                    break;
                }
                partOffsets[numParts] = numChars;
                numParts++;
            }

            currentClass = characterClass;

            if(currentClass != OTHER) {
                // A single input character may normalize to several output
                // characters; copy as many as still fit.
                for(int j=0;j<ch.length() && numChars<MAX_LENGTH;++j) {
                    chars[numChars++] = ch.charAt(j);
                }
            }
        }

        // add a final offset for convenience
        partOffsets[numParts] = numChars;
    }

    /**
     * @return the number of parts found by the last call to {@link #set(String)}
     */
    public int partCount() {
        return numParts;
    }

    /**
     * @return true if no letters or digits were retained from the input
     */
    public boolean isEmpty() {
        return numChars == 0;
    }

    /**
     * @return the number of characters in part {@code partIndex}
     */
    public int charCount(int partIndex) {
        return partOffsets[partIndex+1] - partOffsets[partIndex];
    }

    /**
     * @return the total number of alphanumeric characters in this place name following normalization
     */
    public int charCount() {
        return numChars;
    }


    /**
     * @return the starting index of part {@code partIndex}; passing
     * {@link #partCount()} yields the end offset of the last part
     */
    public int partStart(int partIndex) {
        return partOffsets[partIndex];
    }

    /**
     *
     * @return true if the transition from character class {@code fromClass} to character
     * class {@code toClass} should be considered a break between components
     */
    private boolean isBreak(int fromClass, int toClass) {

        if(fromClass == toClass) {
            return false;
        }

        // OTHER => [LETTER, DIGIT] is always a break
        // (don't count [LETTER, DIGIT] -> OTHER as this will double
        //  count parts)
        if(fromClass == OTHER) {
            return true;
        }

        // We consider LETTER => DIGIT a break, for example:
        // "Commune2" should be understood as [commune, 2]
        if(fromClass == LETTER_CLASS && toClass == DIGIT_CLASS) {
            return true;
        }

        // But going from DIGIT => LETTER is probably not because
        // it could be a suffix like "2b" or an ordinal indicator
        // like "1st" or "2eme"
        return false;
    }

    /**
     * Classifies a normalized character as {@code LETTER}, {@code DIGIT}, or {@code OTHER},
     * based on the first char of {@code input}. Only ASCII digits and
     * upper-case ASCII letters are recognized; everything else is {@code OTHER}.
     *
     * @param input a non-empty normalized character string
     */
    private int classify(String input) {
        char ch = input.charAt(0);
        if(isDigit(ch)) {
            return DIGIT_CLASS;
        } else if(ch >= 'A' && ch <= 'Z') {
            return LETTER_CLASS;
        } else {
            return OTHER;
        }
    }

    /**
     * @return true if {@code ch} is an ASCII digit, '0' through '9'
     */
    private boolean isDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }

    /**
     * @return the characters of part {@code index} as a new String
     */
    public String part(int index) {
        return new String(chars, partOffsets[index], charCount(index));
    }

    /**
     * @return the character at position {@code charIndex} within part {@code partIndex}
     */
    public char charAt(int partIndex, int charIndex) {
        return chars[ partOffsets[partIndex] + charIndex ];
    }

    /**
     * @return true if part {@code partIndex} starts with an ASCII digit. Uses the same
     * digit test as the parser so that the two can never disagree.
     */
    public boolean isPartNumeric(int partIndex) {
        return isDigit(chars[partStart(partIndex)]);
    }

    /**
     * Parses the leading run of digits of part {@code index} as a decimal integer,
     * ignoring any trailing letters (for example the "b" in "2b").
     *
     * @throws NumberFormatException if the part does not start with a digit or
     * the digits do not fit in an {@code int}
     */
    public int parsePartAsInteger(int index) {
        int numberStart = partStart(index);
        int numberEnd = numberStart;
        int partEnd = partStart(index+1);
        while(numberEnd < partEnd && isDigit(chars[numberEnd])) {
            numberEnd++;
        }
        return Integer.parseInt(new String(chars, numberStart, numberEnd - numberStart));
    }

    /**
     * @return the parts in order, formatted as {@code [PART1, PART2, ...]}
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("[");
        for(int partIndex = 0; partIndex!=numParts;++partIndex) {
            if(partIndex > 0) {
                sb.append(", ");
            }
            sb.append(part(partIndex));
        }
        sb.append("]");
        return sb.toString();
    }

    /**
     * Hands the characters of part {@code index} (start offset inclusive, end offset
     * exclusive) to {@code RomanNumerals.tryDecodeRomanNumeral} and returns its result.
     * NOTE(review): the value returned for a part that is not a roman numeral is
     * defined by {@code RomanNumerals} - confirm there.
     */
    public int tryParsePartAsRomanNumeral(int index) {
        return RomanNumerals.tryDecodeRomanNumeral(chars, partOffsets[index], partOffsets[index+1]);
    }
}