package org.activityinfo.core.shared.importing.match.names;
/**
* Contains a fully normalized and parsed Latin place name as
* an array of characters with pointers to the beginnings of parts.
*/
class LatinPlaceName {

    /**
     * Maximum number of parts retained by {@link #set(String)}; further parts are dropped.
     */
    public static final int MAX_PARTS = 15;

    /**
     * Maximum number of normalized characters retained by {@link #set(String)};
     * further characters are dropped.
     */
    public static final int MAX_LENGTH = 100;

    // Character classes used while splitting a name into parts.
    private static final int LETTER_CLASS = 0;
    private static final int DIGIT_CLASS = 1;
    private static final int OTHER = -1;

    public static final String APOSTROPHE = "'";

    /**
     * Normalized characters, without any separating spaces
     */
    char[] chars = new char[MAX_LENGTH];

    /**
     * The offsets of the parts. Holds one more entry than there are parts:
     * entry {@code numParts} is the end offset of the last part.
     */
    private final int[] partOffsets = new int[MAX_PARTS + 1];

    /**
     * The number of chars in the normalized string
     */
    private int numChars;

    /**
     * The number of parts found
     */
    private int numParts;

    private final LatinCharacterNormalizer characterNormalizer;

    LatinPlaceName() {
        this.characterNormalizer = LatinCharacterNormalizer.get();
    }

    /**
     * Replaces the contents of this instance with the normalized, parsed form of {@code input}.
     *
     * <p>Each input character is passed through the character normalizer and then classified.
     * Characters classified as letters or digits are stored; all others are discarded and act
     * as separators between parts. An apostrophe that directly follows a letter is dropped
     * without ending the current part.
     *
     * <p>At most {@link #MAX_LENGTH} characters and {@link #MAX_PARTS} parts are retained.
     * Input beyond either limit is ignored rather than overflowing the internal buffers.
     *
     * @param input the raw name to parse; must not be {@code null}
     */
    public void set(String input) {
        numParts = 0;
        numChars = 0;
        int currentClass = OTHER;

        // Once the character buffer is full nothing more can be stored, so stop scanning.
        for (int i = 0; i != input.length() && numChars < MAX_LENGTH; ++i) {
            String ch = characterNormalizer.normalizeCharacter(input.substring(i, i + 1));

            // For words like N'Goutjina we just drop the apostrophe
            if (ch.equals(APOSTROPHE) && currentClass == LETTER_CLASS) {
                continue;
            }

            int characterClass = classify(ch);
            if (isBreak(currentClass, characterClass)) {
                if (numParts == MAX_PARTS) {
                    // No room to record another part: ignore the remainder of the input.
                    break;
                }
                partOffsets[numParts] = numChars;
                numParts++;
            }
            currentClass = characterClass;

            if (currentClass != OTHER) {
                // The normalizer's result is copied char by char, so a result longer than
                // one char is handled; the copy is cut short if the buffer fills up.
                for (int j = 0; j != ch.length() && numChars < MAX_LENGTH; ++j) {
                    chars[numChars++] = ch.charAt(j);
                }
            }
        }

        // add a final offset for convenience
        partOffsets[numParts] = numChars;
    }

    /**
     * @return the number of parts found by the last call to {@link #set(String)}
     */
    public int partCount() {
        return numParts;
    }

    /**
     * @return true if no letters or digits were retained by the last call to {@link #set(String)}
     */
    public boolean isEmpty() {
        return numChars == 0;
    }

    /**
     * @return the number of characters in part {@code partIndex}
     */
    public int charCount(int partIndex) {
        return partOffsets[partIndex + 1] - partOffsets[partIndex];
    }

    /**
     * @return the total number of alphanumeric characters in this place name following normalization
     */
    public int charCount() {
        return numChars;
    }

    /**
     * @return the starting index of part {@code partIndex}
     */
    public int partStart(int partIndex) {
        return partOffsets[partIndex];
    }

    /**
     *
     * @return true if the transition from character class {@code fromClass} to character
     * class {@code toClass} should be considered a break between components
     */
    private boolean isBreak(int fromClass, int toClass) {
        if (fromClass == toClass) {
            return false;
        }

        // OTHER => [LETTER, DIGIT] is always a break
        // (don't count [LETTER, DIGIT] -> OTHER as this will double
        // count parts)
        if (fromClass == OTHER) {
            return true;
        }

        // We consider LETTER => DIGIT a break, for example:
        // "Commune2" should be understood as [commune, 2]
        // But going from DIGIT => LETTER is probably not because
        // it could be a suffix like "2b" or an ordinal indicator
        // like "1st" or "2eme"
        return fromClass == LETTER_CLASS && toClass == DIGIT_CLASS;
    }

    /**
     * Classifies a normalized character as {@code LETTER}, {@code DIGIT}, or {@code OTHER},
     * looking only at the first char of {@code input}. Only {@code 'A'..'Z'} count as letters
     * and only {@code '0'..'9'} as digits. An empty string is classified as {@code OTHER}.
     */
    private int classify(String input) {
        if (input.length() == 0) {
            return OTHER;
        }
        char ch = input.charAt(0);
        if (isDigit(ch)) {
            return DIGIT_CLASS;
        } else if (ch >= 'A' && ch <= 'Z') {
            return LETTER_CLASS;
        } else {
            return OTHER;
        }
    }

    /**
     * @return true if {@code ch} is an ASCII digit, {@code '0'..'9'}
     */
    private boolean isDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }

    /**
     * @return the characters of part {@code index} as a new string
     */
    public String part(int index) {
        return new String(chars, partOffsets[index], charCount(index));
    }

    /**
     * @return the character at position {@code charIndex} within part {@code partIndex}
     */
    public char charAt(int partIndex, int charIndex) {
        return chars[partOffsets[partIndex] + charIndex];
    }

    /**
     * @return true if the first character of part {@code partIndex} is a digit
     */
    public boolean isPartNumeric(int partIndex) {
        // Use the same ASCII-only test as classify() so both agree on what a digit is.
        return isDigit(chars[partStart(partIndex)]);
    }

    /**
     * Parses the leading run of digits of part {@code index} as an integer. Any trailing
     * letters (such as the "b" in "2b") are ignored.
     *
     * @return the integer value of the leading digits
     * @throws NumberFormatException if the part does not start with a digit, or if the
     * digits do not fit in an {@code int}
     */
    public int parsePartAsInteger(int index) {
        int numberStart = partStart(index);
        int numberEnd = numberStart;
        int partEnd = partStart(index + 1);
        while (numberEnd < partEnd && isDigit(chars[numberEnd])) {
            numberEnd++;
        }
        return Integer.parseInt(new String(chars, numberStart, numberEnd - numberStart));
    }

    /**
     * @return the parts in order, formatted as {@code [part0, part1, ...]}
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("[");
        for (int partIndex = 0; partIndex != numParts; ++partIndex) {
            if (partIndex > 0) {
                sb.append(", ");
            }
            sb.append(part(partIndex));
        }
        sb.append("]");
        return sb.toString();
    }

    /**
     * Hands the character range of part {@code index} to
     * {@code RomanNumerals.tryDecodeRomanNumeral}.
     *
     * @return whatever {@code RomanNumerals.tryDecodeRomanNumeral} returns for that range
     */
    public int tryParsePartAsRomanNumeral(int index) {
        return RomanNumerals.tryDecodeRomanNumeral(chars, partOffsets[index], partOffsets[index + 1]);
    }
}