package org.xpect.text;
import java.util.List;
import org.eclipse.xtext.util.Strings;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
/**
 * Splits a string into coarse tokens according to the Unicode general category of its
 * characters.
 *
 * <p>
 * Every character is mapped to a {@link CharType}. A character is appended to the current token
 * when its type {@link CharType#canContinue(CharType) can continue} the type of the character
 * that <em>started</em> that token; otherwise the current token is closed and a new one begins.
 * Because the comparison is made against the token's first character, {@code "a1b"} is a single
 * token while {@code "1a"} is split into {@code "1"} and {@code "a"}.
 *
 * <p>
 * The tokens are consecutive substrings of the input, so concatenating them in order yields the
 * input again.
 */
public class GenericTokenizer implements Function<String, List<String>> {

    /**
     * Coarse character classes used to decide where one token ends and the next one begins.
     */
    public enum CharType {
        /** Letters; a letter only extends a token that was started by a letter. */
        LETTER {
            @Override
            public boolean canContinue(CharType previous) {
                switch (previous) {
                case LETTER:
                    return true;
                default:
                    return false;
                }
            }
        },
        /** Digits and other numeric characters; they extend tokens started by a letter or a number. */
        NUMBER {
            @Override
            public boolean canContinue(CharType previous) {
                switch (previous) {
                case LETTER:
                case NUMBER:
                    return true;
                default:
                    return false;
                }
            }
        },
        /** Separators and control characters; they only extend a token started by another such character. */
        SPACE {
            @Override
            public boolean canContinue(CharType previous) {
                switch (previous) {
                case SPACE:
                    return true;
                default:
                    return false;
                }
            }
        },
        /** Everything else; a symbol never extends a token, so it always starts a new one. */
        SYMBOL {
            @Override
            public boolean canContinue(CharType previous) {
                return false;
            }
        };

        /**
         * Classifies a single UTF-16 code unit.
         *
         * <p>
         * A surrogate code unit is classified as {@link #SYMBOL}. Use {@link #get(int)} with a
         * full code point to classify characters outside the Basic Multilingual Plane.
         *
         * @param c
         *            the code unit to classify
         * @return the type of {@code c}
         */
        public static CharType get(char c) {
            return get((int) c);
        }

        /**
         * Classifies a Unicode code point by its general category as reported by
         * {@link Character#getType(int)}.
         *
         * @param codePoint
         *            the code point to classify
         * @return the type of {@code codePoint}
         */
        public static CharType get(int codePoint) {
            switch (Character.getType(codePoint)) {
            case Character.TITLECASE_LETTER:
            case Character.UPPERCASE_LETTER:
            case Character.LOWERCASE_LETTER:
            case Character.OTHER_LETTER:
            case Character.MODIFIER_LETTER:
                return LETTER;
            case Character.LETTER_NUMBER:
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.OTHER_NUMBER:
                return NUMBER;
            case Character.LINE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
            case Character.SPACE_SEPARATOR:
            case Character.CONTROL:
                return SPACE;
            default:
                // Marks, punctuation, math/currency/modifier/other symbols, format characters,
                // private-use, surrogate and unassigned code points all count as symbols.
                return SYMBOL;
            }
        }

        /**
         * Tells whether a character of this type may be appended to a token whose first
         * character has the type {@code previous}.
         *
         * @param previous
         *            the type of the character that started the current token
         * @return {@code true} if the current token continues, {@code false} if a new token
         *         has to be started
         */
        public abstract boolean canContinue(CharType previous);
    }

    /**
     * Tokenizes {@code input}.
     *
     * <p>
     * The input is traversed by Unicode code point rather than by UTF-16 code unit, so a
     * character encoded as a surrogate pair is classified as a whole and is never torn into two
     * tokens that each hold half of the pair.
     *
     * @param input
     *            the text to split
     * @return a newly created list with the tokens in order of appearance; it is empty when
     *         {@code Strings.isEmpty(input)} holds
     */
    public List<String> apply(String input) {
        List<String> result = Lists.newArrayList();
        if (Strings.isEmpty(input)) {
            return result;
        }
        int length = input.length();
        int tokenStart = 0;
        int firstCodePoint = input.codePointAt(0);
        // Type of the character that opened the current token; deliberately not updated while
        // the token is being extended.
        CharType tokenType = CharType.get(firstCodePoint);
        int offset = Character.charCount(firstCodePoint);
        while (offset < length) {
            int codePoint = input.codePointAt(offset);
            CharType type = CharType.get(codePoint);
            if (!type.canContinue(tokenType)) {
                result.add(input.substring(tokenStart, offset));
                tokenStart = offset;
                tokenType = type;
            }
            // Step over both code units when the code point is a surrogate pair.
            offset += Character.charCount(codePoint);
        }
        // The last token is still open when the input ends.
        result.add(input.substring(tokenStart, length));
        return result;
    }
}