/*
* Reference ETL Parser for Java
* Copyright (c) 2000-2009 Constantine A Plotnikov
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package net.sf.etl.parsers;
import java.math.BigInteger;
/**
* This class contains utilities useful for examining string token contents.
*
* @author const
*/
public final class LiteralUtils {
/**
 * Private constructor. This class only exposes static members, so the
 * constructor is hidden to prevent creation of class instances.
 */
private LiteralUtils() {
}
/**
 * Immutable description of a parsed number, as returned by
 * {@link LiteralUtils#parseNumber(String)}.
 */
public static class NumberInfo {
    /** The kind of the number token. */
    public final Tokens kind;
    /** The sign of the number (1 for positive numbers and -1 for negative). */
    public final int sign;
    /** The base of the number. */
    public final int base;
    /** The text of the number with underscores removed. */
    public final String text;
    /** The exponent (adjusted according to the dot position). */
    public final int exponent;
    /** The suffix attached to the number. */
    public final String suffix;

    /**
     * Creates a number description from its components.
     *
     * @param kind
     *            the kind of token
     * @param sign
     *            the sign of the number (1 for positive numbers and -1 for
     *            negative)
     * @param base
     *            the base of the number
     * @param text
     *            the text of the number with underscores removed
     * @param exponent
     *            the exponent associated with the token
     * @param suffix
     *            the suffix attached to the number
     */
    public NumberInfo(Tokens kind, int sign, int base, String text,
            int exponent, String suffix) {
        this.kind = kind;
        this.sign = sign;
        this.base = base;
        this.text = text;
        this.exponent = exponent;
        this.suffix = suffix;
    }
}
/**
 * Splits the text of a number token into its components (kind, sign, base,
 * digits, exponent and suffix).
 *
 * @param input
 *            the text of the number token
 * @return the information gathered about the number
 * @throws NumberFormatException
 *             if the parser rejects the text
 */
public static NumberInfo parseNumber(String input) {
    final NumberParser parser = new NumberParser(input);
    return parser.parse();
}
/**
 * Converts the text of an integer token (with or without suffix) to an
 * {@code int}. The digits are interpreted in the base declared by the
 * token and the suffix, if any, is ignored.
 *
 * @param intToken
 *            the integer token to parse
 * @return the parsed value
 * @throws NumberFormatException
 *             if the token is not an integer token or its value cannot be
 *             parsed as an {@code int}
 */
public static int parseInt(String intToken) {
    final NumberInfo info = parseNumber(intToken);
    final boolean integerKind = info.kind == Tokens.INTEGER
            || info.kind == Tokens.INTEGER_WITH_SUFFIX;
    if (!integerKind) {
        throw new NumberFormatException("wrong token kind: " + info.kind);
    }
    final String digits = info.sign == -1 ? "-" + info.text : info.text;
    return Integer.parseInt(digits, info.base);
}
/**
 * Parse text of floating point or integer token to double.
 * <p>
 * The digits of the token are converted to a double and then multiplied
 * (or divided, for a negative exponent) by the token base raised to the
 * magnitude of the dot-adjusted exponent.
 *
 * @param doubleToken
 *            a floating point or integer token to parse
 * @return parsed double
 * @throws NumberFormatException
 *             if the token is not a well-formed number
 */
public static double parseDouble(String doubleToken) {
    final NumberInfo n = parseNumber(doubleToken);
    final BigInteger digits = new BigInteger((n.sign >= 0 ? "" : "-")
            + n.text, n.base);
    final double mantissa = digits.doubleValue();
    if (mantissa == 0 || n.exponent == 0) {
        // Nothing to scale. Returning here also avoids 0 * Infinity (which
        // is NaN) when the exponent is too large for the scale to fit.
        return mantissa;
    }
    // Widen before taking the magnitude: Math.abs(Integer.MIN_VALUE) is
    // still negative in int arithmetic and would skip the scaling.
    final long steps = Math.abs((long) n.exponent);
    double scale = 1;
    // Once the scale saturates to Infinity further multiplications cannot
    // change it, so stop instead of iterating up to 2^31 times.
    for (long i = 0; i < steps && !Double.isInfinite(scale); i++) {
        scale *= n.base;
    }
    // NOTE(review): digits that alone exceed Double.MAX_VALUE combined with
    // a very negative exponent still evaluate Infinity / Infinity here.
    return n.exponent < 0 ? mantissa / scale : mantissa * scale;
}
/**
 * Parse text of string token to unicode characters. The string prefix (a
 * run of identifier characters before the opening quote) is ignored. Both
 * single-quoted and double-quoted tokens are supported, as well as the
 * multi-line form delimited by three quote characters.
 * <p>
 * Supported escapes (each introduced by a backslash): {@code n},
 * {@code r}, {@code t}, {@code f}, {@code b}; {@code x} followed by two
 * hexadecimal digits; lower case {@code u} followed by four hexadecimal
 * digits; upper case {@code U} followed by hexadecimal digits of a code
 * point terminated by {@code ;}. Any other escaped character stands for
 * itself. Characters outside of escapes, including surrogate code units,
 * are copied verbatim.
 * <p>
 * Note it is assumed that the token has been already parsed by the lexer,
 * so minimal additional validation is performed.
 *
 * @param stringToken
 *            a string token to parse or null
 * @return parsed string or null if null has been passed as argument
 * @throws IllegalArgumentException
 *             if the token is not quoted properly or contains a malformed
 *             escape sequence
 */
public static String parseString(String stringToken) {
    if (stringToken == null) {
        return null;
    }
    int n = stringToken.length();
    if (n < 2) {
        throw new IllegalArgumentException("Unexpected end of the token "
                + n);
    }
    // skip the optional prefix; the bound keeps a token without any quote
    // from running off the end of the string
    int i = 0;
    while (i < n
            && Character.isUnicodeIdentifierPart(stringToken.charAt(i))) {
        i++;
    }
    if (i >= n) {
        throw new IllegalArgumentException(
                "The string is in invalid format: " + stringToken);
    }
    final char quote = stringToken.charAt(i);
    if (quote != '\'' && quote != '"') {
        throw new IllegalArgumentException("Invalid quote character "
                + quote);
    }
    // three opening plus three closing quotes: an empty multi-line string
    // is exactly six characters after the prefix
    final boolean multiline = n >= 6 + i
            && stringToken.charAt(i + 1) == quote
            && stringToken.charAt(i + 2) == quote;
    // narrow [i, n) to the content between the quotes
    final int quoteLength = multiline ? 3 : 1;
    n -= quoteLength;
    i += quoteLength;
    if (i > n
            || stringToken.charAt(n) != quote
            || (multiline && (stringToken.charAt(n + 1) != quote
                    || stringToken.charAt(n + 2) != quote))) {
        throw new IllegalArgumentException(
                "The string is in invalid format: " + stringToken);
    }
    final StringBuilder rc = new StringBuilder(n - i);
    while (i < n) {
        char ch = stringToken.charAt(i++);
        if (ch != '\\') {
            // plain character; surrogate pairs pass through unit by unit
            rc.append(ch);
            continue;
        }
        if (i >= n) {
            throw new IllegalArgumentException(
                    "Unexpected end of the token " + i);
        }
        ch = stringToken.charAt(i++);
        switch (ch) {
        case 'U': {
            // variable length code point terminated by ';'
            final int end = stringToken.indexOf(';', i);
            if (end < 0 || end >= n) {
                throw new IllegalArgumentException(
                        "Unexpected end of the token " + i);
            }
            rc.appendCodePoint(parseHexDigits(stringToken, i, end, n));
            i = end + 1;
            break;
        }
        case 'u':
            rc.append((char) parseHexDigits(stringToken, i, i + 4, n));
            i += 4;
            break;
        case 'x':
            rc.append((char) parseHexDigits(stringToken, i, i + 2, n));
            i += 2;
            break;
        case 'n':
            rc.append('\n');
            break;
        case 'r':
            rc.append('\r');
            break;
        case 't':
            rc.append('\t');
            break;
        case 'f':
            rc.append('\f');
            break;
        case 'b':
            rc.append('\b');
            break;
        default:
            // escaped quote, backslash or any other character
            rc.append(ch);
        }
    }
    return rc.toString();
}

/**
 * Parses the hexadecimal digits of an escape sequence.
 *
 * @param token
 *            the complete string token
 * @param start
 *            index of the first digit (inclusive)
 * @param end
 *            index after the last digit (exclusive)
 * @param limit
 *            index after the last content character of the token
 * @return the value of the digits
 * @throws IllegalArgumentException
 *             if the digits run past the limit, are missing, contain a
 *             non-hexadecimal character, or do not fit into an int
 */
private static int parseHexDigits(String token, int start, int end,
        int limit) {
    if (end > limit) {
        throw new IllegalArgumentException("Unexpected end of the token "
                + limit);
    }
    if (start >= end) {
        throw new IllegalArgumentException(
                "Missing digits in escape sequence at " + start);
    }
    for (int k = start; k < end; k++) {
        final char c = token.charAt(k);
        final boolean hex = ('0' <= c && c <= '9')
                || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
        if (!hex) {
            throw new IllegalArgumentException(
                    "Invalid symbol in escape sequence " + c);
        }
    }
    // only ASCII hex digits remain, so no sign can reach parseInt
    return Integer.parseInt(token.substring(start, end), 16);
}
/**
 * This is a parser of number. It is loosely based on lexer code. An
 * instance parses a single input text; call {@link #parse()} once.
 */
private static class NumberParser {
    /**
     * Buffer used for consuming characters
     */
    private final StringBuilder buffer = new StringBuilder();
    /**
     * Input text
     */
    private final String inputText;
    /**
     * position in input text
     */
    private int pos = 0;
    /**
     * number base
     */
    private int base = 10;
    /**
     * A sign of the number
     */
    private int sign = 1;
    /**
     * Exponent
     */
    private int exponent = 0;
    /**
     * a suffix attached to number, stays null when there is no suffix
     */
    private String suffix;
    /**
     * digits of the number without dot and underscore
     */
    private String text;

    /**
     * A constructor for parser
     *
     * @param inputText
     *            the text of the number to parse
     */
    NumberParser(String inputText) {
        this.inputText = inputText;
    }

    /**
     * Look at character ahead of the current position
     *
     * @param n
     *            position relatively to current.
     * @return -1 if the position is past the end of the string, otherwise
     *         the character at that position.
     */
    private int la(int n) {
        final int index = pos + n;
        return index >= inputText.length() ? -1 : inputText.charAt(index);
    }

    /**
     * Look at character
     *
     * @return -1 if end of string or character at the current position.
     */
    private int la() {
        return la(0);
    }

    /**
     * check if next symbol match specified
     *
     * @param ch
     *            character to match
     * @return true if character is matched
     */
    private boolean lach(char ch) {
        return la() == ch;
    }

    /**
     * Consume character and possibly add it to buffer.
     *
     * @param addToBuffer
     *            if true, the consumed character is appended to the buffer
     * @throws NumberFormatException
     *             if there is no character left to consume
     */
    private void consume(boolean addToBuffer) {
        // pos == length is already past the last character
        if (pos >= inputText.length()) {
            throw new NumberFormatException(
                    "Unexpected end of the number: " + inputText);
        }
        if (addToBuffer) {
            buffer.append(inputText.charAt(pos));
        }
        pos++;
    }

    /**
     * Return the characters collected so far and reset the buffer.
     *
     * @return the content of the buffer
     */
    private String takeBuffer() {
        final String collected = buffer.toString();
        buffer.setLength(0);
        return collected;
    }

    /**
     * check if symbol at look ahead position is a decimal digit
     *
     * @param n
     *            look ahead position
     * @return true if the symbol is digit
     * @since 0.0.1
     */
    private boolean laDigit(int n) {
        final int ch = la(n);
        return '0' <= ch && ch <= '9';
    }

    /**
     * check if next symbol is digit
     *
     * @return true if next symbol is digit
     * @since 0.0.1
     */
    private boolean laDigit() {
        return laDigit(0);
    }

    /**
     * look ahead alpha
     *
     * @return true if next symbol is an ASCII letter
     */
    private boolean laAlpha() {
        final int ch = la();
        return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
    }

    /**
     * Parse the base of a based number and check that it is in the range
     * from 2 to 36.
     *
     * @param digits
     *            the digits preceding the first '#'
     * @return the parsed base
     * @throws NumberFormatException
     *             if the base is missing, malformed, or out of range
     */
    private static int parseBase(String digits) {
        final int parsed;
        try {
            parsed = Integer.parseInt(digits);
        } catch (final NumberFormatException ex) {
            final NumberFormatException failure = new NumberFormatException(
                    "Invalid base of the number: " + digits);
            failure.initCause(ex);
            throw failure;
        }
        if (2 > parsed || parsed > 36) {
            throw new NumberFormatException(
                    "The base is out of range 2..36: " + parsed);
        }
        return parsed;
    }

    /**
     * Check that the character is a digit that conforms to the current
     * base.
     *
     * @param ch
     *            the character to check
     * @throws NumberFormatException
     *             if the character is not a digit of the base
     */
    private void checkBasedDigit(int ch) {
        final boolean valid;
        if (base <= 10) {
            valid = '0' <= ch && ch < '0' + base;
        } else {
            valid = ('0' <= ch && ch <= '9')
                    || ('a' <= ch && ch < 'a' + base - 10)
                    || ('A' <= ch && ch < 'A' + base - 10);
        }
        if (!valid) {
            throw new NumberFormatException("The digit '" + (char) ch
                    + "' does not conform to the base " + base);
        }
    }

    /**
     * Parse the input text: optional sign, digits (either plain decimal
     * or {@code base#digits#}), optional exponent, and optional suffix.
     *
     * @return parsed number
     * @throws NumberFormatException
     *             if the text is malformed or has trailing characters
     */
    NumberInfo parse() {
        Tokens kind = Tokens.INTEGER;
        // amount of digits collected before the dot, -1 if there is no dot
        int beforeDot = -1;
        // parsing integer of decimal, or floating point number
        if (lach('+')) {
            consume(false);
        } else if (lach('-')) {
            sign = -1;
            consume(false);
        }
        while (laDigit() || lach('_')) {
            consume(!lach('_')); // '0'
        }
        if (lach('#')) {
            // based number: digits collected so far are the base
            base = parseBase(takeBuffer());
            consume(false); // '\#'
            while (laDigit() || laAlpha() || lach('.') || lach('_')) {
                final int ch = la();
                if (ch == '.') {
                    if (kind == Tokens.FLOAT) {
                        throw new NumberFormatException(
                                "More than one dot in the number: "
                                        + inputText);
                    }
                    beforeDot = buffer.length();
                    kind = Tokens.FLOAT;
                    consume(false);
                } else if (ch == '_') {
                    consume(false);
                } else {
                    checkBasedDigit(ch);
                    consume(true);
                }
            }
            if (!lach('#')) {
                throw new NumberFormatException(
                        "The closing '#' is missing: " + inputText);
            }
            consume(false);
            text = takeBuffer();
        } else {
            // parse non based integer
            if (lach('.') && laDigit(1)) {
                // floating point number
                kind = Tokens.FLOAT;
                beforeDot = buffer.length();
                consume(false); // '.'
                consume(true);
                while (laDigit()) {
                    consume(true); // '0'
                }
            }
            text = takeBuffer();
        }
        if (lach('e') || lach('E')) {
            kind = Tokens.FLOAT;
            consume(false); // 'e'
            if (lach('+') || lach('-')) {
                // only the minus sign is kept for Integer.parseInt
                consume(lach('-'));
            }
            if (!laDigit()) {
                throw new NumberFormatException(
                        "The exponent has no digits: " + inputText);
            }
            while (laDigit() || lach('_')) {
                consume(!lach('_')); // digit
            }
            exponent = Integer.parseInt(buffer.toString());
            buffer.setLength(0);
        }
        // digits after the dot shift the exponent down
        exponent -= beforeDot == -1 ? 0 : text.length() - beforeDot;
        if (laAlpha() && !lach('E') && !lach('e')) {
            if (kind == Tokens.FLOAT) {
                kind = Tokens.FLOAT_WITH_SUFFIX;
            } else {
                kind = Tokens.INTEGER_WITH_SUFFIX;
            }
            do {
                consume(true);
            } while (laAlpha() || lach('_') || laDigit());
            suffix = takeBuffer();
        }
        if (pos != inputText.length()) {
            throw new NumberFormatException(
                    "Some characters left in the string "
                            + (inputText.length() - pos));
        }
        return new NumberInfo(kind, sign, base, text, exponent, suffix);
    }
}
}