LiteralUtils.java example

Explorer
etl-java-master
/*
 * Reference ETL Parser for Java
 * Copyright (c) 2000-2009 Constantine A Plotnikov
 *
 * Permission is hereby granted, free of charge, to any person 
 * obtaining a copy of this software and associated documentation 
 * files (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
 * SOFTWARE. 
 */
package net.sf.etl.parsers;

import java.math.BigDecimal;
import java.math.BigInteger;

/**
 * This class contains utilities useful for examining string token contents.
 * 
 * @author const
 */
public final class LiteralUtils {

	/**
	 * Private constructor that prevents creation of class instances; the
	 * class only exposes static utility methods.
	 */
	private LiteralUtils() {
	}

	/**
	 * Immutable description of a number token, as produced by
	 * {@link LiteralUtils#parseNumber(String)}.
	 */
	public static class NumberInfo {
		/** The kind of the number token. */
		public final Tokens kind;
		/** The sign of the number: 1 for positive numbers and -1 for negative. */
		public final int sign;
		/** The base of the number. */
		public final int base;
		/** The text of the number with underscores removed. */
		public final String text;
		/** The exponent, adjusted according to the position of the dot. */
		public final int exponent;
		/** The suffix attached to the number. */
		public final String suffix;

		/**
		 * Creates the description of a number.
		 * 
		 * @param kind
		 *            the kind of the token
		 * @param sign
		 *            the sign of the number (1 for positive numbers and -1 for
		 *            negative)
		 * @param base
		 *            the base of the number
		 * @param text
		 *            the text of the number with underscores removed
		 * @param exponenet
		 *            the exponent associated with the token
		 * @param suffix
		 *            the suffix attached to the number
		 */
		public NumberInfo(Tokens kind, int sign, int base, String text,
				int exponenet, String suffix) {
			this.kind = kind;
			this.sign = sign;
			this.base = base;
			this.text = text;
			this.exponent = exponenet;
			this.suffix = suffix;
		}
	}

	/**
	 * Analyses the text of a number token and splits it into its parts.
	 * 
	 * @param input
	 *            the text of the number token
	 * @return the description of the parsed number
	 */
	public static NumberInfo parseNumber(String input) {
		final NumberParser parser = new NumberParser(input);
		return parser.parse();
	}

	/**
	 * Converts the text of an integer token to an {@code int} value. A suffix
	 * attached to the token, if any, does not take part in the conversion.
	 * 
	 * @param intToken
	 *            the text of the integer token
	 * @return the value of the token
	 * @throws NumberFormatException
	 *             if the token is not an integer token or its digits cannot
	 *             be converted by {@link Integer#parseInt(String, int)}
	 */
	public static int parseInt(String intToken) {
		final NumberInfo info = parseNumber(intToken);
		final boolean integerToken = info.kind == Tokens.INTEGER
				|| info.kind == Tokens.INTEGER_WITH_SUFFIX;
		if (!integerToken) {
			throw new NumberFormatException("wrong token kind: " + info.kind);
		}
		// the parser strips the sign from the digits, so put it back
		final String signedDigits = info.sign == -1 ? "-" + info.text
				: info.text;
		return Integer.parseInt(signedDigits, info.base);
	}

	/**
	 * Parse text of floating point or integer token to double.
	 * <p>
	 * The value is computed as {@code sign * digits * base^exponent}, where
	 * the parts are taken from {@link #parseNumber(String)}. Decimal numbers
	 * are converted through {@link BigDecimal}, so a large mantissa combined
	 * with a large negative exponent (or a zero mantissa combined with a large
	 * positive exponent) no longer degrades to {@code 0} or {@code NaN}
	 * because of an overflowing intermediate scale factor. For other bases
	 * the scale factor is computed with {@link Math#pow(double, double)}
	 * instead of a loop that runs once per unit of the exponent.
	 * 
	 * @param doubleToken
	 *            a floating point or integer token to parse
	 * @return parsed double; the sign of the token is preserved even when the
	 *         magnitude is zero
	 * @throws NumberFormatException
	 *             if the token is not a well-formed number or has no digits
	 */
	public static double parseDouble(String doubleToken) {
		final NumberInfo n = parseNumber(doubleToken);
		// the digits are parsed unsigned; the sign is applied at the very end
		// so that a token like "-0.0" yields a negative zero
		final BigInteger digits = new BigInteger(n.text, n.base);
		final double magnitude;
		if (digits.signum() == 0) {
			// zero stays zero for any exponent (avoids 0 * Infinity = NaN)
			magnitude = 0.0;
		} else if (n.exponent == 0) {
			magnitude = digits.doubleValue();
		} else if (n.base == 10) {
			// BigDecimal scale is the negated exponent; negating
			// Integer.MIN_VALUE overflows, so clamp it (the value underflows
			// to zero in either case)
			final int scale = n.exponent == Integer.MIN_VALUE ? Integer.MAX_VALUE
					: -n.exponent;
			magnitude = new BigDecimal(digits, scale).doubleValue();
		} else {
			final double mantissa = digits.doubleValue();
			// cast before abs(): Math.abs(Integer.MIN_VALUE) is negative
			final double factor = Math.pow(n.base, Math
					.abs((double) n.exponent));
			magnitude = n.exponent < 0 ? mantissa / factor : mantissa * factor;
		}
		return n.sign < 0 ? -magnitude : magnitude;
	}

	/**
	 * Parse text of string token to unicode characters. The string prefix
	 * (a run of identifier characters before the opening quote) is ignored.
	 * Both single-line tokens (one quote character on each side) and
	 * multiline tokens (three quote characters on each side) are supported;
	 * the quote character is either {@code '} or {@code "}.
	 * <p>
	 * The following escape sequences are decoded: {@code \n}, {@code \r},
	 * {@code \t}, {@code \f}, {@code \b}, {@code \xHH}, {@code \}{@code uHHHH}
	 * and {@code \UH..H;} (a hexadecimal code point terminated by a
	 * semicolon). Any other escaped character stands for itself. Characters
	 * outside of escape sequences, including surrogate pairs, are copied as
	 * they are.
	 * <p>
	 * Note it is assumed that the token has been already parsed by the lexer,
	 * so minimal additional validation is performed.
	 * 
	 * @param stringToken
	 *            a string token to parse or null
	 * @return parsed string or null if null has been passed as argument
	 * @throws IllegalArgumentException
	 *             if the token is not a well-formed string token
	 */
	public static String parseString(String stringToken) {
		if (stringToken == null) {
			return null;
		}
		final int length = stringToken.length();
		if (length < 2) {
			throw new IllegalArgumentException("Unexpected end of the token "
					+ length);
		}
		// skip the prefix; the bound check keeps a token without any quote
		// from running past the end of the string
		int i = 0;
		while (i < length
				&& Character.isUnicodeIdentifierPart(stringToken.charAt(i))) {
			i++;
		}
		if (i == length) {
			throw new IllegalArgumentException(
					"The string is in invalid format: " + stringToken);
		}
		final char quote = stringToken.charAt(i);
		if (quote != '\'' && quote != '"') {
			throw new IllegalArgumentException("Invalid quote character "
					+ quote);
		}
		// a multiline token needs at least three opening and three closing
		// quotes, so six characters after the prefix is the minimum
		final boolean multiline = length >= 6 + i
				&& stringToken.charAt(i + 1) == quote
				&& stringToken.charAt(i + 2) == quote;
		final int quoteSize = multiline ? 3 : 1;
		// index of the first closing quote, i.e. the end of the content
		final int end = length - quoteSize;
		i += quoteSize;
		if (i > end
				|| stringToken.charAt(end) != quote
				|| (multiline && (stringToken.charAt(end + 1) != quote || stringToken
						.charAt(end + 2) != quote))) {
			throw new IllegalArgumentException(
					"The string is in invalid format: " + stringToken);
		}
		final StringBuilder rc = new StringBuilder(end - i);
		while (i < end) {
			char ch = stringToken.charAt(i++);
			if (ch != '\\') {
				// ordinary character; surrogates are copied unit by unit, so
				// a pair in the token stays a pair in the result
				rc.append(ch);
				continue;
			}
			if (i >= end) {
				throw new IllegalArgumentException(
						"Unexpected end of the token " + i);
			}
			ch = stringToken.charAt(i++);
			switch (ch) {
			case 'U':
				// variable length code point terminated by ';'
				final int semicolon = stringToken.indexOf(';', i);
				if (semicolon < 0 || semicolon >= end) {
					throw new IllegalArgumentException(
							"Unexpected end of the token " + i);
				}
				rc.appendCodePoint(hexEscapeValue(stringToken, i, semicolon));
				i = semicolon + 1;
				break;
			case 'u':
				// exactly four hexadecimal digits inside of the content
				if (i + 4 > end) {
					throw new IllegalArgumentException(
							"Unexpected end of the token " + i);
				}
				rc.append((char) hexEscapeValue(stringToken, i, i + 4));
				i += 4;
				break;
			case 'x':
				// exactly two hexadecimal digits inside of the content
				if (i + 2 > end) {
					throw new IllegalArgumentException(
							"Unexpected end of the token " + i);
				}
				rc.append((char) hexEscapeValue(stringToken, i, i + 2));
				i += 2;
				break;
			case 'n':
				rc.append('\n');
				break;
			case 'r':
				rc.append('\r');
				break;
			case 't':
				rc.append('\t');
				break;
			case 'f':
				rc.append('\f');
				break;
			case 'b':
				rc.append('\b');
				break;
			default:
				// escaped quote, backslash or any other character
				rc.append(ch);
			}
		}
		return rc.toString();
	}

	/**
	 * Converts the hexadecimal digits of an escape sequence to a number.
	 * 
	 * @param token
	 *            the token that contains the escape sequence
	 * @param from
	 *            the index of the first digit (inclusive)
	 * @param to
	 *            the index after the last digit (exclusive)
	 * @return the value of the digits, never greater than
	 *         {@link Character#MAX_CODE_POINT}
	 * @throws IllegalArgumentException
	 *             if there are no digits, a character is not an ASCII
	 *             hexadecimal digit, or the value is too large
	 */
	private static int hexEscapeValue(String token, int from, int to) {
		if (from >= to) {
			throw new IllegalArgumentException(
					"Empty escape sequence at " + from);
		}
		int value = 0;
		for (int k = from; k < to; k++) {
			final char ch = token.charAt(k);
			final int digit;
			if ('0' <= ch && ch <= '9') {
				digit = ch - '0';
			} else if ('a' <= ch && ch <= 'f') {
				digit = ch - 'a' + 10;
			} else if ('A' <= ch && ch <= 'F') {
				digit = ch - 'A' + 10;
			} else {
				throw new IllegalArgumentException(
						"Invalid symbol in escape sequence " + ch);
			}
			value = value * 16 + digit;
			// checking on every step also keeps the value from overflowing
			if (value > Character.MAX_CODE_POINT) {
				throw new IllegalArgumentException(
						"Too large codepoint in escape sequence at " + from);
			}
		}
		return value;
	}

	/**
	 * This is a parser of number. It is loosely based on lexer code. An
	 * instance parses a single input text and is used only once: create it,
	 * call {@link #parse()} and discard it.
	 */
	private static final class NumberParser {
		/**
		 * Buffer used for consuming characters
		 */
		private final StringBuilder buffer = new StringBuilder();
		/**
		 * Input text
		 */
		private final String inputText;
		/**
		 * position in input text
		 */
		private int pos = 0;
		/**
		 * number base
		 */
		private int base = 10;
		/**
		 * A sign of the number
		 */
		private int sign = 1;
		/**
		 * Exponent
		 */
		private int exponent = 0;
		/**
		 * a suffix attached to number; stays null if there is no suffix
		 */
		private String suffix;
		/**
		 * digits of the number without dot and underscore
		 */
		private String text;

		/**
		 * A constructor for parser
		 * 
		 * @param inputText
		 *            the text of the number to parse
		 */
		NumberParser(String inputText) {
			this.inputText = inputText;
		}

		/**
		 * Look at character
		 * 
		 * @param n
		 *            position relatively to current.
		 * @return -1 if the position is after the end of the input, the
		 *         character at the position otherwise.
		 */
		private int la(int n) {
			final int index = pos + n;
			return index >= inputText.length() ? -1 : inputText.charAt(index);
		}

		/**
		 * Look at character
		 * 
		 * @return -1 if end of string or character at the current position.
		 */
		private int la() {
			return la(0);
		}

		/**
		 * check if next symbol match specified
		 * 
		 * @param ch
		 *            character to match
		 * @return true if character is matched
		 */
		private boolean lach(char ch) {
			return la() == ch;
		}

		/**
		 * Consume character and possibly add it to buffer.
		 * 
		 * @param addToBuffer
		 *            if true, the consumed character is appended to the
		 *            buffer, otherwise it is just skipped
		 * @throws NumberFormatException
		 *             if there is no character left to consume
		 */
		private void consume(boolean addToBuffer) {
			// ">=" rather than ">": position equal to the length is already
			// past the last character
			if (pos >= inputText.length()) {
				throw new NumberFormatException(
						"Unexpected end of the number: " + inputText);
			}
			if (addToBuffer) {
				buffer.append(inputText.charAt(pos));
			}
			pos++;
		}

		/**
		 * Return the content of the buffer and make the buffer empty.
		 * 
		 * @return the characters collected in the buffer so far
		 */
		private String takeBuffer() {
			final String content = buffer.toString();
			buffer.setLength(0);
			return content;
		}

		/**
		 * check if next symbol is digit
		 * 
		 * @param n
		 *            look ahead position
		 * @return true if next symbol is digit
		 * @since 0.0.1
		 */
		private boolean laDigit(int n) {
			final int ch = la(n);
			return '0' <= ch && ch <= '9';
		}

		/**
		 * check if next symbol is digit
		 * 
		 * @return true if next symbol is digit
		 * @since 0.0.1
		 */
		private boolean laDigit() {
			return laDigit(0);
		}

		/**
		 * look ahead alpha
		 * 
		 * @return true if letter
		 */
		private boolean laAlpha() {
			final int ch = la();
			return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
		}

		/**
		 * Check if the character is a digit that conforms to the current
		 * base.
		 * 
		 * @param ch
		 *            the character to check
		 * @return true if the character is a valid digit of the base
		 */
		private boolean isDigitOfBase(int ch) {
			if (base <= 10) {
				return '0' <= ch && ch < '0' + base;
			}
			return ('0' <= ch && ch <= '9')
					|| ('a' <= ch && ch < 'a' + base - 10)
					|| ('A' <= ch && ch < 'A' + base - 10);
		}

		/**
		 * Parse the input text. The steps are: optional sign, digits (which
		 * are the base if followed by '#'), digits of a based number up to
		 * the closing '#' or an optional fraction of a decimal number,
		 * optional exponent and optional suffix.
		 * 
		 * @return parsed number
		 * @throws NumberFormatException
		 *             if the input does not match the expected format or
		 *             some characters are left after the number
		 */
		NumberInfo parse() {
			Tokens kind = Tokens.INTEGER;
			// amount of digits collected before the dot, -1 if no dot
			int beforeDot = -1;
			// parsing integer of decimal, or floating point number
			if (lach('+')) {
				consume(false);
			} else if (lach('-')) {
				sign = -1;
				consume(false);
			}
			while (laDigit() || lach('_')) {
				consume(!lach('_')); // '0'
			}
			if (lach('#')) {
				// based number, the digits collected so far are the base
				try {
					base = Integer.parseInt(buffer.toString());
				} catch (final NumberFormatException ex) {
					throw new NumberFormatException(
							"Invalid base of the number: " + inputText);
				}
				if (2 > base || base > 36) {
					throw new NumberFormatException(
							"The base is out of range 2..36: " + base);
				}
				buffer.setLength(0);
				consume(false); // '\#'
				while (laDigit() || laAlpha() || lach('.') || lach('_')) {
					final int ch = la();
					if (ch == '.') {
						beforeDot = buffer.length();
						if (kind == Tokens.FLOAT) {
							throw new NumberFormatException(
									"More than one dot in the number: "
											+ inputText);
						}
						kind = Tokens.FLOAT;
					} else if (ch != '_') {
						// check if digit conform to the base
						if (!isDigitOfBase(ch)) {
							throw new NumberFormatException(
									"The digit does not conform to the base "
											+ base + ": " + (char) ch);
						}
					}
					// neither dot nor underscore become a part of the text
					consume(ch != '_' && ch != '.');
				} // end while
				if (lach('#')) {
					consume(false);
					text = takeBuffer();
				} else {
					throw new NumberFormatException(
							"The closing '#' is missing: " + inputText);
				}
			} else {
				// parse non based integer
				if (lach('.') && laDigit(1)) {
					// floating point number
					kind = Tokens.FLOAT;
					beforeDot = buffer.length();
					consume(false); // '.'
					consume(true);
					while (laDigit()) {
						consume(true); // '0'
					}
				}
				text = takeBuffer();
			}
			if (lach('e') || lach('E')) {
				kind = Tokens.FLOAT;
				consume(false); // 'e'
				if (lach('+') || lach('-')) {
					// only the minus is kept, it is parsed with the digits
					consume(lach('-'));
				}
				if (!laDigit()) {
					throw new NumberFormatException(
							"The exponent has no digits: " + inputText);
				}
				while (laDigit() || lach('_')) {
					consume(!lach('_')); // digit
				}
				exponent = Integer.parseInt(takeBuffer());
			}
			// the text has no dot, so the digits after the dot are
			// compensated by the exponent
			if (beforeDot != -1) {
				exponent -= text.length() - beforeDot;
			}
			if (laAlpha() && !lach('E') && !lach('e')) {
				if (kind == Tokens.FLOAT) {
					kind = Tokens.FLOAT_WITH_SUFFIX;
				} else {
					kind = Tokens.INTEGER_WITH_SUFFIX;
				}
				do {
					consume(true);
				} while (laAlpha() || lach('_') || laDigit());
				suffix = takeBuffer();
			}
			if (pos != inputText.length()) {
				throw new NumberFormatException(
						"Some characters left in the string "
								+ (inputText.length() - pos));
			}
			return new NumberInfo(kind, sign, base, text, exponent, suffix);
		}
	}

}