/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.types.parser;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.types.BooleanValue;
import org.apache.flink.types.ByteValue;
import org.apache.flink.types.DoubleValue;
import org.apache.flink.types.FloatValue;
import org.apache.flink.types.IntValue;
import org.apache.flink.types.LongValue;
import org.apache.flink.types.ShortValue;
import org.apache.flink.types.StringValue;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * A FieldParser is used to parse a field from a sequence of bytes. Fields occur in a byte sequence
 * and are terminated by the end of the byte sequence or a delimiter.
 *
 * <p>The parsers do not throw exceptions in general, but set an error state. That way, they can be
 * used in functions that ignore invalid lines, rather than failing on them.
 *
 * @param <T> The type that is parsed.
 */
@PublicEvolving
public abstract class FieldParser<T> {

	/**
	 * An enumeration of different types of errors that may occur.
	 */
	public enum ParseErrorState {
		/** No error occurred. */
		NONE,

		/** The domain of the numeric type is not large enough to hold the parsed value. */
		NUMERIC_VALUE_OVERFLOW_UNDERFLOW,

		/** A stand-alone sign was encountered while parsing a numeric type. */
		NUMERIC_VALUE_ORPHAN_SIGN,

		/** An illegal character was encountered while parsing a numeric type. */
		NUMERIC_VALUE_ILLEGAL_CHARACTER,

		/** The field was not in a correct format for the numeric type. */
		NUMERIC_VALUE_FORMAT_ERROR,

		/** A quoted string was not terminated until the line end. */
		UNTERMINATED_QUOTED_STRING,

		/** The parser found characters between the end of the quoted string and the delimiter. */
		UNQUOTED_CHARS_AFTER_QUOTED_STRING,

		/** The column is empty. */
		EMPTY_COLUMN,

		/** Invalid Boolean value. */
		BOOLEAN_INVALID
	}

	/** The character set of this parser; UTF-8 unless changed via {@link #setCharset(Charset)}. */
	private Charset charset = StandardCharsets.UTF_8;

	/** The error state of the most recent parse; {@link ParseErrorState#NONE} if no error occurred. */
	private ParseErrorState errorState = ParseErrorState.NONE;

	/**
	 * Parses the value of a field from the byte array, taking care of properly resetting
	 * the state of this parser before parsing.
	 * The start position within the byte array and the array's valid length is given.
	 * The content of the value is delimited by a field delimiter.
	 *
	 * @param bytes The byte array that holds the value.
	 * @param startPos The index where the field starts
	 * @param limit The limit up to which the byte contents is valid for the parser. The limit is the
	 *              position one after the last valid byte.
	 * @param delim The field delimiter character
	 * @param reuse An optional reusable field to hold the value
	 *
	 * @return The index of the next delimiter, if the field was parsed correctly. A value less than 0 otherwise.
	 */
	public int resetErrorStateAndParse(byte[] bytes, int startPos, int limit, byte[] delim, T reuse) {
		// Clear any state left over from a previous invocation before delegating to the parser logic.
		resetParserState();
		return parseField(bytes, startPos, limit, delim, reuse);
	}

	/**
	 * Each parser's logic should be implemented inside this method.
	 *
	 * @param bytes The byte array that holds the value.
	 * @param startPos The index where the field starts
	 * @param limit The position one after the last valid byte.
	 * @param delim The field delimiter character
	 * @param reuse An optional reusable field to hold the value
	 *
	 * @return The index of the next delimiter, if the field was parsed correctly. A value less than 0 otherwise.
	 */
	protected abstract int parseField(byte[] bytes, int startPos, int limit, byte[] delim, T reuse);

	/**
	 * Resets the state of the parser. Called as the very first method inside
	 * {@link FieldParser#resetErrorStateAndParse(byte[], int, int, byte[], Object)}; by default it just
	 * resets the error state to {@link ParseErrorState#NONE}.
	 */
	protected void resetParserState() {
		this.errorState = ParseErrorState.NONE;
	}

	/**
	 * Gets the parsed field. This method returns the value parsed by the last successful invocation of
	 * {@link #parseField(byte[], int, int, byte[], Object)}. If objects are mutable and reused, it will return
	 * the object instance that was passed to the parse function.
	 *
	 * @return The latest parsed field.
	 */
	public abstract T getLastResult();

	/**
	 * Returns an instance of the parsed value type.
	 *
	 * @return An instance of the parsed value type.
	 */
	public abstract T createValue();

	/**
	 * Checks if the delimiter starts at the given start position of the byte array.
	 *
	 * <p>Attention: This method assumes that enough characters follow the start position for the
	 * delimiter check! No bounds check is performed here; callers must guarantee that
	 * {@code startPos + delim.length <= bytes.length}.
	 *
	 * @param bytes The byte array that holds the value.
	 * @param startPos The index of the byte array where the check for the delimiter starts.
	 * @param delim The delimiter to check for.
	 *
	 * @return true if a delimiter starts at the given start position, false otherwise.
	 */
	public static final boolean delimiterNext(byte[] bytes, int startPos, byte[] delim) {
		for (int pos = 0; pos < delim.length; pos++) {
			// check each position
			if (delim[pos] != bytes[startPos + pos]) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Checks if the given bytes end with the delimiter at the given end position.
	 *
	 * @param bytes The byte array that holds the value.
	 * @param endPos The index of the byte array where the check for the delimiter ends (inclusive).
	 * @param delim The delimiter to check for.
	 *
	 * @return true if a delimiter ends at the given end position, false otherwise.
	 */
	public static final boolean endsWithDelimiter(byte[] bytes, int endPos, byte[] delim) {
		// Fewer than delim.length bytes precede (and include) endPos, so the delimiter cannot fit.
		if (endPos < delim.length - 1) {
			return false;
		}
		// Index of the byte where a delimiter ending at endPos would have to start.
		final int offset = endPos - delim.length + 1;
		for (int pos = 0; pos < delim.length; ++pos) {
			if (delim[pos] != bytes[offset + pos]) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Sets the error state of the parser. Called by subclasses of the parser to set the type of error
	 * when failing a parse.
	 *
	 * @param error The error state to set.
	 */
	protected void setErrorState(ParseErrorState error) {
		this.errorState = error;
	}

	/**
	 * Gets the error state of the parser, as a value of the enumeration {@link ParseErrorState}.
	 * If no error occurred, the error state will be {@link ParseErrorState#NONE}.
	 *
	 * @return The current error state of the parser.
	 */
	public ParseErrorState getErrorState() {
		return this.errorState;
	}

	/**
	 * Returns the end position of a string. Sets the error state to
	 * {@link ParseErrorState#EMPTY_COLUMN} if the column is empty, i.e. if a delimiter starts
	 * directly at {@code startPos} or if no bytes are left before {@code limit}.
	 *
	 * @param bytes The byte array that holds the value.
	 * @param startPos The index where the string starts.
	 * @param limit The position one after the last valid byte.
	 * @param delimiter The field delimiter.
	 *
	 * @return the end position of the string (exclusive) or -1 if an error occurred
	 */
	protected final int nextStringEndPos(byte[] bytes, int startPos, int limit, byte[] delimiter) {
		int endPos = startPos;

		// Last position (exclusive) at which a complete delimiter still fits before the limit.
		final int delimLimit = limit - delimiter.length + 1;

		while (endPos < limit) {
			if (endPos < delimLimit && delimiterNext(bytes, endPos, delimiter)) {
				break;
			}
			endPos++;
		}

		// Check for emptiness after the scan, so that an empty column at the very end of the
		// input (startPos == limit, loop never entered) is reported as well, not only an empty
		// column that is followed by a delimiter.
		if (endPos == startPos) {
			setErrorState(ParseErrorState.EMPTY_COLUMN);
			return -1;
		}
		return endPos;
	}

	/**
	 * Returns the length of a string, i.e. the number of bytes from {@code startPos} up to (but
	 * excluding) the first delimiter byte, capped at {@code length}.
	 *
	 * <p>The delimiter is narrowed to a single byte for the comparison.
	 *
	 * @param bytes The byte array that holds the value.
	 * @param startPos The index where the string starts.
	 * @param length The maximum number of bytes to inspect.
	 * @param delimiter The delimiter character.
	 *
	 * @return the length of the string
	 * @throws IllegalArgumentException if {@code length} is not positive
	 */
	protected static final int nextStringLength(byte[] bytes, int startPos, int length, char delimiter) {
		if (length <= 0) {
			throw new IllegalArgumentException("Invalid input: Empty string");
		}
		int limitedLength = 0;
		final byte delByte = (byte) delimiter;

		while (limitedLength < length && bytes[startPos + limitedLength] != delByte) {
			limitedLength++;
		}

		return limitedLength;
	}

	/**
	 * Gets the character set used for this parser.
	 *
	 * @return the charset used for this parser.
	 */
	public Charset getCharset() {
		return this.charset;
	}

	/**
	 * Sets the character set used for this parser.
	 *
	 * @param charset charset used for this parser.
	 */
	public void setCharset(Charset charset) {
		this.charset = charset;
	}

	// --------------------------------------------------------------------------------------------
	//  Mapping from types to parsers
	// --------------------------------------------------------------------------------------------

	/**
	 * Gets the parser for the type specified by the given class. Returns null, if no parser for that class
	 * is known.
	 *
	 * @param type The class of the type to get the parser for.
	 * @return The parser for the given type, or null, if no such parser exists.
	 */
	public static <T> Class<FieldParser<T>> getParserForType(Class<T> type) {
		Class<? extends FieldParser<?>> parser = PARSERS.get(type);
		if (parser == null) {
			return null;
		} else {
			// The cast relies on every PARSERS entry mapping a type to a parser for that same type.
			@SuppressWarnings("unchecked")
			Class<FieldParser<T>> typedParser = (Class<FieldParser<T>>) parser;
			return typedParser;
		}
	}

	/** Registry that maps a field type to the parser class registered for it. */
	private static final Map<Class<?>, Class<? extends FieldParser<?>>> PARSERS = new HashMap<>();

	static {
		// basic types
		PARSERS.put(Byte.class, ByteParser.class);
		PARSERS.put(Short.class, ShortParser.class);
		PARSERS.put(Integer.class, IntParser.class);
		PARSERS.put(Long.class, LongParser.class);
		PARSERS.put(String.class, StringParser.class);
		PARSERS.put(Float.class, FloatParser.class);
		PARSERS.put(Double.class, DoubleParser.class);
		PARSERS.put(Boolean.class, BooleanParser.class);
		PARSERS.put(BigDecimal.class, BigDecParser.class);
		PARSERS.put(BigInteger.class, BigIntParser.class);

		// value types
		PARSERS.put(ByteValue.class, ByteValueParser.class);
		PARSERS.put(ShortValue.class, ShortValueParser.class);
		PARSERS.put(IntValue.class, IntValueParser.class);
		PARSERS.put(LongValue.class, LongValueParser.class);
		PARSERS.put(StringValue.class, StringValueParser.class);
		PARSERS.put(FloatValue.class, FloatValueParser.class);
		PARSERS.put(DoubleValue.class, DoubleValueParser.class);
		PARSERS.put(BooleanValue.class, BooleanValueParser.class);

		// SQL date/time types
		PARSERS.put(java.sql.Time.class, SqlTimeParser.class);
		PARSERS.put(java.sql.Date.class, SqlDateParser.class);
		PARSERS.put(java.sql.Timestamp.class, SqlTimestampParser.class);
	}
}