/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.scheme; import java.beans.ConstructorProperties; import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.regex.Pattern; import cascading.tap.TapException; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.Tuples; import cascading.util.Util; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapred.OutputCollector; import org.apache.log4j.Logger; /** * Class TextDelimited is a sub-class of {@link TextLine}. It provides direct support for delimited text files, like * TAB (\t) or COMMA (,) delimited files. It also optionally allows for quoted values. * <p/> * TextDelimited may also be used to skip the "header" in a file, where the header is defined as the very first line * in every input file. That is, if the byte offset of the current line from the input is zero (0), that line will * be skipped. * <p/> * By default headers are not skipped. * <p/> * By default this {@link Scheme} is both {@code strict} and {@code safe}. * <p/> * Strict meaning if a line of text does not parse into the expected number of fields, this class will throw a * {@link TapException}. If strict is {@code false}, then {@link Tuple} will be returned with {@code null} values * for the missing fields. * <p/> * Safe meaning if a field cannot be coerced into an expected type, a {@code null} will be used for the value. * If safe is {@code false}, a {@link TapException} will be thrown. * <p/> * Also by default, {@code quote} strings are not searched for to improve processing speed. If a file is * COMMA delimited but may have COMMA's in a value, the whole value should be surrounded by the quote string, typically * double quotes ({@literal "}). * <p/> * Note all empty fields in a line will be returned as {@code null} unless coerced into a new type. * <p/> * This Scheme may source/sink {@link Fields#ALL}, when given on the constructor the new instance will automatically * default to strict == false as the number of fields parsed are arbitrary or unknown. A type array may not be given * either, so all values will be returned as Strings. * * @see TextLine */ public class TextDelimited extends TextLine { /** Field LOG */ private static final Logger LOG = Logger.getLogger( TextDelimited.class ); /** Field SPECIAL_REGEX_CHARS */ private static final String SPECIAL_REGEX_CHARS = "([\\]\\[|.*<>\\\\$^?()=!+])"; /** Field QUOTED_REGEX_FORMAT */ private static final String QUOTED_REGEX_FORMAT = "%2$s(?!(?:[^%1$s%2$s]|[^%1$s%2$s]%2$s[^%1$s])+%1$s)"; /** Field CLEAN_REGEX_FORMAT */ private static final String CLEAN_REGEX_FORMAT = "^(?:%1$s)(.*)(?:%1$s)$"; /** Field ESCAPE_REGEX_FORMAT */ private static final String ESCAPE_REGEX_FORMAT = "(%1$s%1$s)"; /** Field splitPattern */ protected Pattern splitPattern; /** Field cleanPattern */ protected Pattern cleanPattern; /** Field escapePattern */ protected Pattern escapePattern; /** Field skipHeader */ private boolean skipHeader; /** Field delimiter * */ private String delimiter; /** Field quote */ private String quote; /** Field strict */ private boolean strict = true; /** Field numValues */ private int numValues; /** Field types */ private Class[] types; /** Field safe */ private boolean safe = true; /** Field buffer */ private Object[] buffer; /** Field decoratorTuple */ private DecoratorTuple decoratorTuple; /** Class DecoratorTuple just wraps a Tuple. */ private static class DecoratorTuple extends Tuple { String string; private DecoratorTuple() { super( (List<Object>) null ); } public void set( Tuple tuple, String string ) { this.elements = Tuple.elements( tuple ); this.string = string; } @Override public String toString() { return string; } } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param delimiter of type String */ @ConstructorProperties({"fields", "delimiter"}) public TextDelimited( Fields fields, String delimiter ) { this( fields, null, delimiter, null, null ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param skipHeader of type boolean * @param delimiter of type String */ @ConstructorProperties({"fields", "skipHeader", "delimiter"}) public TextDelimited( Fields fields, boolean skipHeader, String delimiter ) { this( fields, null, skipHeader, delimiter, null, null ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param delimiter of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "delimiter", "types"}) public TextDelimited( Fields fields, String delimiter, Class[] types ) { this( fields, null, delimiter, null, types ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param skipHeader of type boolean * @param delimiter of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "skipHeader", "delimiter", "types"}) public TextDelimited( Fields fields, boolean skipHeader, String delimiter, Class[] types ) { this( fields, null, skipHeader, delimiter, null, types ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param delimiter of type String * @param quote of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "delimiter", "quote", "types"}) public TextDelimited( Fields fields, String delimiter, String quote, Class[] types ) { this( fields, null, delimiter, quote, types ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param skipHeader of type boolean * @param delimiter of type String * @param quote of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "skipHeader", "delimiter", "quote", "types"}) public TextDelimited( Fields fields, boolean skipHeader, String delimiter, String quote, Class[] types ) { this( fields, null, skipHeader, delimiter, quote, types ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param delimiter of type String * @param quote of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "delimiter", "quote", "types", "safe"}) public TextDelimited( Fields fields, String delimiter, String quote, Class[] types, boolean safe ) { this( fields, null, delimiter, quote, types, safe ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param skipHeader of type boolean * @param delimiter of type String * @param quote of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "skipHeader", "delimiter", "quote", "types", "safe"}) public TextDelimited( Fields fields, boolean skipHeader, String delimiter, String quote, Class[] types, boolean safe ) { this( fields, null, skipHeader, delimiter, quote, types, safe ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param delimiter of type String */ @ConstructorProperties({"fields", "sinkCompression", "delimiter"}) public TextDelimited( Fields fields, Compress sinkCompression, String delimiter ) { this( fields, sinkCompression, delimiter, null, null ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter"}) public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter ) { this( fields, sinkCompression, skipHeader, delimiter, null, null ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param delimiter of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types"}) public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types ) { this( fields, sinkCompression, delimiter, null, types ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter", "types"}) public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter, Class[] types ) { this( fields, sinkCompression, skipHeader, delimiter, null, types ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param delimiter of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "sinkCompression", "delimiter", "types", "safe"}) public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, Class[] types, boolean safe ) { this( fields, sinkCompression, delimiter, null, types, safe ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter", "types", "safe"}) public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter, Class[] types, boolean safe ) { this( fields, sinkCompression, skipHeader, delimiter, null, types, safe ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param delimiter of type String * @param quote of type String */ @ConstructorProperties({"fields", "delimiter", "quote"}) public TextDelimited( Fields fields, String delimiter, String quote ) { this( fields, null, delimiter, quote ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param skipHeader of type boolean * @param delimiter of type String * @param quote of type String */ @ConstructorProperties({"fields", "skipHeader", "delimiter", "quote"}) public TextDelimited( Fields fields, boolean skipHeader, String delimiter, String quote ) { this( fields, null, skipHeader, delimiter, quote ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param delimiter of type String * @param quote of type String */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter", "quote"}) public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote ) { this( fields, sinkCompression, false, delimiter, true, quote, null, true ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String * @param quote of type String */ public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter, String quote ) { this( fields, sinkCompression, skipHeader, delimiter, true, quote, null, true ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param delimiter of type String * @param quote of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types"}) public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types ) { this( fields, sinkCompression, false, delimiter, true, quote, types, true ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String * @param quote of type String * @param types of type Class[] */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter", "quote", "types"}) public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter, String quote, Class[] types ) { this( fields, sinkCompression, skipHeader, delimiter, true, quote, types, true ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param delimiter of type String * @param quote of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "sinkCompression", "delimiter", "quote", "types", "safe"}) public TextDelimited( Fields fields, Compress sinkCompression, String delimiter, String quote, Class[] types, boolean safe ) { this( fields, sinkCompression, false, delimiter, true, quote, types, safe ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String * @param quote of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter", "quote", "types", "safe"}) public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter, String quote, Class[] types, boolean safe ) { this( fields, sinkCompression, skipHeader, delimiter, true, quote, types, safe ); } /** * Constructor TextDelimited creates a new TextDelimited instance. * * @param fields of type Fields * @param sinkCompression of type Compress * @param skipHeader of type boolean * @param delimiter of type String * @param strict of type boolean * @param quote of type String * @param types of type Class[] * @param safe of type boolean */ @ConstructorProperties({"fields", "sinkCompression", "skipHeader", "delimiter", "strict", "quote", "types", "safe"}) public TextDelimited( Fields fields, Compress sinkCompression, boolean skipHeader, String delimiter, boolean strict, String quote, Class[] types, boolean safe ) { super( sinkCompression ); // normalizes ALL and UNKNOWN setSinkFields( fields ); setSourceFields( fields ); fields = getSinkFields(); this.skipHeader = skipHeader; this.delimiter = delimiter; this.strict = strict; this.safe = safe; this.numValues = fields.size(); if( fields.isAll() ) this.strict = false; if( !fields.isAll() && this.numValues == 0 ) throw new IllegalArgumentException( "may not be zero declared fields, found: " + fields.printVerbose() ); if( quote != null && !quote.isEmpty() ) // if empty, leave null this.quote = quote; splitPattern = createSplitPatternFor( this.delimiter, this.quote ); cleanPattern = createCleanPatternFor( this.quote ); escapePattern = createEscapePatternFor( this.quote ); if( types != null && types.length == 0 ) this.types = null; if( types != null ) this.types = Arrays.copyOf( types, types.length ); if( this.types != null && fields.isAll() ) throw new IllegalArgumentException( "when using Fields.ALL, field types may not be used" ); if( this.types != null && this.types.length != fields.size() ) throw new IllegalArgumentException( "num of types must equal number of fields: " + fields.printVerbose() + ", found: " + types.length ); } /** * Method createEscapePatternFor creates a regex {@link Pattern} cleaning quote escapes from a String. * <p/> * If {@code quote} is null or empty, a null value will be returned; * * @param quote of type String * @return Pattern */ public static Pattern createEscapePatternFor( String quote ) { if( quote == null || quote.isEmpty() ) return null; return Pattern.compile( String.format( ESCAPE_REGEX_FORMAT, quote ) ); } /** * Method createCleanPatternFor creates a regex {@link Pattern} for removing quote characters from a String. * <p/> * If {@code quote} is null or empty, a null value will be returned; * * @param quote of type String * @return Pattern */ public static Pattern createCleanPatternFor( String quote ) { if( quote == null || quote.isEmpty() ) return null; return Pattern.compile( String.format( CLEAN_REGEX_FORMAT, quote ) ); } /** * Method createSplitPatternFor creates a regex {@link Pattern} for splitting a line of text into its component * parts using the given delimiter and quote Strings. {@code quote} may be null. * * @param delimiter of type String * @param quote of type String * @return Pattern */ public static Pattern createSplitPatternFor( String delimiter, String quote ) { String escapedDelimiter = delimiter.replaceAll( SPECIAL_REGEX_CHARS, "\\\\$1" ); if( quote == null || quote.isEmpty() ) return Pattern.compile( escapedDelimiter ); else return Pattern.compile( String.format( QUOTED_REGEX_FORMAT, quote, escapedDelimiter ) ); } @Override public Tuple source( Object key, Object value ) { if( skipHeader && ( (LongWritable) key ).get() == 0 ) return null; Object[] split = createSplit( value.toString(), splitPattern, numValues ); if( numValues != 0 && split.length != numValues ) { String message = "did not parse correct number of values from input data, expected: " + numValues + ", got: " + split.length + ":" + Util.join( ",", (String[]) split ); if( strict ) throw new TapException( message ); LOG.warn( message ); Object[] array = new Object[numValues]; Arrays.fill( array, "" ); System.arraycopy( split, 0, array, 0, split.length ); split = array; } cleanSplit( split, cleanPattern, escapePattern, quote ); if( types != null ) // forced null in ctor { Object[] result = new Object[split.length]; for( int i = 0; i < split.length; i++ ) { try { result[ i ] = Tuples.coerce( split[ i ], types[ i ] ); } catch( Exception exception ) { String message = "field " + getSourceFields().get( i ) + " cannot be coerced from : " + result[ i ] + " to: " + types[ i ].getName(); result[ i ] = null; LOG.warn( message, exception ); if( !safe ) throw new TapException( message, exception ); } } split = result; } return new Tuple( split ); } /** * Method createSplit will split the given {@code value} with the given {@code splitPattern}. * * @param value of type String * @param splitPattern of type Pattern * @param numValues of type int * @return String[] */ public static String[] createSplit( String value, Pattern splitPattern, int numValues ) { return splitPattern.split( value, numValues ); } /** * Method cleanSplit will return a quote free array of String values, the given {@code split} array * will be updated in place. * <p/> * If {@code cleanPattern} is null, quote cleaning will not be performed, but all empty String values * will be replaces with a {@code null} value. * * @param split of type Object[] * @param cleanPattern of type Pattern * @param escapePattern of type Pattern * @param quote of type String * @return Object[] as a convenience */ public static Object[] cleanSplit( Object[] split, Pattern cleanPattern, Pattern escapePattern, String quote ) { if( cleanPattern != null ) { for( int i = 0; i < split.length; i++ ) { split[ i ] = cleanPattern.matcher( (String) split[ i ] ).replaceAll( "$1" ); split[ i ] = escapePattern.matcher( (String) split[ i ] ).replaceAll( quote ); } } for( int i = 0; i < split.length; i++ ) { if( ( (String) split[ i ] ).isEmpty() ) split[ i ] = null; } return split; } private Object[] getBuffer( Tuple tuple ) { if( buffer == null ) buffer = new Object[tuple.size()]; return buffer; } @Override public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException { Tuple tuple = tupleEntry.selectTuple( sinkFields ); Object[] buffer = Tuples.asArray( tuple, getBuffer( tuple ) ); if( quote != null ) { for( int i = 0; i < buffer.length; i++ ) { Object value = buffer[ i ]; if( value == null ) continue; String valueString = value.toString(); if( valueString.contains( quote ) ) valueString = valueString.replaceAll( quote, quote + quote ); if( valueString.contains( delimiter ) ) valueString = quote + valueString + quote; buffer[ i ] = valueString; } } if( decoratorTuple == null ) decoratorTuple = new DecoratorTuple(); decoratorTuple.set( tupleEntry.getTuple(), Util.join( buffer, delimiter, false ) ); outputCollector.collect( null, decoratorTuple ); } }