/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.scheme; import java.beans.ConstructorProperties; import java.io.IOException; import cascading.tap.Tap; import cascading.tap.hadoop.ZipInputFormat; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; /** * A TextLine is a type of {@link Scheme} for plain text files. Files are broken into * lines. Either line-feed or carriage-return are used to signal end of line. * <p/> * By default, this scheme returns a {@link Tuple} with two fields, "offset" and "line". * <p/> * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names * to be used instead of the names "offset" and "line". sinkFields is a selector and is by default {@link Fields#ALL}. * Any available field names can be given if only a subset of the incoming fields should be used. * <p/> * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples * will simply be the "line" value using the given field name. * <p/> * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before * writing out the line. * <p/> * Note sink compression is {@link Compress#DISABLE} by default. If {@code null} is passed to the constructor * for the compression value, it will remain disabled. * <p/> * If all the input files end with ".zip", the {@link ZipInputFormat} will be used. This is not * bi-directional, so zip files cannot be written. */ public class TextLine extends Scheme { public enum Compress { DEFAULT, ENABLE, DISABLE } /** Field serialVersionUID */ private static final long serialVersionUID = 1L; /** Field DEFAULT_SOURCE_FIELDS */ public static final Fields DEFAULT_SOURCE_FIELDS = new Fields( "offset", "line" ); /** Field sinkCompression */ Compress sinkCompression = Compress.DISABLE; /** * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where * "offset" is the byte offset in the input file. */ public TextLine() { super( DEFAULT_SOURCE_FIELDS ); } /** * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where * "offset" is the byte offset in the input file. * * @param numSinkParts of type int */ @ConstructorProperties({"numSinkParts"}) public TextLine( int numSinkParts ) { super( DEFAULT_SOURCE_FIELDS, numSinkParts ); } /** * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where * "offset" is the byte offset in the input file. * * @param sinkCompression of type Compress */ @ConstructorProperties({"sinkCompression"}) public TextLine( Compress sinkCompression ) { super( DEFAULT_SOURCE_FIELDS ); setSinkCompression( sinkCompression ); } /** * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the * subsequent tuples. * * @param sourceFields the source fields for this scheme * @param sinkFields the sink fields for this scheme */ @ConstructorProperties({"sourceFields", "sinkFields"}) public TextLine( Fields sourceFields, Fields sinkFields ) { super( sourceFields, sinkFields ); if( sourceFields.size() < 1 || sourceFields.size() > 2 ) throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); } /** * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the * subsequent tuples. * * @param sourceFields the source fields for this scheme * @param sinkFields the sink fields for this scheme * @param numSinkParts of type int */ @ConstructorProperties({"sourceFields", "sinkFields", "numSinkParts"}) public TextLine( Fields sourceFields, Fields sinkFields, int numSinkParts ) { super( sourceFields, sinkFields, numSinkParts ); if( sourceFields.size() < 1 || sourceFields.size() > 2 ) throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); } /** * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the * subsequent tuples. * * @param sourceFields of type Fields * @param sinkFields of type Fields * @param sinkCompression of type Compress */ @ConstructorProperties({"sourceFields", "sinkFields", "sinkCompression"}) public TextLine( Fields sourceFields, Fields sinkFields, Compress sinkCompression ) { super( sourceFields, sinkFields ); this.sinkCompression = sinkCompression; if( sourceFields.size() < 1 || sourceFields.size() > 2 ) throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); } /** * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the * subsequent tuples. * * @param sourceFields of type Fields * @param sinkFields of type Fields * @param sinkCompression of type Compress * @param numSinkParts of type int */ @ConstructorProperties({"sourceFields", "sinkFields", "sinkCompression", "numSinkParts"}) public TextLine( Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts ) { super( sourceFields, sinkFields, numSinkParts ); setSinkCompression( sinkCompression ); if( sourceFields.size() < 1 || sourceFields.size() > 2 ) throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); } /** * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the * subsequent tuples. * * @param sourceFields the source fields for this scheme */ @ConstructorProperties({"sourceFields"}) public TextLine( Fields sourceFields ) { super( sourceFields ); if( sourceFields.size() < 1 || sourceFields.size() > 2 ) throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); } /** * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the * subsequent tuples. The resulting data set will have numSinkParts. * * @param sourceFields the source fields for this scheme * @param numSinkParts of type int */ @ConstructorProperties({"sourceFields", "numSinkParts"}) public TextLine( Fields sourceFields, int numSinkParts ) { super( sourceFields, numSinkParts ); if( sourceFields.size() < 1 || sourceFields.size() > 2 ) throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" ); } /** * Method getSinkCompression returns the sinkCompression of this TextLine object. * * @return the sinkCompression (type Compress) of this TextLine object. */ public Compress getSinkCompression() { return sinkCompression; } /** * Method setSinkCompression sets the sinkCompression of this TextLine object. If null, compression will remain disabled. * * @param sinkCompression the sinkCompression of this TextLine object. */ public void setSinkCompression( Compress sinkCompression ) { if( sinkCompression != null ) // leave disabled if null this.sinkCompression = sinkCompression; } @Override public void sourceInit( Tap tap, JobConf conf ) { if( hasZippedFiles( FileInputFormat.getInputPaths( conf ) ) ) conf.setInputFormat( ZipInputFormat.class ); else conf.setInputFormat( TextInputFormat.class ); } private boolean hasZippedFiles( Path[] paths ) { boolean isZipped = paths[ 0 ].getName().endsWith( ".zip" ); for( int i = 1; i < paths.length; i++ ) { if( isZipped != paths[ i ].getName().endsWith( ".zip" ) ) throw new IllegalStateException( "cannot mix zipped and upzippled files" ); } return isZipped; } @Override public void sinkInit( Tap tap, JobConf conf ) throws IOException { if( tap.getQualifiedPath( conf ).toString().endsWith( ".zip" ) ) throw new IllegalStateException( "cannot write zip files: " + FileOutputFormat.getOutputPath( conf ) ); if( getSinkCompression() == Compress.DISABLE ) conf.setBoolean( "mapred.output.compress", false ); else if( getSinkCompression() == Compress.ENABLE ) conf.setBoolean( "mapred.output.compress", true ); conf.setOutputKeyClass( Text.class ); // be explicit conf.setOutputValueClass( Text.class ); // be explicit conf.setOutputFormat( TextOutputFormat.class ); } @Override public Tuple source( Object key, Object value ) { Tuple tuple = new Tuple(); if( sourceFields.size() == 2 ) tuple.add( key.toString() ); tuple.add( value.toString() ); return tuple; } @Override public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException { // it's ok to use NULL here so the collector does not write anything outputCollector.collect( null, tupleEntry.selectTuple( sinkFields ) ); } }