/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import cascading.scheme.Scheme;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.JobConf;

/**
 * Class GlobHfs is a type of {@link MultiSourceTap} that accepts Hadoop style 'file globbing' expressions so
 * multiple files that match the given pattern may be used as the input sources for a given {@link cascading.flow.Flow}.
 * <p/>
 * See {@link FileSystem#globStatus(org.apache.hadoop.fs.Path)} for details on the globbing syntax. In short, it is
 * similar to standard regular expressions except alternation is written as {foo,bar} instead of (foo|bar).
 * <p/>
 * Note that a {@link cascading.flow.Flow} sourcing from GlobHfs is not currently compatible with the
 * {@link cascading.cascade.Cascade} scheduler. GlobHfs expects the files and paths to exist so the wildcards can be
 * resolved into the concrete values the scheduler needs in order to order the Flows properly.
 * <p/>
 * Note that globbing can match files or directories. It may consume fewer resources to match directories and let
 * Hadoop include all sub-files immediately contained in the directory instead of enumerating every individual file.
 * Ending the glob path with a {@code /} should match only directories.
 *
 * @see Hfs
 * @see MultiSourceTap
 * @see FileSystem
 */
public class GlobHfs extends MultiSourceTap
  {
  /** Field pathPattern */
  private String pathPattern;
  /** Field pathFilter */
  private PathFilter pathFilter;

  /**
   * Constructor GlobHfs creates a new GlobHfs instance.
   *
   * @param scheme      of type Scheme
   * @param pathPattern of type String
   */
  @ConstructorProperties({"scheme", "pathPattern"})
  public GlobHfs( Scheme scheme, String pathPattern )
    {
    this( scheme, pathPattern, null );
    }

  /**
   * Constructor GlobHfs creates a new GlobHfs instance.
   *
   * @param scheme      of type Scheme
   * @param pathPattern of type String
   * @param pathFilter  of type PathFilter
   */
  @ConstructorProperties({"scheme", "pathPattern", "pathFilter"})
  public GlobHfs( Scheme scheme, String pathPattern, PathFilter pathFilter )
    {
    super( scheme );
    this.pathPattern = pathPattern;
    this.pathFilter = pathFilter;
    }

  @Override
  protected Tap[] getTaps()
    {
    if( taps != null )
      return taps;

    try
      {
      taps = makeTaps( new JobConf() );
      }
    catch( IOException exception )
      {
      throw new TapException( "unable to resolve taps for globbing path: " + pathPattern, exception );
      }

    return taps;
    }

  private Tap[] makeTaps( JobConf conf ) throws IOException
    {
    FileStatus[] statusList = null;

    Path path = new Path( pathPattern );

    FileSystem fileSystem = path.getFileSystem( conf );

    if( pathFilter == null )
      statusList = fileSystem.globStatus( path );
    else
      statusList = fileSystem.globStatus( path, pathFilter );

    if( statusList == null || statusList.length == 0 )
      throw new TapException( "unable to find paths matching path pattern: " + pathPattern );

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for( int i = 0; i < statusList.length; i++ )
      {
      // remove empty files. turns out a directory returns a non-zero length,
      // so this jives with the expectations set in the class javadoc above
      if( statusList[ i ].getLen() != 0 )
        notEmpty.add( new Hfs( getScheme(), statusList[ i ].getPath().toString() ) );
      }

    if( notEmpty.isEmpty() )
      throw new TapException( "all paths matching path pattern are zero length: " + pathPattern );

    return notEmpty.toArray( new Tap[ notEmpty.size() ] );
    }

  @Override
  public void sourceInit( JobConf conf ) throws IOException
    {
    taps = makeTaps( conf );
    super.sourceInit( conf );
    }

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;
    if( object == null || getClass() != object.getClass() )
      return false;

    GlobHfs globHfs = (GlobHfs) object;

    // do not compare tap arrays, these values should be sufficient to show identity
    if( getScheme() != null ? !getScheme().equals( globHfs.getScheme() ) : globHfs.getScheme() != null )
      return false;
    if( pathFilter != null ? !pathFilter.equals( globHfs.pathFilter ) : globHfs.pathFilter != null )
      return false;
    if( pathPattern != null ? !pathPattern.equals( globHfs.pathPattern ) : globHfs.pathPattern != null )
      return false;

    return true;
    }

  @Override
  public int hashCode()
    {
    int result = pathPattern != null ? pathPattern.hashCode() : 0;
    result = 31 * result + ( pathFilter != null ? pathFilter.hashCode() : 0 );
    return result;
    }

  @Override
  public String toString()
    {
    return "GlobHfs[" + pathPattern + ']';
    }
  }
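
/*
 * A minimal usage sketch (an illustrative addition, not part of the original source): sourcing a Flow
 * from every non-empty part file matching a glob pattern. TextLine, Fields, Pipe, FlowConnector, Hfs,
 * and Flow are standard Cascading classes, but the path patterns and the {@code properties} map shown
 * here are hypothetical assumptions for the example only.
 *
 *   Scheme scheme = new TextLine( new Fields( "offset", "line" ) );
 *
 *   Tap source = new GlobHfs( scheme, "logs/2010/*" + "/part-*" );
 *   Tap sink = new Hfs( scheme, "output/copied", true );
 *   Pipe pipe = new Pipe( "copy" );
 *
 *   Flow flow = new FlowConnector( properties ).connect( "glob-copy", source, sink, pipe );
 *   flow.complete();
 *
 * Matching a directory glob such as "logs/2010/" instead of individual part files would, per the class
 * javadoc above, let Hadoop pick up the contained files itself and may consume fewer resources.
 */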