/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.MultiMapReducePlanner;
import cascading.operation.Identity;
import cascading.operation.Insert;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.operation.text.DateParser;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import cascading.util.Util;

import org.apache.hadoop.mapred.JobConf;
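
/**
 * Tests that custom {@link Fields} comparators are honored when grouping and secondary sorting,
 * both in a {@link GroupBy} and a {@link CoGroup}, in forward and reversed order.
 */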
public class SortedValuesTest extends ClusterTestCase
{
String inputFileApache = "build/test/data/apache.200.txt";
String inputFileIps = "build/test/data/ips.20.txt";
String inputFileCross = "build/test/data/lhs+rhs-cross.txt";
String outputPath = "build/test/output/sorting/";
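
// regex and parser for Apache common log format lines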
private String apacheCommonRegex = TestConstants.APACHE_COMMON_REGEX;
private RegexParser apacheCommonParser = new RegexParser( new Fields( "ip", "time", "method", "event", "status", "size" ), apacheCommonRegex,
new int[]{1, 2, 3, 4, 5, 6} );

public SortedValuesTest()
{
super( "sorted values", false ); // disable cluster
}
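
// verify a CoGroup honors a custom grouping comparator, in forward and reversed order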
public void testCoGroupComparatorValues() throws Exception
{
runCoGroupComparatorTest( "cogroupcompareforward", false );
}

public void testCoGroupComparatorValuesReversed() throws Exception
{
runCoGroupComparatorTest( "cogroupcomparereversed", true );
}
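
/**
 * Parses the Apache log and raw IP files, reduces each address to its leading octet as a long,
 * CoGroups the two streams on that octet using {@link TestLongComparator} (optionally reversed),
 * and verifies the written output stays sorted on the octet.
 */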
private void runCoGroupComparatorTest( String path, boolean reverseSort ) throws IOException, ParseException
{
if( !new File( inputFileApache ).exists() )
fail( "data file not found" );
copyFromLocal( inputFileApache );
copyFromLocal( inputFileIps );
Tap sourceApache = new Hfs( new TextLine(), inputFileApache );
Tap sourceIP = new Hfs( new TextLine(), inputFileIps );
Tap sink = new Hfs( new TextLine(), outputPath + path, true );
Pipe apachePipe = new Pipe( "apache" );
apachePipe = new Each( apachePipe, new Fields( "line" ), apacheCommonParser );
apachePipe = new Each( apachePipe, new Insert( new Fields( "col" ), 1 ), Fields.ALL );
apachePipe = new Each( apachePipe, new Fields( "ip" ), new RegexParser( new Fields( "octet" ), "^[^.]*" ), new Fields( "col", "status", "event", "octet", "size" ) );
apachePipe = new Each( apachePipe, new Fields( "octet" ), new Identity( long.class ), Fields.REPLACE );
Fields groupApache = new Fields( "octet" );
groupApache.setComparator( "octet", new TestLongComparator( reverseSort ) );
Pipe ipPipe = new Pipe( "ip" );
ipPipe = new Each( ipPipe, new Fields( "line" ), new Identity( new Fields( "rawip" ) ) );
ipPipe = new Each( ipPipe, new Fields( "rawip" ), new RegexParser( new Fields( "rawoctet" ), "^[^.]*" ), new Fields( "rawoctet" ) );
ipPipe = new Each( ipPipe, new Fields( "rawoctet" ), new Identity( long.class ), Fields.REPLACE );
Fields groupIP = new Fields( "rawoctet" );
groupIP.setComparator( "rawoctet", new TestLongComparator( reverseSort ) );
Pipe pipe = new CoGroup( apachePipe, groupApache, ipPipe, groupIP );
pipe = new Each( pipe, new Identity() ); // let's force the stack to be exercised
Map<Object, Object> properties = getProperties();
if( MultiMapReducePlanner.getJobConf( properties ) != null )
MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 13 );
Map<String, Tap> sources = new HashMap<String, Tap>();
sources.put( "apache", sourceApache );
sources.put( "ip", sourceIP );
Flow flow = new FlowConnector( properties ).connect( sources, sink, pipe );
flow.complete();
validateFile( sink, 199, 16, reverseSort, 5 );
}
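
/**
 * Exercises GroupBy across a matrix of cases. Each case is
 * { reverse the "num" grouping order,
 *   secondary sort on "upper" (null = none, false = natural order, true = reversed),
 *   the GroupBy reverseOrder flag }.
 * The matrix is run twice: once with the Test*Comparator classes and once with
 * {@link Collections#reverseOrder()}.
 */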
public void testComprehensiveGroupBy() throws IOException
{
Boolean[][] testArray = new Boolean[][]{
// test group comparators
{false, null, false},
{true, null, false},
// test group, reversed
{false, null, true},
{true, null, true},
// test group and sort comparators
{false, false, false},
{true, false, false},
{true, true, false},
{false, true, false},
// test group and sort comparators, reversed
{false, false, true},
{true, false, true},
{true, true, true},
{false, true, true}
};
for( Boolean[] testCase : testArray )
runComprehensiveCase( testCase, false );

for( Boolean[] testCase : testArray )
runComprehensiveCase( testCase, true );
}
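
/**
 * Builds and runs a single GroupBy flow for the given case: splits each line into "num", "lower"
 * and "upper", coerces "num" to a long, applies the grouping and sorting comparators the case
 * dictates, then validates the ordering of the written output.
 */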
private void runComprehensiveCase( Boolean[] testCase, boolean useCollectionsComparator ) throws IOException
{
if( !new File( inputFileCross ).exists() )
fail( "data file not found" );
copyFromLocal( inputFileCross );
String test = Util.join( testCase, "_", true );
String path = "comprehensive/" + test;
Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileCross );
Tap sink = new Hfs( new TextLine( new Fields( "line" ), new Fields( "num", "lower", "upper" ), 1 ), outputPath + path, true );
Pipe pipe = new Pipe( "comprehensivesort" );
pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ), "\\s" ) );
pipe = new Each( pipe, new Fields( "num" ), new Identity( long.class ), Fields.REPLACE );
Fields groupFields = new Fields( "num" );
if( testCase[ 0 ] )
groupFields.setComparator( "num", useCollectionsComparator ? Collections.reverseOrder() : new TestLongComparator() );
Fields sortFields = null;
if( testCase[ 1 ] != null )
{
sortFields = new Fields( "upper" );
if( testCase[ 1 ] )
sortFields.setComparator( "upper", useCollectionsComparator ? Collections.reverseOrder() : new TestStringComparator() );
}
pipe = new GroupBy( pipe, groupFields, sortFields, testCase[ 2 ] );
Map<Object, Object> properties = getProperties();
if( MultiMapReducePlanner.getJobConf( properties ) != null )
MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 13 );
Flow flow = new FlowConnector( properties ).connect( source, sink, pipe );
flow.complete();
validateCase( test, testCase, sink );
}
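
/**
 * Reads the sink back, collecting the "upper" values for each "num" key in encounter order, and
 * asserts the keys arrive in the expected order. When the case declares a secondary sort, the
 * values within each group are checked as well; the GroupBy reverseOrder flag flips both checks.
 */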
private void validateCase( String test, Boolean[] testCase, Tap sink ) throws IOException
{
TupleEntryIterator iterator = sink.openForRead( new JobConf() );
LinkedHashMap<Long, List<String>> group = new LinkedHashMap<Long, List<String>>();
while( iterator.hasNext() )
{
Tuple tuple = iterator.next().getTuple();
String[] values = tuple.getString( 0 ).split( "\\s" );
long num = Long.parseLong( values[ 0 ] );
if( !group.containsKey( num ) )
group.put( num, new ArrayList<String>() );
group.get( num ).add( values[ 2 ] );
}
boolean groupIsReversed = testCase[ 0 ];
if( testCase[ 2 ] )
groupIsReversed = !groupIsReversed;
compare( "grouping+" + test, groupIsReversed, group.keySet() );
if( testCase[ 1 ] == null )
return;
boolean valueIsReversed = testCase[ 1 ];
if( testCase[ 2 ] )
valueIsReversed = !valueIsReversed;
for( Long grouping : group.keySet() )
compare( "values+" + test, valueIsReversed, group.get( grouping ) );
}
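
/** Asserts the values are already in natural order, or reverse natural order when isReversed is set. */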
private void compare( String test, boolean isReversed, Collection values )
{
List<Object> groups = new ArrayList<Object>( values );
List<Object> sortedGroups = new ArrayList<Object>( groups );
Collections.sort( sortedGroups, isReversed ? Collections.reverseOrder() : null );
assertEquals( test, sortedGroups, groups );
}
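
/** Connecting a flow whose GroupBy sorts on a nonexistent field should fail. */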
public void testSortFails() throws Exception
{
String path = "fails";
if( !new File( inputFileApache ).exists() )
fail( "data file not found" );
copyFromLocal( inputFileApache );
Tap source = new Lfs( new TextLine(), inputFileApache );
Tap sink = new Lfs( new TextLine(), outputPath + path, true );
Pipe pipe = new Pipe( "apache" );
// apacheCommonParser declares: "ip", "time", "method", "event", "status", "size"
pipe = new Each( pipe, new Fields( "line" ), apacheCommonParser );
pipe = new Each( pipe, new Insert( new Fields( "col" ), 1 ), Fields.ALL );
// DateParser declares: "ts"
pipe = new Each( pipe, new Fields( "time" ), new DateParser( "dd/MMM/yyyy:HH:mm:ss Z" ), new Fields( "col", "status", "ts", "event", "ip", "size" ) );
pipe = new GroupBy( pipe, new Fields( "col" ), new Fields( "does-not-exist" ) );
pipe = new Each( pipe, new Identity() ); // let's force the stack to be exercised
Map<Object, Object> properties = getProperties();
if( MultiMapReducePlanner.getJobConf( properties ) != null )
MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 13 );
try
{
new FlowConnector( properties ).connect( source, sink, pipe );
fail( "did not throw exception" );
}
catch( Exception exception )
{
// passes
}
}
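
/**
 * Reads the tap back and asserts the expected number of lines and unique values, and that the
 * value at the given tuple position never moves against the expected sort direction.
 */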
private void validateFile( Tap tap, int length, int uniqueValues, boolean isReversed, int comparePosition ) throws IOException, ParseException
{
TupleEntryIterator iterator = tap.openForRead( new JobConf() );
Set<Long> values = new HashSet<Long>();
long lastValue = isReversed ? Long.MAX_VALUE : Long.MIN_VALUE;
int count = 0;
while( iterator.hasNext() )
{
Tuple tuple = iterator.next().getTuple();
count++;
tuple = new Tuple( (Object[]) tuple.getString( 1 ).split( "\t" ) );
long value = tuple.getLong( comparePosition );
values.add( value );
if( isReversed )
assertTrue( "out of order in " + tap, lastValue >= value );
else
assertTrue( "out of order in " + tap, lastValue <= value );
lastValue = value;
}
if( length != -1 )
assertEquals( "length of " + tap, length, count );
if( uniqueValues != -1 )
assertEquals( "unique values of " + tap, uniqueValues, values.size() );
}
}