/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.MultiMapReducePlanner;
import cascading.operation.Identity;
import cascading.operation.Insert;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.operation.text.DateParser;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;

public class SortedValuesTest extends ClusterTestCase
  {
  String inputFileApache = "build/test/data/apache.200.txt";
  String inputFileIps = "build/test/data/ips.20.txt";
  String inputFileCross = "build/test/data/lhs+rhs-cross.txt";

  String outputPath = "build/test/output/sorting/";

  private String apacheCommonRegex = TestConstants.APACHE_COMMON_REGEX;
  private RegexParser apacheCommonParser = new RegexParser( new Fields( "ip", "time", "method", "event", "status", "size" ), apacheCommonRegex, new int[]{ 1, 2, 3, 4, 5, 6 } );

  public SortedValuesTest()
    {
    super( "sorted values", false ); // disable cluster
    }

  public void testCoGroupComparatorValues() throws Exception
    {
    runCoGroupComparatorTest( "cogroupcompareforward", false );
    }

  public void testCoGroupComparatorValuesReversed() throws Exception
    {
    runCoGroupComparatorTest( "cogroupcomparereversed", true );
    }

  private void runCoGroupComparatorTest( String path, boolean reverseSort ) throws IOException, ParseException
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );
    copyFromLocal( inputFileIps );

    Tap sourceApache = new Hfs( new TextLine(), inputFileApache );
    Tap sourceIP = new Hfs( new TextLine(), inputFileIps );
    Tap sink = new Hfs( new TextLine(), outputPath + path, true );

    Pipe apachePipe = new Pipe( "apache" );

    apachePipe = new Each( apachePipe, new Fields( "line" ), apacheCommonParser );
    apachePipe = new Each( apachePipe, new Insert( new Fields( "col" ), 1 ), Fields.ALL );
    apachePipe = new Each( apachePipe, new Fields( "ip" ), new RegexParser( new Fields( "octet" ), "^[^.]*" ), new Fields( "col", "status", "event", "octet", "size" ) );
    apachePipe = new Each( apachePipe, new Fields( "octet" ), new Identity( long.class ), Fields.REPLACE );

    Fields groupApache = new Fields( "octet" );
    groupApache.setComparator( "octet", new TestLongComparator( reverseSort ) );

    Pipe ipPipe = new Pipe( "ip" );

    ipPipe = new Each( ipPipe, new Fields( "line" ), new Identity( new Fields( "rawip" ) ) );
    ipPipe = new Each( ipPipe, new Fields( "rawip" ), new RegexParser( new Fields( "rawoctet" ), "^[^.]*" ), new Fields( "rawoctet" ) );
    ipPipe = new Each( ipPipe, new Fields( "rawoctet" ), new Identity( long.class ), Fields.REPLACE );

    Fields groupIP = new Fields( "rawoctet" );
    groupIP.setComparator( "rawoctet", new TestLongComparator( reverseSort ) );

    Pipe pipe = new CoGroup( apachePipe, groupApache, ipPipe, groupIP );

    pipe = new Each( pipe, new Identity() ); // let's force the stack to be exercised

    Map<Object, Object> properties = getProperties();

    if( MultiMapReducePlanner.getJobConf( properties ) != null )
      MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 13 );

    Map sources = new HashMap();
    sources.put( "apache", sourceApache );
    sources.put( "ip", sourceIP );

    Flow flow = new FlowConnector( properties ).connect( sources, sink, pipe );

    flow.complete();

    validateFile( sink, 199, 16, reverseSort, 5 );
    }

  public void testComprehensiveGroupBy() throws IOException
    {
    Boolean[][] testArray = new Boolean[][]{
      // test group comparators
      {false, null, false},
      {true, null, false},

      // test group, reversed
      {false, null, true},
      {true, null, true},

      // test group and sort comparators
      {false, false, false},
      {true, false, false},
      {true, true, false},
      {false, true, false},

      // test group and sort comparators, reversed
      {false, false, true},
      {true, false, true},
      {true, true, true},
      {false, true, true}
    };

    for( int i = 0; i < testArray.length; i++ )
      runComprehensiveCase( testArray[ i ], false );

    for( int i = 0; i < testArray.length; i++ )
      runComprehensiveCase( testArray[ i ], true );
    }

  private void runComprehensiveCase( Boolean[] testCase, boolean useCollectionsComparator ) throws IOException
    {
    if( !new File( inputFileCross ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileCross );

    String test = Util.join( testCase, "_", true );
    String path = "comprehensive/" + test;

    Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileCross );
    Tap sink = new Hfs( new TextLine( new Fields( "line" ), new Fields( "num", "lower", "upper" ), 1 ), outputPath + path, true );

    Pipe pipe = new Pipe( "comprehensivesort" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ), "\\s" ) );
    pipe = new Each( pipe, new Fields( "num" ), new Identity( long.class ), Fields.REPLACE );

    Fields groupFields = new Fields( "num" );

    if( testCase[ 0 ] )
      groupFields.setComparator( "num", useCollectionsComparator ? Collections.reverseOrder() : new TestLongComparator() );

    Fields sortFields = null;

    if( testCase[ 1 ] != null )
      {
      sortFields = new Fields( "upper" );

      if( testCase[ 1 ] )
        sortFields.setComparator( "upper", useCollectionsComparator ? Collections.reverseOrder() : new TestStringComparator() );
      }

    pipe = new GroupBy( pipe, groupFields, sortFields, testCase[ 2 ] );

    Map<Object, Object> properties = getProperties();

    if( MultiMapReducePlanner.getJobConf( properties ) != null )
      MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 13 );

    Flow flow = new FlowConnector( properties ).connect( source, sink, pipe );

    flow.complete();

    validateCase( test, testCase, sink );
    }

  private void validateCase( String test, Boolean[] testCase, Tap sink ) throws IOException
    {
    TupleEntryIterator iterator = sink.openForRead( new JobConf() );

    LinkedHashMap<Long, List<String>> group = new LinkedHashMap<Long, List<String>>();

    while( iterator.hasNext() )
      {
      Tuple tuple = iterator.next().getTuple();

      String[] values = tuple.getString( 0 ).split( "\\s" );

      long num = Long.parseLong( values[ 0 ] );

      if( !group.containsKey( num ) )
        group.put( num, new ArrayList<String>() );

      group.get( num ).add( values[ 2 ] );
      }

    boolean groupIsReversed = testCase[ 0 ];

    if( testCase[ 2 ] )
      groupIsReversed = !groupIsReversed;

    compare( "grouping+" + test, groupIsReversed, group.keySet() );

    if( testCase[ 1 ] == null )
      return;

    boolean valueIsReversed = testCase[ 1 ];

    if( testCase[ 2 ] )
      valueIsReversed = !valueIsReversed;

    for( Long grouping : group.keySet() )
      compare( "values+" + test, valueIsReversed, group.get( grouping ) );
    }

  private void compare( String test, boolean isReversed, Collection values )
    {
    List<Object> groups = new ArrayList<Object>( values );
    List<Object> sortedGroups = new ArrayList<Object>( groups );

    Collections.sort( sortedGroups, isReversed ? Collections.reverseOrder() : null );

    assertEquals( test, sortedGroups, groups );
    }

  public void testSortFails() throws Exception
    {
    String path = "fails";

    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Lfs( new TextLine(), inputFileApache );
    Tap sink = new Lfs( new TextLine(), outputPath + path, true );

    Pipe pipe = new Pipe( "apache" );

    // RegexParser.APACHE declares: "time", "method", "event", "status", "size"
    pipe = new Each( pipe, new Fields( "line" ), apacheCommonParser );
    pipe = new Each( pipe, new Insert( new Fields( "col" ), 1 ), Fields.ALL );

    // DateParser.APACHE declares: "ts"
    pipe = new Each( pipe, new Fields( "time" ), new DateParser( "dd/MMM/yyyy:HH:mm:ss Z" ), new Fields( "col", "status", "ts", "event", "ip", "size" ) );

    pipe = new GroupBy( pipe, new Fields( "col" ), new Fields( "does-not-exist" ) );

    pipe = new Each( pipe, new Identity() ); // let's force the stack to be exercised

    Map<Object, Object> properties = getProperties();

    MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 13 );

    try
      {
      new FlowConnector( properties ).connect( source, sink, pipe );
      fail( "did not throw exception" );
      }
    catch( Exception exception )
      {
      // passes
      }
    }

  private void validateFile( Tap tap, int length, int uniqueValues, boolean isReversed, int comparePosition ) throws IOException, ParseException
    {
    TupleEntryIterator iterator = tap.openForRead( new JobConf() );

    Set<Long> values = new HashSet<Long>();
    long lastValue = isReversed ? Long.MAX_VALUE : Long.MIN_VALUE;
    int count = 0;

    while( iterator.hasNext() )
      {
      Tuple tuple = iterator.next().getTuple();
      count++;

      tuple = new Tuple( (Object[]) tuple.getString( 1 ).split( "\t" ) );

      long value = tuple.getLong( comparePosition );

      values.add( value );

      if( isReversed )
        assertTrue( "out of order in " + tap, lastValue >= value );
      else
        assertTrue( "out of order in " + tap, lastValue <= value );

      lastValue = value;
      }

    if( length != -1 )
      assertEquals( "length of " + tap, length, count );

    if( uniqueValues != -1 )
      assertEquals( "unique values of " + tap, uniqueValues, values.size() );
    }
  }
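
/*
 * Note: TestLongComparator and TestStringComparator are defined elsewhere in the test
 * source tree, not in this file. A minimal sketch of what such a comparator is assumed
 * to look like is given below; the field name is hypothetical, but from validateCase()
 * the no-arg constructor must sort in descending order, and the class must be
 * java.io.Serializable so the planner can ship it inside the job configuration.
 *
 *   public class TestLongComparator implements Comparator<Long>, Serializable
 *     {
 *     boolean reverse = true; // no-arg constructor sorts descending
 *
 *     public TestLongComparator()
 *       {
 *       }
 *
 *     public TestLongComparator( boolean reverse )
 *       {
 *       this.reverse = reverse;
 *       }
 *
 *     public int compare( Long lhs, Long rhs )
 *       {
 *       return reverse ? rhs.compareTo( lhs ) : lhs.compareTo( rhs );
 *       }
 *     }
 *
 * TestStringComparator is assumed to follow the same shape over String values.
 */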