/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading; import java.io.File; import java.io.IOException; import java.util.Map; import cascading.cascade.Cascades; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.operation.AssertionLevel; import cascading.operation.aggregator.Count; import cascading.operation.assertion.AssertNotEquals; import cascading.operation.regex.RegexParser; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.scheme.SequenceFile; import cascading.scheme.TextLine; import cascading.tap.Hfs; import cascading.tap.Tap; import cascading.tap.TapException; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import org.apache.hadoop.mapred.OutputCollector; /** * */ public class TrapTest extends ClusterTestCase { String inputFileApache = "build/test/data/apache.10.txt"; String outputPath = "build/test/output/traps/"; public TrapTest() { super( "trap tests", true, 4, 4 ); } public void testTrapNone() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new GroupBy( "reduce", pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new TextLine(), outputPath + "none/tap", true ); Tap trap = new Hfs( new TextLine(), outputPath + "none/trap", true ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, trap, pipe ); flow.complete(); validateLength( flow, 8, null ); validateLength( flow.openTrap(), 0 ); } public void testTrapEachAll() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); // always fail pipe = new Each( pipe, new Fields( "ip" ), new TestFunction( new Fields( "test" ), null ), Fields.ALL ); pipe = new GroupBy( "reduce", pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new TextLine(), outputPath + "all/tap", true ); Tap trap = new Hfs( new TextLine(), outputPath + "all/trap", true ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, trap, pipe ); flow.complete(); validateLength( flow, 0, null ); validateLength( flow.openTrap(), 10 ); } public void testTrapEachAllSequence() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); // always fail pipe = new Each( pipe, new Fields( "ip" ), new TestFunction( new Fields( "test" ), null ), Fields.ALL ); pipe = new GroupBy( "reduce", pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "allseq/tap", true ); Tap trap = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "allseq/trap", true ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, trap, pipe ); // flow.writeDOT( "traps.dot" ); flow.complete(); validateLength( flow, 0, null ); validateLength( flow.openTrap(), 10 ); } public void testTrapEveryAllAtStart() throws Exception { runTrapEveryAll( 0, "everystart", 8 ); } public void testTrapEveryAllAtAggregate() throws Exception { runTrapEveryAll( 1, "everyaggregate", 10 ); // fails at all values } public void testTrapEveryAllAtComplete() throws Exception { runTrapEveryAll( 2, "everycomplete", 8 ); } private void runTrapEveryAll( int failAt, String path, int failSize ) throws IOException { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new GroupBy( "reduce", pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); pipe = new Every( pipe, new TestFailAggregator( new Fields( "fail" ), failAt ), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new TextLine(), outputPath + path + "/tap", true ); Tap trap = new Hfs( new TextLine(), outputPath + path + "/trap", true ); Map<String, Tap> traps = Cascades.tapsMap( "reduce", trap ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, traps, pipe ); flow.complete(); validateLength( flow, 0, null ); validateLength( flow.openTrap(), failSize ); } /** * verify we can fail in randome places into the same trap * * @throws Exception */ public void testTrapEachAllChained() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); // always fail pipe = new Each( pipe, new TestFunction( new Fields( "test" ), new Tuple( 1 ), 1 ), Fields.ALL ); pipe = new Each( pipe, new TestFunction( new Fields( "test2" ), new Tuple( 2 ), 2 ), Fields.ALL ); pipe = new Each( pipe, new TestFunction( new Fields( "test3" ), new Tuple( 3 ), 3 ), Fields.ALL ); pipe = new Each( pipe, new TestFunction( new Fields( "test4" ), new Tuple( 4 ), 4 ), Fields.ALL ); Tap sink = new Hfs( new TextLine(), outputPath + "allchain/tap", true ); Tap trap = new Hfs( new TextLine(), outputPath + "allchain/trap", true ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, trap, pipe ); // flow.writeDOT( "traps.dot" ); flow.complete(); validateLength( flow, 6, null ); validateLength( flow.openTrap(), 4 ); } /** * This test verifies traps can cross m/r and step boundaries. * * @throws Exception */ public void testTrapEachEveryAllChained() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); // always fail pipe = new Each( pipe, AssertionLevel.VALID, new AssertNotEquals( "75.185.76.245" ) ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Each( pipe, AssertionLevel.VALID, new AssertNotEquals( "68.46.103.112" ) ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Each( pipe, AssertionLevel.VALID, new AssertNotEquals( "76.197.151.0" ) ); pipe = new Each( pipe, AssertionLevel.VALID, new AssertNotEquals( "12.215.138.88" ) ); Tap sink = new Hfs( new TextLine(), outputPath + "eacheverychain/tap", true ); Tap trap = new Hfs( new TextLine(), outputPath + "eacheverychain/trap", true ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, trap, pipe ); // flow.writeDOT( "traps.dot" ); flow.complete(); validateLength( flow, 6, null ); validateLength( flow.openTrap(), 4 ); } public void testTrapToSequenceFile() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); // always fail pipe = new Each( pipe, new Fields( "ip" ), new TestFunction( new Fields( "test" ), null ), Fields.ALL ); pipe = new GroupBy( "reduce", pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new TextLine(), outputPath + "seq/tap", true ); Tap trap = new Hfs( new SequenceFile( new Fields( "ip" ) ), outputPath + "seq/trap", true ); Flow flow = new FlowConnector( getProperties() ).connect( "trap test", source, sink, trap, pipe ); flow.complete(); validateLength( flow, 0, null ); validateLength( flow.openTrap(), 10 ); } private static class FailScheme extends TextLine { boolean sourceFired = false; boolean sinkFired = false; public FailScheme() { } public FailScheme( Fields sourceFields ) { super( sourceFields ); } @Override public Tuple source( Object key, Object value ) { if( !sourceFired ) { sourceFired = true; throw new TapException( "fail" ); } return super.source( key, value ); } @Override public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException { if( !sinkFired ) { sinkFired = true; throw new TapException( "fail" ); } super.sink( tupleEntry, outputCollector ); } } public void testTrapTapSourceSink() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileApache ); Tap source = new Hfs( new FailScheme( new Fields( "offset", "line" ) ), inputFileApache ); Pipe pipe = new Pipe( "map" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new FailScheme(), outputPath + "sink/tap", true ); Tap trap = new Hfs( new TextLine(), outputPath + "sink/trap", true ); Map<Object, Object> properties = getProperties(); // compensate for running in cluster mode properties.put( "mapred.map.tasks", 1 ); properties.put( "mapred.reduce.tasks", 1 ); Flow flow = new FlowConnector( properties ).connect( "trap test", source, sink, trap, pipe ); flow.complete(); validateLength( flow.openTapForRead( new Hfs( new TextLine(), outputPath + "sink/tap", true ) ), 6, null ); validateLength( flow.openTrap(), 2 ); } }