/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.flow;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import cascading.CascadingTestCase;
import cascading.TestBuffer;
import cascading.TestFunction;
import cascading.operation.AssertionLevel;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.aggregator.First;
import cascading.operation.assertion.AssertNotNull;
import cascading.operation.assertion.AssertNull;
import cascading.operation.expression.ExpressionFilter;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.InnerJoin;
import cascading.scheme.Scheme;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.TempHfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import org.jgrapht.alg.DijkstraShortestPath;
import org.jgrapht.graph.SimpleDirectedGraph;
public class BuildJobsTest extends CascadingTestCase
{
/** Creates the test case, passing the fixed name "build jobs" to the CascadingTestCase constructor. */
public BuildJobsTest()
{
super( "build jobs" );
}
/**
 * Plans a flow made of a single bare Pipe between two text-line taps. Planning should not fail
 * (the original note says an Identity pipe is inserted), and the result must be exactly one
 * step having one source, no group and a sink.
 *
 * @throws Exception if planning the flow or initializing the step fails
 */
public void testIdentity() throws Exception
{
  Tap input = new Hfs( new TextLine(), "input/path" );
  Tap output = new Hfs( new TextLine(), "output/path", true );
  Pipe bare = new Pipe( "test" );

  Flow planned = new FlowConnector().connect( input, output, bare );
  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "wrong size", 1, plannedSteps.size() );

  // the list is already typed, so no cast is needed to read the step
  FlowStep only = plannedSteps.get( 0 );

  only.getJobConf(); // invoked only to initialize the step before it is inspected

  assertEquals( "not equal: step.sources.size()", 1, only.sources.size() );
  assertNull( "not null: step.groupBy", only.getGroup() );
  assertNotNull( "null: step.sink", only.sink );
}
/** Checks that GroupBy/Every and Each pipes built on top of a Pipe named "count" report that same name. */
public void testName()
{
  Pipe head = new Pipe( "count" );

  Pipe grouped = new GroupBy( head, new Fields( 1 ) );
  Pipe tail = new Every( grouped, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

  assertEquals( "not equal: count.getName()", "count", head.getName() );
  assertEquals( "not equal: pipe.getName()", "count", tail.getName() );

  // an Each hanging directly off the head must report the head's name too
  tail = new Each( head, new Fields( 1 ), new RegexSplitter( Fields.size( 2 ) ) );

  assertEquals( "not equal: pipe.getName()", "count", tail.getName() );
}
/**
 * Plans a GroupBy followed by a Count aggregator and expects a single step with one source,
 * a group and a sink. Also asserts the countDistance values: 0 from the source to the group
 * and 1 from the group to the sink.
 *
 * @throws IOException declared by the original test signature
 */
public void testOneJob() throws IOException
{
  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();

  sourceTaps.put( "count", new Hfs( new Fields( "first", "second" ), "input/path" ) );
  sinkTaps.put( "count", new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Pipe head = new Pipe( "count" );
  Pipe grouped = new GroupBy( head, new Fields( 1 ) );
  Pipe counted = new Every( grouped, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, counted );
  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "wrong size", 1, plannedSteps.size() );

  FlowStep only = plannedSteps.get( 0 );

  only.getJobConf(); // invoked only to initialize the step before it is inspected

  assertEquals( "not equal: step.sources.size()", 1, only.sources.size() );
  assertNotNull( "null: step.groupBy", only.getGroup() );
  assertNotNull( "null: step.sink", only.sink );

  int sourceToGroup = countDistance( only.graph, only.sources.keySet().iterator().next(), only.getGroup() );

  assertEquals( "not equal: mapDist", 0, sourceToGroup );

  int groupToSink = countDistance( only.graph, only.getGroup(), only.sink );

  assertEquals( "not equal: reduceDist", 1, groupToSink );
}
/**
 * Plans two Identity Each pipes ahead of a GroupBy and a Count aggregator and expects a
 * single step. Asserts the countDistance values: 2 from the source to the group and 1 from
 * the group to the sink.
 *
 * @throws IOException declared by the original test signature
 */
public void testOneJob2() throws IOException
{
  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();

  sourceTaps.put( "count", new Hfs( new Fields( "first", "second" ), "input/path" ) );
  sinkTaps.put( "count", new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Pipe head = new Pipe( "count" );

  // in:second out:all
  Pipe copied = new Each( head, new Fields( 1 ), new Identity(), new Fields( 2 ) );

  // in:all out:_all
  Pipe renamed = new Each( copied, new Fields( 0 ), new Identity( new Fields( "_all" ) ), new Fields( 1 ) );

  // in:_all out:_all
  Pipe grouped = new GroupBy( renamed, new Fields( 0 ) );

  // in:_all out:_all,count
  Pipe counted = new Every( grouped, new Fields( 0 ), new Count(), new Fields( 0, 1 ) );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, counted );
  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "wrong size", 1, plannedSteps.size() );

  FlowStep only = plannedSteps.get( 0 );

  only.getJobConf(); // invoked only to initialize the step before it is inspected

  assertEquals( "not equal: step.sources.size()", 1, only.sources.size() );
  assertNotNull( "null: step.groupBy", only.getGroup() );
  assertNotNull( "null: step.sink", only.sink );

  int sourceToGroup = countDistance( only.graph, only.sources.keySet().iterator().next(), only.getGroup() );

  assertEquals( "not equal: mapDist", 2, sourceToGroup );

  int groupToSink = countDistance( only.graph, only.getGroup(), only.sink );

  assertEquals( "not equal: reduceDist", 1, groupToSink );
}
/**
 * Plans a CoGroup fed directly by two sources and sunk directly, expecting a single step
 * with two sources. Asserts a countDistance of 0 from each source to the group and 0 from
 * the group to the sink.
 *
 * @throws IOException declared by the original test signature
 */
public void testOneJob3() throws IOException
{
  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();

  sourceTaps.put( "a", new Hfs( new Fields( "first", "second" ), "input/path/a" ) );
  sourceTaps.put( "b", new Hfs( new Fields( "third", "fourth" ), "input/path/b" ) );

  Pipe headA = new Pipe( "a" );
  Pipe headB = new Pipe( "b" );
  Pipe joined = new CoGroup( headA, new Fields( 1 ), headB, new Fields( 1 ) );

  sinkTaps.put( joined.getName(), new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, joined );
  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "wrong size", 1, plannedSteps.size() );

  FlowStep only = plannedSteps.get( 0 );

  only.getJobConf(); // invoked only to initialize the step before it is inspected

  assertEquals( "not equal: step.sources.size()", 2, only.sources.size() );
  assertNotNull( "null: step.groupBy", only.getGroup() );
  assertNotNull( "null: step.sink", only.sink );

  // exactly two sources were asserted above, so this visits the same two taps the
  // original reached through two explicit iterator.next() calls
  for( Tap sourceTap : only.sources.keySet() )
  {
    int sourceToGroup = countDistance( only.graph, sourceTap, only.getGroup() );

    assertEquals( "not equal: mapDist", 0, sourceToGroup );
  }

  int groupToSink = countDistance( only.graph, only.getGroup(), only.sink );

  assertEquals( "not equal: reduceDist", 0, groupToSink );
}
/**
 * Plans a CoGroup of two sources followed by an Identity Each, expecting a single step with
 * two sources. Asserts a countDistance of 0 from every source to the group and 1 from the
 * group to the sink.
 * <p/>
 * Both sources are verified (as testOneJob3 does for the same source-to-CoGroup layout);
 * previously only whichever source the key set happened to return first was checked.
 *
 * @throws IOException declared by the original test signature
 */
public void testOneJob4() throws IOException
{
  Map<String, Tap> sources = new HashMap<String, Tap>();
  Map<String, Tap> sinks = new HashMap<String, Tap>();

  sources.put( "a", new Hfs( new Fields( "first", "second" ), "input/path/a" ) );
  sources.put( "b", new Hfs( new Fields( "third", "fourth" ), "input/path/b" ) );

  Pipe pipeA = new Pipe( "a" );
  Pipe pipeB = new Pipe( "b" );

  Pipe cogroup = new CoGroup( pipeA, new Fields( 1 ), pipeB, new Fields( 1 ) );

  cogroup = new Each( cogroup, new Identity() );

  sinks.put( cogroup.getName(), new Hfs( new Fields( 0, 1 ), "output/path" ) );

  List<FlowStep> steps = new FlowConnector().connect( sources, sinks, cogroup ).getSteps();

  assertEquals( "wrong size", 1, steps.size() );

  FlowStep step = steps.get( 0 );

  step.getJobConf(); // invoked only to initialize the step before it is inspected

  assertEquals( "not equal: step.sources.size()", 2, step.sources.size() );
  assertNotNull( "null: step.groupBy", step.getGroup() );
  assertNotNull( "null: step.sink", step.sink );

  // every source feeds the CoGroup directly, so each one must be at distance 0
  for( Tap source : step.sources.keySet() )
  {
    int mapDist = countDistance( step.graph, source, step.getGroup() );

    assertEquals( "not equal: mapDist", 0, mapDist );
  }

  int reduceDist = countDistance( step.graph, step.getGroup(), step.sink );

  assertEquals( "not equal: reduceDist", 1, reduceDist );
}
/**
 * Plans a CoGroup built without explicit grouping fields, followed by an Identity Each and a
 * text-line sink, expecting a single step with two sources. Asserts a countDistance of 0
 * from every source to the group and 1 from the group to the sink.
 * <p/>
 * Both sources are verified (as testOneJob3 does for the same source-to-CoGroup layout);
 * previously only whichever source the key set happened to return first was checked.
 *
 * @throws IOException declared by the original test signature
 */
public void testOneJob5() throws IOException
{
  Map<String, Tap> sources = new HashMap<String, Tap>();
  Map<String, Tap> sinks = new HashMap<String, Tap>();

  sources.put( "a", new Hfs( new Fields( "first", "second" ), "input/path/a" ) );
  sources.put( "b", new Hfs( new Fields( "third", "fourth" ), "input/path/b" ) );

  Pipe pipeA = new Pipe( "a" );
  Pipe pipeB = new Pipe( "b" );

  Pipe splice = new CoGroup( pipeA, pipeB );

  splice = new Each( splice, new Identity() );

  sinks.put( splice.getName(), new Hfs( new TextLine(), "output/path" ) );

  List<FlowStep> steps = new FlowConnector().connect( sources, sinks, splice ).getSteps();

  assertEquals( "wrong size", 1, steps.size() );

  FlowStep step = steps.get( 0 );

  step.getJobConf(); // invoked only to initialize the step before it is inspected

  assertEquals( "not equal: step.sources.size()", 2, step.sources.size() );
  assertNotNull( "null: step.groupBy", step.getGroup() );
  assertNotNull( "null: step.sink", step.sink );

  // every source feeds the CoGroup directly, so each one must be at distance 0
  for( Tap source : step.sources.keySet() )
  {
    int mapDist = countDistance( step.graph, source, step.getGroup() );

    assertEquals( "not equal: mapDist", 0, mapDist );
  }

  int reduceDist = countDistance( step.graph, step.getGroup(), step.sink );

  assertEquals( "not equal: reduceDist", 1, reduceDist );
}
/**
 * Builds an Every that follows an Each with no grouping pipe in between and expects
 * connecting the flow to throw.
 *
 * @throws IOException declared by the original test signature
 */
public void testNoGroup() throws IOException
{
  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();

  sourceTaps.put( "count", new Hfs( new Fields( "first", "second" ), "input/path" ) );
  sinkTaps.put( "count", new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Pipe head = new Pipe( "count" );
  Pipe passThrough = new Each( head, new Identity() );
  Pipe ungrouped = new Every( passThrough, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

  try
  {
    new FlowConnector().connect( sourceTaps, sinkTaps, ungrouped );
    fail( "did not throw flow exception" );
  }
  catch( Exception expected )
  {
    // any exception from the planner counts as success here; nothing further is checked
  }
}
/**
 * Splits one filtered stream into a "left" and a "right" branch, each with its own sink.
 * The plan should contain only two steps, one for each side.
 */
public void testSplit()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap leftSink = new Hfs( new TextLine(), "foo/split1", true );
  Tap rightSink = new Hfs( new TextLine(), "foo/split2", true );

  Pipe head = new Pipe( "split" );
  Pipe filtered = new Each( head, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  Pipe leftBranch = new Pipe( "left", filtered );
  Pipe left = new Each( leftBranch, new Fields( "line" ), new RegexFilter( ".*46.*" ) );

  Pipe rightBranch = new Pipe( "right", filtered );
  Pipe right = new Each( rightBranch, new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "split", input );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "left", leftSink );
  sinkTaps.put( "right", rightSink );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, left, right );
  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 2, plannedSteps.size() );
}
/**
 * Verifies that the planner recognizes there are fewer tails than sinks: two sinks
 * ("left" and "right") are registered but only the shared upstream pipe is passed as a tail,
 * so connecting must fail with a message that names the unbound sinks.
 */
public void testSplitHangingTails()
{
  Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap sink1 = new Hfs( new TextLine(), "foo/split1", true );
  Tap sink2 = new Hfs( new TextLine(), "foo/split2", true );

  Pipe pipe = new Pipe( "split" );

  pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  // left and right are intentionally never handed to the connector below
  Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "split", source );

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "left", sink1 );
  sinks.put( "right", sink2 );

  try
  {
    new FlowConnector().connect( sources, sinks, pipe );
    fail( "did not catch missing tails" );
  }
  catch( Exception exception )
  {
    String message = exception.getMessage();

    System.out.println( "exception.getMessage() = " + message );

    // guard first so a message-less exception fails as an assertion, not a NullPointerException
    assertNotNull( "exception carried no message", message );
    assertTrue( "message does not name the missing tails: " + message, message.contains( "'left', 'right'" ) );
  }
}
/**
 * Splits a stream downstream of an operation flagged as not safe. Expects three steps, the
 * first of which holds two operations.
 */
public void testSplitOnNonSafeOperations()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap leftSink = new Hfs( new TextLine(), "foo/split1", true );
  Tap rightSink = new Hfs( new TextLine(), "foo/split2", true );

  Pipe head = new Pipe( "split" );

  // this operation is not safe
  Pipe unsafe = new Each( head, new Fields( "line" ), new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );

  Pipe filtered = new Each( unsafe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  Pipe leftBranch = new Pipe( "left", filtered );
  Pipe left = new Each( leftBranch, new Fields( "line" ), new RegexFilter( ".*46.*" ) );

  Pipe rightBranch = new Pipe( "right", filtered );
  Pipe right = new Each( rightBranch, new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "split", input );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "left", leftSink );
  sinkTaps.put( "right", rightSink );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, left, right );

  // planned.writeDOT( "splitonnonsafe.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 3, plannedSteps.size() );

  FlowStep first = plannedSteps.get( 0 );

  assertEquals( "wrong number of operations", 2, first.getAllOperations().size() );
}
/**
 * Verifies unsafe splits happen when splitting on a pipe: the stream is renamed "middle"
 * after a not-safe operation, given its own sink, and then split left and right. Expects four
 * steps, the first of which holds two operations.
 */
public void testSplitOnNonSafeOperations2()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap leftSink = new Hfs( new TextLine(), "foo/split1", true );
  Tap rightSink = new Hfs( new TextLine(), "foo/split2", true );
  Tap middleSink = new Hfs( new TextLine(), "foo/split3", true );

  Pipe head = new Pipe( "split" );

  // this operation is not safe
  Pipe unsafe = new Each( head, new Fields( "line" ), new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );

  Pipe filtered = new Each( unsafe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

  Pipe middle = new Pipe( "middle", filtered );

  Pipe leftBranch = new Pipe( "left", middle );
  Pipe left = new Each( leftBranch, new Fields( "line" ), new RegexFilter( ".*46.*" ) );

  Pipe rightBranch = new Pipe( "right", middle );
  Pipe right = new Each( rightBranch, new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "split", input );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "left", leftSink );
  sinkTaps.put( "right", rightSink );
  sinkTaps.put( "middle", middleSink );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, left, right );

  // planned.writeDOT( "splitonnonsafe.dot" );
  // planned.writeStepsDOT( "splitonnonsafe-steps.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 4, plannedSteps.size() );

  FlowStep first = plannedSteps.get( 0 );

  assertEquals( "wrong number of operations", 2, first.getAllOperations().size() );
}
/**
 * Splits a grouped and counted stream after a trailing Each filter. This should result in a
 * Temp Tap after the Each split.
 * <p/>
 * We previously would push the Each to the next step, but if there is already data being
 * written, save the cpu.
 */
public void testSplitComplex()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap leftSink = new Hfs( new TextLine(), "foo/split1", true );
  Tap rightSink = new Hfs( new TextLine(), "foo/split2", true );

  Pipe head = new Pipe( "split" );
  Pipe parsed = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  Pipe grouped = new GroupBy( parsed, new Fields( "ip" ) );
  Pipe counted = new Every( grouped, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );
  Pipe filtered = new Each( counted, new Fields( "ip" ), new RegexFilter( "^68.*" ) );

  Pipe leftBranch = new Pipe( "left", filtered );
  Pipe left = new Each( leftBranch, new Fields( "ip" ), new RegexFilter( ".*46.*" ) );

  Pipe rightBranch = new Pipe( "right", filtered );
  Pipe right = new Each( rightBranch, new Fields( "ip" ), new RegexFilter( ".*192.*" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "split", input );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "left", leftSink );
  sinkTaps.put( "right", rightSink );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, left, right );

  // planned.writeDOT( "splitcomplex.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 3, plannedSteps.size() );

  FlowStep first = plannedSteps.get( 0 );

  // walk the first step downstream from its group: Every, then Each, then the temp tap
  Scope afterGroup = first.getNextScope( first.getGroup() );
  FlowElement element = first.getNextFlowElement( afterGroup );

  assertTrue( "not an Every", element instanceof Every );

  Scope afterEvery = first.getNextScope( element );
  element = first.getNextFlowElement( afterEvery );

  assertTrue( "not a Each", element instanceof Each );

  Scope afterEach = first.getNextScope( element );
  element = first.getNextFlowElement( afterEach );

  assertTrue( "not a TempHfs", element instanceof TempHfs );
}
/**
 * Same as splitComplex, except the pipe/branch naming is applied after the branch's Each
 * rather than before it. The same three-step plan and Every, Each, TempHfs ordering in the
 * first step are expected.
 */
public void testSplitComplex2()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap leftSink = new Hfs( new TextLine(), "foo/split1", true );
  Tap rightSink = new Hfs( new TextLine(), "foo/split2", true );

  Pipe head = new Pipe( "split" );
  Pipe parsed = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  Pipe grouped = new GroupBy( parsed, new Fields( "ip" ) );
  Pipe counted = new Every( grouped, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );
  Pipe filtered = new Each( counted, new Fields( "ip" ), new RegexFilter( "^68.*" ) );

  // the branch name wraps the Each here, instead of the Each wrapping the named branch
  Pipe leftFilter = new Each( filtered, new Fields( "ip" ), new RegexFilter( ".*46.*" ) );
  Pipe left = new Pipe( "left", leftFilter );

  Pipe rightFilter = new Each( filtered, new Fields( "ip" ), new RegexFilter( ".*192.*" ) );
  Pipe right = new Pipe( "right", rightFilter );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "split", input );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "left", leftSink );
  sinkTaps.put( "right", rightSink );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, left, right );

  // planned.writeDOT( "splitcomplex.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 3, plannedSteps.size() );

  FlowStep first = plannedSteps.get( 0 );

  // walk the first step downstream from its group: Every, then Each, then the temp tap
  Scope afterGroup = first.getNextScope( first.getGroup() );
  FlowElement element = first.getNextFlowElement( afterGroup );

  assertTrue( "not an Every", element instanceof Every );

  Scope afterEvery = first.getNextScope( element );
  element = first.getNextFlowElement( afterEvery );

  assertTrue( "not a Each", element instanceof Each );

  Scope afterEach = first.getNextScope( element );
  element = first.getNextFlowElement( afterEach );

  assertTrue( "not a TempHfs", element instanceof TempHfs );
}
/** Merges two filtered text-line sources through a single GroupBy and expects a one-step plan. */
public void testMerge()
{
  Tap leftInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );
  Tap rightInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge2" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Pipe[] branches = Pipe.pipes( left, right );
  Pipe merged = new GroupBy( "merge", branches, new Fields( "offset" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "left", leftInput );
  sourceTaps.put( "right", rightInput );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "merge", output );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, merged );

  // planned.writeDOT( "merged.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 1, plannedSteps.size() );
}
/**
 * Merges two branches whose source taps are distinct instances pointing at the same path
 * ("foo/merge"); the right branch carries four stacked filters. Expects a one-step plan.
 */
public void testDupeSource()
{
  Tap leftInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap rightInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  // stack three more identical filters on the right branch
  for( int extra = 0; extra < 3; extra++ )
  {
    right = new Each( right, new Fields( "line" ), new RegexFilter( ".*192.*" ) );
  }

  Pipe[] branches = Pipe.pipes( left, right );
  Pipe merged = new GroupBy( "merge", branches, new Fields( "offset" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "left", leftInput );
  sourceTaps.put( "right", rightInput );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "merge", output );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, merged );

  // planned.writeDOT( "dupesource.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 1, plannedSteps.size() );
}
/**
 * Builds a CoGroup over a single pipe using the constructor that takes an int argument
 * (1 here) and four declared fields, and expects a one-step plan.
 */
public void testDupeSourceRepeat()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe head = new Pipe( "pipe" );
  Pipe joined = new CoGroup( "cogroup", head, new Fields( "offset" ), 1, Fields.size( 4 ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "pipe", input );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "cogroup", output );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, joined );

  // planned.writeDOT( "dupesource.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 1, plannedSteps.size() );
}
/**
 * Binds the very same Tap instance to both sides of a CoGroup and expects connecting the
 * flow to throw.
 */
public void testDupeSource2()
{
  Tap sharedInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe left = new Pipe( "left" );
  Pipe right = new Pipe( "right" );

  Pipe joined = new CoGroup( "cogroup", left, new Fields( "offset" ), right, new Fields( "offset" ), Fields.size( 4 ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "left", sharedInput );
  sourceTaps.put( "right", sharedInput );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "cogroup", output );

  try
  {
    new FlowConnector().connect( sourceTaps, sinkTaps, joined );
    fail( "did not throw planner exception" );
  }
  catch( Exception expected )
  {
    // any exception counts as success here; its type and message are not inspected
  }
}
/**
 * Three-way CoGroup in which the "left" and "right" heads are bound to the same Tap instance
 * while "middle" uses a different one. Connecting is expected to throw a PlannerException.
 */
public void testDupeSource3()
{
  Tap sharedInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap otherInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar/merge" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe left = new Pipe( "left" );
  Pipe middle = new Pipe( "middle" );
  Pipe right = new Pipe( "right" );

  Pipe[] heads = Pipe.pipes( left, middle, right );
  Fields[] joinFields = Fields.fields( new Fields( "offset" ), new Fields( "offset" ), new Fields( "offset" ) );

  Pipe joined = new CoGroup( "cogroup", heads, joinFields, Fields.size( 6 ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "left", sharedInput );
  sourceTaps.put( "middle", otherInput );
  sourceTaps.put( "right", sharedInput );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "cogroup", output );

  try
  {
    new FlowConnector().connect( sourceTaps, sinkTaps, joined );
    fail( "did not throw planner exception" );
  }
  catch( PlannerException expected )
  {
    // the planner rejected the duplicated source, which is the outcome under test
  }
}
// public void testEquivalentPaths()
// {
// Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
// Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
//
// Tap sink = new Hfs( new TextLine(), "foo" );
//
// Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
// Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
//
// Pipe join = new CoGroup( "cogroup", left, new Fields( "offset" ), right, new Fields( "offset" ), Fields.size( 4 ) );
//
// Map sources = new HashMap();
// sources.put( "left", source1 );
// sources.put( "right", source2 );
//
// Map sinks = new HashMap();
// sinks.put( "cogroup", sink );
//
// Flow flow = new FlowConnector().connect( sources, sinks, join );
// flow.writeDOT( "identicalpaths.dot" );
//
// List<FlowStep> steps = flow.getSteps();
//
// assertEquals( "not equal: steps.size()", 1, steps.size() );
//
// FlowStep step = steps.get( 0 );
// System.out.println( "size: " + step.sources.size() );
//
// System.out.println( "size: " + step.getNextScopes( step.sources.keySet().iterator().next()).size() );
// }
/**
 * Same merge as testMerge, but the right-hand source uses a SequenceFile scheme instead of
 * TextLine. Expects a one-step plan.
 */
public void testMerge2()
{
  Tap leftInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );
  Tap rightInput = new Hfs( new SequenceFile( new Fields( "offset", "line" ) ), "foo/merge2" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe left = new Each( new Pipe( "left" ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
  Pipe right = new Each( new Pipe( "right" ), new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Pipe[] branches = Pipe.pipes( left, right );
  Pipe merged = new GroupBy( "merge", branches, new Fields( "offset" ) );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();
  sourceTaps.put( "left", leftInput );
  sourceTaps.put( "right", rightInput );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();
  sinkTaps.put( "merge", output );

  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, merged );

  // planned.writeDOT( "merged2.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 1, plannedSteps.size() );
}
/**
 * Tests the case where the same source is split, then re-merged: one filtered head feeds a
 * "left" and a "right" branch that are joined again by a GroupBy. Expects a two-step plan.
 */
public void testMergeSameSourceSplit()
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge1" );
  Tap output = new Hfs( new TextLine(), "foo" );

  Pipe head = new Pipe( "source" );
  Pipe nonEmpty = new Each( head, new Fields( "line" ), new ExpressionFilter( "line.length() != 0", String.class ) );

  Pipe leftBranch = new Pipe( "left", nonEmpty );
  Pipe left = new Each( leftBranch, new Fields( "line" ), new RegexFilter( ".*46.*" ) );

  Pipe rightBranch = new Pipe( "right", nonEmpty );
  Pipe right = new Each( rightBranch, new Fields( "line" ), new RegexFilter( ".*192.*" ) );

  Pipe[] branches = Pipe.pipes( left, right );
  Pipe merged = new GroupBy( "merge", branches, new Fields( "offset" ) );

  Flow planned = new FlowConnector().connect( input, output, merged );

  // planned.writeDOT( "mergedsamesource.dot" );

  List<FlowStep> plannedSteps = planned.getSteps();

  assertEquals( "not equal: steps.size()", 2, plannedSteps.size() );
}
/**
 * Feeds the result of one CoGroup into a second CoGroup, with the same Tap instance bound to
 * two of the three heads. Expects a two-step plan.
 *
 * @throws Exception if planning the flow fails
 */
public void testCoGroupAroundCoGroup() throws Exception
{
  Tap tenInput = new Hfs( new TextLine( new Fields( "num" ) ), "foo" );
  Tap twentyInput = new Hfs( new TextLine( new Fields( "num" ) ), "bar" );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();

  sourceTaps.put( "source20", twentyInput );
  sourceTaps.put( "source101", tenInput );
  sourceTaps.put( "source102", tenInput );

  // using null pos so all fields are written
  Tap output = new Hfs( new TextLine(), "baz", true );

  Pipe head20 = new Pipe( "source20" );
  Pipe head101 = new Pipe( "source101" );
  Pipe head102 = new Pipe( "source102" );

  Pipe innerJoin = new CoGroup( head20, new Fields( "num" ), head101, new Fields( "num" ), new Fields( "num1", "num2" ) );
  Pipe outerJoin = new CoGroup( innerJoin, new Fields( "num1" ), head102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

  Flow planned = new FlowConnector().connect( sourceTaps, output, outerJoin );

  // planned.writeDOT( "cogroupcogroupopt.dot" );

  assertEquals( "not equal: steps.size()", 2, planned.getSteps().size() );
}
/**
 * Same nested CoGroup as testCoGroupAroundCoGroup, but planned by a FlowConnector whose
 * properties set the intermediate scheme class to TextLine. Expects a two-step plan.
 *
 * @throws Exception if planning the flow fails
 */
public void testCoGroupAroundCoGroupOptimized() throws Exception
{
  Tap tenInput = new Hfs( new TextLine( new Fields( "num" ) ), "foo" );
  Tap twentyInput = new Hfs( new TextLine( new Fields( "num" ) ), "bar" );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();

  sourceTaps.put( "source20", twentyInput );
  sourceTaps.put( "source101", tenInput );
  sourceTaps.put( "source102", tenInput );

  // using null pos so all fields are written
  Tap output = new Hfs( new TextLine(), "baz", true );

  Pipe head20 = new Pipe( "source20" );
  Pipe head101 = new Pipe( "source101" );
  Pipe head102 = new Pipe( "source102" );

  Pipe innerJoin = new CoGroup( head20, new Fields( "num" ), head101, new Fields( "num" ), new Fields( "num1", "num2" ) );
  Pipe outerJoin = new CoGroup( innerJoin, new Fields( "num1" ), head102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

  Properties plannerProperties = new Properties();

  FlowConnector.setIntermediateSchemeClass( plannerProperties, TextLine.class );

  FlowConnector connector = new FlowConnector( plannerProperties );
  Flow planned = connector.connect( sourceTaps, output, outerJoin );

  // planned.writeDOT( "cogroupcogroupopt.dot" );

  assertEquals( "not equal: steps.size()", 2, planned.getSteps().size() );
}
/**
 * Chains three CoGroups, each of the first two followed by an Identity Each and a GroupBy,
 * with the first regrouped join reused as the right-hand side of the last CoGroup. Expects a
 * five-step plan.
 *
 * @throws Exception if planning the flow fails
 */
public void testCoGroupAroundCoGroupAroundCoGroup() throws Exception
{
  Tap lowerInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap upperInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();

  sourceTaps.put( "lower", lowerInput );
  sourceTaps.put( "upper1", upperInput );
  sourceTaps.put( "upper2", upperInput );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap output = new Hfs( new TextLine(), "output", true );

  Pipe lower = new Each( "lower", new Fields( "line" ), splitter );
  Pipe upper1 = new Each( "upper1", new Fields( "line" ), splitter );
  Pipe upper2 = new Each( "upper2", new Fields( "line" ), splitter );

  Pipe firstJoin = new CoGroup( lower, new Fields( "num" ), upper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
  Pipe firstPassed = new Each( firstJoin, new Identity() );
  Pipe firstRegrouped = new GroupBy( firstPassed, new Fields( 0 ) );

  Pipe secondJoin = new CoGroup( firstRegrouped, new Fields( "num1" ), upper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
  Pipe secondPassed = new Each( secondJoin, new Identity() );
  Pipe secondRegrouped = new GroupBy( secondPassed, new Fields( 0 ) );

  Pipe finalJoin = new CoGroup( secondRegrouped, new Fields( "num1" ), firstRegrouped, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5" ) );

  // A FlowException thrown here propagates unchanged. To debug a planner failure, catch it
  // and dump the graph, e.g. exception.writeDOT( "cogroupcogroup.dot" ), before rethrowing.
  Flow planned = new FlowConnector().connect( sourceTaps, output, finalJoin );

  // planned.writeDOT( "cogroupcogroup.dot" );

  assertEquals( "not equal: steps.size()", 5, planned.getSteps().size() );
}
/**
 * Plans a CoGroup with declared result fields but no explicit result group fields, followed
 * by a First aggregator that selects "num" and "value". The test makes no assertions; it
 * passes as long as connecting the flow does not throw.
 *
 * @throws Exception if planning the flow fails
 */
public void testCoGroupWithResultGroupFieldsDefault() throws Exception
{
  Tap lowerInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap upperInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();

  sourceTaps.put( "lower", lowerInput );
  sourceTaps.put( "upper", upperInput );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap output = new Hfs( new TextLine(), "/complex/cogroup/", true );

  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe upper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe joined = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ) );
  Pipe firstOnly = new Every( joined, new First( new Fields( "value" ) ), new Fields( "num", "value" ) );

  // planning must simply succeed; the resulting flow is not inspected
  new FlowConnector().connect( sourceTaps, output, firstOnly );
}
/**
 * Plans a CoGroup that is given an extra Fields argument ("somenum") after its declared
 * result fields, followed by a First aggregator that selects "somenum" and "value". The test
 * makes no assertions; it passes as long as connecting the flow does not throw.
 *
 * @throws Exception if planning the flow fails
 */
public void testCoGroupWithResultGroupFields() throws Exception
{
  Tap lowerInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap upperInput = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();

  sourceTaps.put( "lower", lowerInput );
  sourceTaps.put( "upper", upperInput );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap output = new Hfs( new TextLine(), "/complex/cogroup/", true );

  Pipe lower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe upper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe joined = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ), new Fields( "somenum" ) );
  Pipe firstOnly = new Every( joined, new First( new Fields( "value" ) ), new Fields( "somenum", "value" ) );

  // planning must simply succeed; the resulting flow is not inspected
  new FlowConnector().connect( sourceTaps, output, firstOnly );
}
/**
 * Chains four CoGroups directly, with no intermediate operations, reusing the first join as
 * an input to the third. Two sinks are registered ("output1", "output2") while only the
 * final "output2" CoGroup is passed as a tail. Expects a five-step plan.
 *
 * @throws Exception if planning the flow fails
 */
public void testDirectCoGroup() throws Exception
{
  Tap lowerInput = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );
  Tap upperInput = new Hfs( new TextLine( new Fields( "num", "char" ) ), "bar" );

  Map<String, Tap> sourceTaps = new HashMap<String, Tap>();

  sourceTaps.put( "lower1", lowerInput );
  sourceTaps.put( "lower2", lowerInput );
  sourceTaps.put( "upper1", upperInput );
  sourceTaps.put( "upper2", upperInput );

  // using null pos so all fields are written
  Tap firstOutput = new Hfs( new TextLine(), "output1", true );
  Tap secondOutput = new Hfs( new TextLine(), "output2", true );

  Map<String, Tap> sinkTaps = new HashMap<String, Tap>();

  sinkTaps.put( "output1", firstOutput );
  sinkTaps.put( "output2", secondOutput );

  Pipe lower1 = new Pipe( "lower1" );
  Pipe lower2 = new Pipe( "lower2" );
  Pipe upper1 = new Pipe( "upper1" );
  Pipe upper2 = new Pipe( "upper2" );

  Pipe firstJoin = new CoGroup( lower1, new Fields( "num" ), upper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

  Pipe secondJoin = new CoGroup( firstJoin, new Fields( "num1" ), upper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );

  Pipe thirdJoin = new CoGroup( "output1", secondJoin, new Fields( "num1" ), firstJoin, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5" ) );

  Pipe fourthJoin = new CoGroup( "output2", lower2, new Fields( "num" ), thirdJoin, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5", "num6", "char6" ) );

  // A FlowException thrown here propagates unchanged. To debug a planner failure, catch it
  // and dump the graph, e.g. exception.writeDOT( "directcogroup.dot" ), before rethrowing.
  Flow planned = new FlowConnector().connect( sourceTaps, sinkTaps, fourthJoin );

  // planned.writeDOT( "directcogroup.dot" );

  assertEquals( "not equal: steps.size()", 5, planned.getSteps().size() );
}
/**
 * Verifies the case where the same source pipe is fed to multiple chained CoGroups: the single
 * "upper1" pipe takes part in three of the four joins. The planned flow must have 5 steps.
 *
 * @throws Exception if the flow cannot be planned
 */
public void testMultipleCoGroupSimilarSources() throws Exception
{
  Tap lowerTap = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );
  Tap upperTap = new Hfs( new TextLine( new Fields( "num", "char" ) ), "bar" );

  Map heads = new HashMap();
  heads.put( "lower1", lowerTap );
  heads.put( "upper1", upperTap );

  // using null pos so all fields are written
  Tap firstSink = new Hfs( new TextLine(), "output1", true );
  Tap secondSink = new Hfs( new TextLine(), "output2", true );

  Map tails = new HashMap();
  tails.put( "output1", firstSink );
  tails.put( "output2", secondSink );

  Pipe lower = new Pipe( "lower1" );
  Pipe upper = new Pipe( "upper1" );

  Pipe firstJoin = new CoGroup( lower, new Fields( "num" ), upper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
  Pipe secondJoin = new CoGroup( firstJoin, new Fields( "num1" ), upper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
  Pipe thirdJoin = new CoGroup( "output1", secondJoin, new Fields( "num1" ), firstJoin, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5" ) );
  Pipe tail = new CoGroup( "output2", upper, new Fields( "num" ), thirdJoin, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3", "num4", "char4", "num5", "char5", "num6", "char6" ) );

  // a FlowException from connect() propagates unchanged and fails the test
  Flow flow = new FlowConnector().connect( heads, tails, tail );

  // flow.writeDOT( "multiplecogroupsimilarsources.dot" );
  assertEquals( "not equal: steps.size()", 5, flow.getSteps().size() );
}
/**
 * Tests that splits on a pipe both before and after a CoGroup result in proper normalization.
 * <p/>
 * The "lower1" head is split into two branches (one running through a GroupBy and Count, the other
 * renamed to "lower2"); both are joined with the "upper1" branch in one three-way CoGroup, whose
 * result is split again into the "output1" and "output2" tails. The planned flow must have 4 steps.
 *
 * @throws Exception if the flow cannot be planned
 */
public void testMultipleCoGroupSplitSources() throws Exception
{
Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );
Tap sourceUpper = new Hfs( new TextLine( new Fields( "num", "char" ) ), "bar" );
Map sources = new HashMap();
sources.put( "lower1", sourceLower );
sources.put( "upper1", sourceUpper );
// using null pos so all fields are written
Tap sink1 = new Hfs( new TextLine(), "output1", true );
Tap sink2 = new Hfs( new TextLine(), "output2", true );
Map sinks = new HashMap();
sinks.put( "output1", sink1 );
sinks.put( "output2", sink2 );
Pipe pipeLower1 = new Pipe( "lower1" );
Pipe pipeUpper1 = new Pipe( "upper1" );
// first branch split off the "lower1" head
// NOTE(review): the next two statements wrap pipeLower1 again rather than pipeLower2, so only the
// last Each ends up in pipeLower2 and the first two are discarded. A chain of three may have been
// intended (compare the pipeLower1 and pipeUpper1 chains below) -- confirm before changing, since
// it could alter the planned graph.
Pipe pipeLower2 = new Each( pipeLower1, new Identity() );
pipeLower2 = new Each( pipeLower1, new Identity() );
pipeLower2 = new Each( pipeLower1, new Identity() );
pipeLower2 = new GroupBy( pipeLower2, new Fields( "num", "char" ) );
pipeLower2 = new Every( pipeLower2, new Fields( "num", "char" ), new Count(), new Fields( "num", "char" ) );
// second branch off the same head: three chained Identity pipes, then renamed to "lower2"
pipeLower1 = new Each( pipeLower1, new Identity() );
pipeLower1 = new Each( pipeLower1, new Identity() );
pipeLower1 = new Each( pipeLower1, new Identity() );
pipeLower1 = new Pipe( "lower2", pipeLower1 );
pipeUpper1 = new Each( pipeUpper1, new Identity() );
pipeUpper1 = new Each( pipeUpper1, new Identity() );
pipeUpper1 = new Each( pipeUpper1, new Identity() );
// three-way inner join of both "lower" branches and the "upper" branch on "num"
Pipe splice1 = new CoGroup( "group", Pipe.pipes( pipeLower1, pipeLower2, pipeUpper1 ), Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ), new InnerJoin() );
// split after the CoGroup: both tails start from splice1
Pipe output1 = new Each( splice1, AssertionLevel.VALID, new AssertNotNull() );
output1 = new Each( output1, new Identity() );
output1 = new Pipe( "output1", output1 );
Pipe output2 = new Each( splice1, AssertionLevel.VALID, new AssertNull() );
output2 = new Each( output2, new Identity() );
output2 = new Pipe( "output2", output2 );
Flow flow = null;
try
{
flow = new FlowConnector().connect( sources, sinks, output1, output2 );
}
catch( FlowException exception )
{
// debugging hook only; the exception is rethrown unchanged
// exception.writeDOT( "chainedcogroup.dot" );
throw exception;
}
// flow.writeDOT( "chainedcogroup.dot" );
assertEquals( "not equal: steps.size()", 4, flow.getSteps().size() );
}
/**
 * Verifies a homogeneous split: two Each branches ("output1" and "output2") hang off the same
 * GroupBy, and the planned flow must have 3 steps.
 *
 * @throws Exception if the flow cannot be planned
 */
public void testSplitEachOnGroup() throws Exception
{
  Tap lowerTap = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );

  Map heads = new HashMap();
  heads.put( "lower1", lowerTap );

  // using null pos so all fields are written
  Tap firstSink = new Hfs( new TextLine(), "output1", true );
  Tap secondSink = new Hfs( new TextLine(), "output2", true );

  Map tails = new HashMap();
  tails.put( "output1", firstSink );
  tails.put( "output2", secondSink );

  Pipe head = new Pipe( "lower1" );
  Pipe grouped = new GroupBy( head, new Fields( 0 ) );

  Pipe left = new Each( new Pipe( "output1", grouped ), new Identity() );
  Pipe right = new Each( new Pipe( "output2", grouped ), new Identity() );

  // a PlannerException from connect() propagates unchanged and fails the test
  Flow flow = new FlowConnector().connect( heads, tails, Pipe.pipes( left, right ) );

  // flow.writeDOT( "splitout.dot" );
  List<FlowStep> plannedSteps = flow.getSteps();

  assertEquals( "not equal: steps.size()", 3, plannedSteps.size() );
}
/**
 * Expects the planner to reject a GroupBy that is split into two Every branches, each applying
 * its own TestBuffer. The test fails unless connect() throws a PlannerException.
 *
 * @throws Exception if anything other than the expected PlannerException is thrown
 */
public void testSplitEveryOnGroup() throws Exception
{
  Tap lowerTap = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );

  Map heads = new HashMap();
  heads.put( "lower1", lowerTap );

  // using null pos so all fields are written
  Tap firstSink = new Hfs( new TextLine(), "output1", true );
  Tap secondSink = new Hfs( new TextLine(), "output2", true );

  Map tails = new HashMap();
  tails.put( "output1", firstSink );
  tails.put( "output2", secondSink );

  Pipe head = new Pipe( "lower1" );
  Pipe grouped = new GroupBy( head, new Fields( 0 ) );

  Pipe left = new Every( new Pipe( "output1", grouped ), new TestBuffer( new Fields( "left" ), true ) );
  Pipe right = new Every( new Pipe( "output2", grouped ), new TestBuffer( new Fields( "right" ), true ) );

  try
  {
    new FlowConnector().connect( heads, tails, Pipe.pipes( left, right ) );
    fail( "did not throw planner exception" );
  }
  catch( PlannerException expected )
  {
    // this is the outcome under test
    // expected.writeDOT( "splitout.dot" );
    // expected.printStackTrace( );
  }
}
/**
 * Plans a flow in which the "output1" GroupBy is both a tail and the input of the "output2"
 * GroupBy, so its output is split between a sink and a downstream group. Expects 3 steps.
 *
 * @throws Exception if the flow cannot be planned
 */
public void testSplitOutput() throws Exception
{
  Tap lowerTap = new Hfs( new TextLine( new Fields( "num", "char" ) ), "foo" );

  Map heads = new HashMap();
  heads.put( "lower1", lowerTap );

  // using null pos so all fields are written
  Tap firstSink = new Hfs( new TextLine(), "output1", true );
  Tap secondSink = new Hfs( new TextLine(), "output2", true );

  Map tails = new HashMap();
  tails.put( "output1", firstSink );
  tails.put( "output2", secondSink );

  Pipe head = new Pipe( "lower1" );

  Pipe left = new GroupBy( "output1", head, new Fields( 0 ) );
  Pipe right = new GroupBy( "output2", left, new Fields( 0 ) );

  // a FlowException from connect() propagates unchanged and fails the test
  Flow flow = new FlowConnector().connect( heads, tails, Pipe.pipes( left, right ) );

  // flow.writeDOT( "splitout.dot" );
  List<FlowStep> plannedSteps = flow.getSteps();

  assertEquals( "not equal: steps.size()", 3, plannedSteps.size() );

  // for( FlowStep step : steps )
  // {
  // if( step.group != null )
  // continue;
  //
  // Scope nextScope = step.getNextScope( step.sources.keySet().iterator().next() );
  // FlowElement operator = step.getNextFlowElement( nextScope );
  //
  // assertTrue( "should be Pipe", operator instanceof Pipe );
  // }
}
/**
* DISABLED
* <p/>
* Having pipes with the same name was found to be too error prone. The workaround is to bind the tap to
* both names. If the process logically must use the same tap for each branch, then the branch should be split.
* <p/>
* This tests whether two pipes can have the same name, and thus logically the same input source.
* <p/>
* Further, a GroupBy with two inputs would fail if the source was directly associated, but there is a Group
* function between the source and the merge, so it passes.
*
* @throws java.io.IOException
*/
// public void testSameHeadName() throws IOException
// {
// Map sources = new HashMap();
// Map sinks = new HashMap();
//
// sources.put( "a", new Hfs( new Fields( "first", "second" ), "input/path/a" ) );
//
// Pipe pipeA = new Pipe( "a" );
// Pipe pipeB = new Pipe( "a" );
//
// Pipe group1 = new GroupBy( "a1", pipeA, Fields.FIRST );
// Pipe group2 = new GroupBy( "a2", pipeB, Fields.FIRST );
//
// Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );
//
// sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );
//
// Flow flow = new FlowConnector().connect( sources, sinks, merge );
//
// assertEquals( "not equal: steps.size()", 3, flow.getSteps().size() );
// }
/**
 * The alternative to having two pipes with the same name: one head pipe ("a") is split across
 * two GroupBy branches, which a final GroupBy merges again. The planned flow must have 3 steps.
 *
 * @throws IOException
 */
public void testSameSourceForBranch() throws IOException
{
  Map heads = new HashMap();
  Map tails = new HashMap();

  heads.put( "a", new Hfs( new Fields( "first", "second" ), "input/path/a" ) );

  Pipe head = new Pipe( "a" );

  // both branches start from the very same head pipe instance
  Pipe firstBranch = new GroupBy( "a1", head, Fields.FIRST );
  Pipe secondBranch = new GroupBy( "a2", head, Fields.FIRST );

  Pipe tail = new GroupBy( "tail", Pipe.pipes( firstBranch, secondBranch ), new Fields( "first", "second" ) );

  tails.put( tail.getName(), new Hfs( new TextLine(), "output/path" ) );

  int stepCount = new FlowConnector().connect( heads, tails, tail ).getSteps().size();

  assertEquals( "not equal: steps.size()", 3, stepCount );
}
/**
 * Verifies that one tap instance can be bound to two logically different head pipes ("a" and
 * "b") whose branches are later merged. The planned flow must have 3 steps.
 *
 * @throws IOException
 */
public void testSameTaps() throws IOException
{
  Map heads = new HashMap();
  Map tails = new HashMap();

  // a single tap instance, registered under both head names
  Hfs sharedTap = new Hfs( new Fields( "first", "second" ), "input/path/a" );
  heads.put( "a", sharedTap );
  heads.put( "b", sharedTap );

  Pipe headA = new Pipe( "a" );
  Pipe headB = new Pipe( "b" );

  Pipe groupA = new GroupBy( headA );
  Pipe groupB = new GroupBy( headB );

  Pipe tail = new GroupBy( "tail", Pipe.pipes( groupA, groupB ), new Fields( "first", "second" ) );

  tails.put( tail.getName(), new Hfs( new TextLine(), "output/path" ) );

  Flow flow = new FlowConnector().connect( heads, tails, tail );

  // flow.writeDOT( "sametaps.dot" );
  int stepCount = flow.getSteps().size();

  assertEquals( "not equal: steps.size()", 3, stepCount );
}
/**
 * Verifies the planner rejects an assembly whose head pipe "b" has no source tap bound to it.
 * <p/>
 * Only a PlannerException is accepted; any other exception fails the test and is reported in
 * the failure message.
 */
public void testDanglingHead() throws IOException
{
  Map sources = new HashMap();
  Map sinks = new HashMap();

  // only "a" gets a tap; "b" is deliberately left dangling
  Hfs source = new Hfs( new Fields( "first", "second" ), "input/path/a" );
  sources.put( "a", source );

  Pipe pipeA = new Pipe( "a" );
  Pipe pipeB = new Pipe( "b" );

  Pipe group1 = new GroupBy( pipeA );
  Pipe group2 = new GroupBy( pipeB );

  Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

  sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

  try
  {
    new FlowConnector().connect( sources, sinks, merge );
    fail( "did not catch missing source tap" );
  }
  catch( PlannerException exception )
  {
    // expected: the planner noticed the unbound head
  }
  catch( Exception exception )
  {
    // name the unexpected exception so the failure can be diagnosed
    fail( "threw wrong exception: " + exception );
  }
}
/**
 * Verifies the planner rejects an assembly whose tail pipe ("tail") has no sink tap bound to it.
 * <p/>
 * Only a PlannerException is accepted; any other exception fails the test and is reported in
 * the failure message.
 */
public void testDanglingTail() throws IOException
{
  Map sources = new HashMap();
  Map sinks = new HashMap();

  Hfs tap = new Hfs( new Fields( "first", "second" ), "input/path/a" );
  sources.put( "a", tap );
  sources.put( "b", tap );

  Pipe pipeA = new Pipe( "a" );
  Pipe pipeB = new Pipe( "b" );

  Pipe group1 = new GroupBy( pipeA );
  Pipe group2 = new GroupBy( pipeB );

  Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

  // deliberately no sink for the tail, so the sinks map stays empty
  // sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

  try
  {
    new FlowConnector().connect( sources, sinks, merge );
    fail( "did not catch missing sink tap" );
  }
  catch( PlannerException exception )
  {
    // expected: the planner noticed the unbound tail
  }
  catch( Exception exception )
  {
    // name the unexpected exception so the failure can be diagnosed
    fail( "threw wrong exception: " + exception );
  }
}
/**
 * Verifies the planner rejects a source tap ("c") that is not bound to any head pipe, and that
 * the PlannerException message names it as {@code ['c']}.
 * <p/>
 * Any exception other than PlannerException fails the test and is reported in the failure message.
 */
public void testExtraSource() throws IOException
{
  Map sources = new HashMap();
  Map sinks = new HashMap();

  Hfs tap = new Hfs( new Fields( "first", "second" ), "input/path/a" );
  sources.put( "a", tap );
  sources.put( "b", tap );
  sources.put( "c", tap ); // no pipe named "c" exists in the assembly

  Pipe pipeA = new Pipe( "a" );
  Pipe pipeB = new Pipe( "b" );

  Pipe group1 = new GroupBy( pipeA );
  Pipe group2 = new GroupBy( pipeB );

  Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

  sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

  try
  {
    new FlowConnector().connect( sources, sinks, merge );
    fail( "did not catch extra source tap" );
  }
  catch( PlannerException exception )
  {
    // exception.printStackTrace();
    // a null message is reported as a failure instead of surfacing as a NullPointerException
    String message = exception.getMessage();
    assertTrue( "message should name the extra source ['c'], got: " + message, message != null && message.contains( "['c']" ) );
  }
  catch( Exception exception )
  {
    // name the unexpected exception so the failure can be diagnosed
    fail( "threw wrong exception: " + exception );
  }
}
/**
 * Verifies the planner rejects a sink tap ("c") that is not bound to any tail pipe, and that
 * the PlannerException message names it as {@code ['c']}.
 * <p/>
 * Any exception other than PlannerException fails the test and is reported in the failure message.
 */
public void testExtraSink() throws IOException
{
  Map sources = new HashMap();
  Map sinks = new HashMap();

  Hfs tap = new Hfs( new Fields( "first", "second" ), "input/path/a" );
  sources.put( "a", tap );
  sources.put( "b", tap );

  Pipe pipeA = new Pipe( "a" );
  Pipe pipeB = new Pipe( "b" );

  Pipe group1 = new GroupBy( pipeA );
  Pipe group2 = new GroupBy( pipeB );

  Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

  sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );
  sinks.put( "c", new Hfs( new TextLine(), "output/path" ) ); // no tail named "c" exists in the assembly

  try
  {
    new FlowConnector().connect( sources, sinks, merge );
    fail( "did not catch extra sink tap" );
  }
  catch( PlannerException exception )
  {
    // exception.printStackTrace();
    // a null message is reported as a failure instead of surfacing as a NullPointerException
    String message = exception.getMessage();
    assertTrue( "message should name the extra sink ['c'], got: " + message, message != null && message.contains( "['c']" ) );
  }
  catch( Exception exception )
  {
    // name the unexpected exception so the failure can be diagnosed
    fail( "threw wrong exception: " + exception );
  }
}
/**
 * Verifies that a GroupBy followed by a single Every applying a TestBuffer plans into exactly
 * one step with one source, a group and a sink, and checks the distances (as computed by
 * countDistance) from the source to the group (0) and from the group to the sink (1).
 */
public void testBuffer() throws IOException
{
  Map sources = new HashMap();
  Map sinks = new HashMap();

  sources.put( "count", new Hfs( new Fields( "first", "second" ), "input/path" ) );
  sinks.put( "count", new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Pipe pipe = new Pipe( "count" );
  pipe = new GroupBy( pipe, new Fields( 1 ) );
  pipe = new Every( pipe, new Fields( 1 ), new TestBuffer( new Fields( "fourth" ), "value" ), new Fields( 0, 1 ) );

  // typed list, as in the sibling tests, so no cast is needed below
  List<FlowStep> steps = new FlowConnector().connect( sources, sinks, pipe ).getSteps();

  assertEquals( "wrong size", 1, steps.size() );

  FlowStep step = steps.get( 0 );

  step.getJobConf(); // called to init the step

  assertEquals( "not equal: step.sources.size()", 1, step.sources.size() );
  assertNotNull( "null: step.groupBy", step.getGroup() );
  assertNotNull( "null: step.sink", step.sink );

  int mapDist = countDistance( step.graph, step.sources.keySet().iterator().next(), step.getGroup() );
  assertEquals( "not equal: mapDist", 0, mapDist );

  int reduceDist = countDistance( step.graph, step.getGroup(), step.sink );
  assertEquals( "not equal: reduceDist", 1, reduceDist );
}
/**
 * Expects planning to fail when an Every applying a TestBuffer is followed by another Every
 * (here a Count) on the same grouping. Any exception thrown by connect() satisfies the test.
 */
public void testBufferFail() throws IOException
{
  Map heads = new HashMap();
  Map tails = new HashMap();

  heads.put( "count", new Hfs( new Fields( "first", "second" ), "input/path" ) );
  tails.put( "count", new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Pipe head = new Pipe( "count" );
  Pipe grouped = new GroupBy( head, new Fields( 1 ) );

  // buffer first, aggregator second
  Pipe buffered = new Every( grouped, new Fields( 1 ), new TestBuffer( new Fields( "fourth" ), "value" ), new Fields( 0, 1 ) );
  Pipe tail = new Every( buffered, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

  try
  {
    new FlowConnector().connect( heads, tails, tail );
    fail( "did not throw planner exception" );
  }
  catch( Exception expected )
  {
    // ignore, a failure to plan is the outcome under test
    // expected.printStackTrace();
  }
}
/**
 * Expects planning to fail when an Every applying a TestBuffer follows another Every (here a
 * Count) on the same grouping. Any exception thrown by connect() satisfies the test.
 */
public void testBufferFail2() throws IOException
{
  Map heads = new HashMap();
  Map tails = new HashMap();

  heads.put( "count", new Hfs( new Fields( "first", "second" ), "input/path" ) );
  tails.put( "count", new Hfs( new Fields( 0, 1 ), "output/path" ) );

  Pipe head = new Pipe( "count" );
  Pipe grouped = new GroupBy( head, new Fields( 1 ) );

  // aggregator first, buffer second
  Pipe counted = new Every( grouped, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );
  Pipe tail = new Every( counted, new Fields( 1 ), new TestBuffer( new Fields( "fourth" ), "value" ), new Fields( 0, 1 ) );

  try
  {
    new FlowConnector().connect( heads, tails, tail );
    fail( "did not throw planner exception" );
  }
  catch( Exception expected )
  {
    // ignore, a failure to plan is the outcome under test
    // expected.printStackTrace();
  }
}
/**
 * Verifies that planning a CoGroup on a field that does not exist upstream ("num9") fails, and
 * that the exception message mentions this test method ("BuildJobsTest.testErrorMessages").
 * <p/>
 * NOTE(review): the method name presumably reaches the message through a trace recorded when
 * the pipes are constructed, so pipe construction is kept inline in this method -- confirm
 * before extracting helpers.
 *
 * @throws Exception if the test setup itself fails
 */
public void testErrorMessages() throws Exception
{
  Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" );
  Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" );

  Map sources = new HashMap();
  sources.put( "source20", source20 );
  sources.put( "source101", source10 );
  sources.put( "source102", source10 );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), "baz", true );

  Pipe pipeNum20 = new Pipe( "source20" );
  Pipe pipeNum101 = new Pipe( "source101" );
  Pipe pipeNum102 = new Pipe( "source102" );

  Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) );
  // "num9" is not among the fields declared by splice1, which is the error being provoked
  Pipe splice2 = new CoGroup( splice1, new Fields( "num9" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

  FlowConnector flowConnector = new FlowConnector();

  try
  {
    flowConnector.connect( sources, sink, splice2 );
    fail( "did not fail on bad field" );
  }
  catch( Exception exception )
  {
    // check for a null message first so it is reported as a failure rather than a NullPointerException
    String message = exception.getMessage();

    assertNotNull( "missing message", message );
    assertTrue( "missing message, got: " + message, message.contains( "BuildJobsTest.testErrorMessages" ) );
  }
}
/**
 * Verifies that splits on Pipe instances are recognized.
 * <p/>
 * This flow intentionally splits to an Each and a Tap from an Each:
 * <pre>
 * .... E1 - T1 - E2 - T2
 * </pre>
 * <p/>
 * This variant also verifies that T1 feeds E2, instead of a new copy job being created.
 *
 * @throws Exception
 */
public void testSplitInMiddleBeforePipeOptimized() throws Exception
{
  final boolean namedPipeBeforeEach = true;
  final boolean expectTempReplaced = true;

  splitMiddle( namedPipeBeforeEach, expectTempReplaced );
}
/**
 * Runs splitMiddle with the named "left"/"right" pipes placed before their Each filters and
 * without expecting the temporary tap to be replaced by the sink.
 *
 * @throws Exception
 */
public void testSplitInMiddleBeforePipe() throws Exception
{
  final boolean namedPipeBeforeEach = true;
  final boolean expectTempReplaced = false;

  splitMiddle( namedPipeBeforeEach, expectTempReplaced );
}
/**
 * Runs splitMiddle with the named "left"/"right" pipes placed after their Each filters and
 * without expecting the temporary tap to be replaced by the sink.
 *
 * @throws Exception
 */
public void testSplitInMiddleAfterPipe() throws Exception
{
  final boolean namedPipeBeforeEach = false;
  final boolean expectTempReplaced = false;

  splitMiddle( namedPipeBeforeEach, expectTempReplaced );
}
/**
 * Builds a CoGroup followed by a chain of filters that is split into a "left" tail and, further
 * down the same chain, a "right" tail, then inspects the first planned step.
 *
 * @param before           when true the named "left"/"right" Pipe wraps the input of each filter,
 *                         otherwise it wraps the filter itself
 * @param testTempReplaced when true SequenceFile sinks are used, 2 steps are expected and the
 *                         element after the second Each must equal the left sink; when false
 *                         TextLine sinks are used, 3 steps are expected and that element must
 *                         be a TempHfs
 */
private void splitMiddle( boolean before, boolean testTempReplaced )
{
  Tap lowerTap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "lower" );
  Tap upperTap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "upper" );

  Map heads = new HashMap();
  heads.put( "lower", lowerTap );
  heads.put( "upper", upperTap );

  Scheme leftScheme;

  if( testTempReplaced )
    leftScheme = new SequenceFile( new Fields( "num", "lower", "num2", "upper" ) );
  else
    leftScheme = new TextLine( new Fields( "offset", "line" ), new Fields( "lower" ) );

  Tap sinkLeft = new Hfs( leftScheme, "/splitmiddle/left", SinkMode.REPLACE );

  Scheme rightScheme;

  if( testTempReplaced )
    rightScheme = new SequenceFile( new Fields( "lower" ) );
  else
    rightScheme = new TextLine( new Fields( "offset", "line" ), new Fields( "lower" ) );

  Tap sinkRight = new Hfs( rightScheme, "/splitmiddle/right", SinkMode.REPLACE );

  Map tails = new HashMap();
  tails.put( "left", sinkLeft );
  tails.put( "right", sinkRight );

  Function lineSplitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  Pipe lowerBranch = new Each( new Pipe( "lower" ), new Fields( "line" ), lineSplitter );
  Pipe upperBranch = new Each( new Pipe( "upper" ), new Fields( "line" ), lineSplitter );

  Pipe joined = new CoGroup( "both", lowerBranch, new Fields( "num" ), upperBranch, new Fields( "num" ), new Fields( "num", "lower", "num2", "upper" ) );
  Pipe filtered = new Each( joined, new Fields( "num" ), new RegexFilter( ".*" ) );

  // the name is applied either to the filter's input or to the filter itself
  Pipe left;

  if( before )
    left = new Each( new Pipe( "left", filtered ), new Fields( "num" ), new RegexFilter( ".*" ) );
  else
    left = new Pipe( "left", new Each( filtered, new Fields( "num" ), new RegexFilter( ".*" ) ) );

  // the right tail continues from the left tail, which is what creates the split in the middle
  Pipe right;

  if( before )
    right = new Each( new Pipe( "right", left ), new Fields( "num" ), new RegexFilter( ".*" ) );
  else
    right = new Pipe( "right", new Each( left, new Fields( "num" ), new RegexFilter( ".*" ) ) );

  Flow flow = new FlowConnector().connect( "splitmiddle", heads, tails, left, right );

  // flow.writeDOT( "splitmiddle.dot" );
  // flow.writeStepsDOT( "splitmiddlesteps.dot" );

  List<FlowStep> plannedSteps = flow.getSteps();
  int expectedSteps = testTempReplaced ? 2 : 3;

  assertEquals( "not equal: steps.size()", expectedSteps, plannedSteps.size() );

  FlowStep firstStep = plannedSteps.get( 0 );

  // walk forward from the group: two Each elements, then the sink or a temporary tap
  FlowElement element = firstStep.getNextFlowElement( firstStep.getNextScope( firstStep.getGroup() ) );
  assertTrue( "not an Each", element instanceof Each );

  element = firstStep.getNextFlowElement( firstStep.getNextScope( element ) );
  assertTrue( "not a Each", element instanceof Each );

  element = firstStep.getNextFlowElement( firstStep.getNextScope( element ) );

  if( !testTempReplaced )
  {
    assertTrue( "not a TempHfs", element instanceof TempHfs );
    return;
  }

  assertEquals( "not proper sink", sinkLeft, element );
}
/**
 * Expects connect() to throw when the very same tap instance is passed as both the source and
 * the sink. Any exception satisfies the test.
 */
public void testSourceIsSink()
{
  Tap sourceAndSink = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Pipe head = new Pipe( "left" );

  try
  {
    new FlowConnector().connect( sourceAndSink, sourceAndSink, head );
    // flow.writeDOT( "dupesource.dot" );
    fail( "did not throw planner exception" );
  }
  catch( Exception expected )
  {
    // a failure to plan is the outcome under test
    // expected.printStackTrace();
  }
}
/**
 * Expects planning to fail for a chain of Each pipes that use Fields.REPLACE, the last of which
 * declares "line2" as the result for the argument field "line". Any exception satisfies the test.
 *
 * @throws Exception if the test setup itself fails
 */
public void testReplaceFail() throws Exception
{
  Tap input = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" );
  Tap output = new Hfs( new TextLine( new Fields( "offset", "line" ), new Fields( "offset", "line2" ) ), "bar", true );

  Function firstToken = new RegexParser( new Fields( 0 ), "^[^ ]*" );

  Pipe head = new Pipe( "test" );
  Pipe parsed = new Each( head, new Fields( "line" ), firstToken, Fields.REPLACE );
  Pipe copied = new Each( parsed, new Fields( "line" ), new Identity( Fields.ARGS ), Fields.REPLACE );
  Pipe tail = new Each( copied, new Fields( "line" ), new Identity( new Fields( "line2" ) ), Fields.REPLACE );

  try
  {
    new FlowConnector().connect( input, output, tail );
    fail( "did not fail" );
  }
  catch( Exception expected )
  {
    // a failure to plan is the outcome under test
  }
}
/**
 * Returns the size of the path that DijkstraShortestPath.findPathBetween reports from
 * {@code lhs} to {@code rhs} in the given graph, minus one.
 *
 * @param graph the step graph to search
 * @param lhs   the element the path starts at
 * @param rhs   the element the path ends at
 * @return the path size reduced by one
 */
private int countDistance( SimpleDirectedGraph<FlowElement, Scope> graph, FlowElement lhs, FlowElement rhs )
{
  int pathSize = DijkstraShortestPath.findPathBetween( graph, lhs, rhs ).size();

  return pathSize - 1;
}
/**
 * Verifies that a property that exists only in the defaults of a nested Properties object is
 * visible both through Flow.getProperty and in the JobConf of the first step.
 */
public void testNestedProperties() throws IOException
{
  Tap input = new Hfs( new TextLine( new Fields( "line" ) ), "/input" );

  Pipe head = new Pipe( "test" );
  Pipe tail = new Each( head, new RegexSplitter( new Fields( "first", "second", "third" ), "\\s" ), Fields.ALL );

  Tap output = new Hfs( new TextLine(), "output", true );

  // the key is set on the defaults only, never on the Properties handed to the connector
  Properties defaults = new Properties();
  defaults.setProperty( "test.key", "test.value" );

  Properties nested = new Properties( defaults );

  Flow flow = new FlowConnector( nested ).connect( input, output, tail );

  assertEquals( "test flow", "test.value", flow.getProperty( "test.key" ) );

  FlowStep firstStep = flow.getSteps().get( 0 );

  assertEquals( "test step", "test.value", firstStep.getJobConf( flow.getJobConf() ).get( "test.key" ) );
}
}