/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading; import java.io.File; import java.io.Serializable; import cascading.assembly.EuclideanDistance; import cascading.assembly.PearsonDistance; import cascading.assembly.SortElements; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.flow.FlowProcess; import cascading.operation.AggregatorCall; import cascading.operation.Function; import cascading.operation.FunctionCall; import cascading.operation.Identity; import cascading.operation.aggregator.First; import cascading.operation.aggregator.Sum; import cascading.operation.function.UnGroup; import cascading.operation.regex.RegexFilter; import cascading.operation.regex.RegexSplitter; import cascading.pipe.CoGroup; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.scheme.TextLine; import cascading.tap.Hfs; import cascading.tap.Tap; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryIterator; public class DistanceUseCaseTest extends ClusterTestCase implements Serializable { String inputFileCritics = "build/test/data/critics.txt"; String outputPathEuclidean = "build/test/output/euclidean/"; String outputPathPearson = "build/test/output/pearson/"; public DistanceUseCaseTest() { super( "distance", false ); } /** * Calculate the euclidean distance of the people in the critics.txt file * * @throws java.io.IOException */ public void testEuclideanDistance() throws Exception { if( !new File( inputFileCritics ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileCritics ); Tap source = new Hfs( new TextLine(), inputFileCritics ); Tap sink = new Hfs( new TextLine(), outputPathEuclidean + "/long", true ); Pipe pipe = new Pipe( "euclidean" ); // unknown number of elements pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( "\t" ) ); // break not names and movies pipe = new Each( pipe, new UnGroup( new Fields( "name", "movie", "rate" ), new Fields( 0 ), 2 ) ); // name and rate against others of same movie pipe = new CoGroup( pipe, new Fields( "movie" ), 1, new Fields( "name1", "movie", "rate1", "name2", "movie2", "rate2" ) ); // remove useless fields pipe = new Each( pipe, new Fields( "movie", "name1", "rate1", "name2", "rate2" ), new Identity() ); // remove lines if the names are the same pipe = new Each( pipe, new RegexFilter( "^[^\\t]*\\t([^\\t]*)\\t[^\\t]*\\t\\1\\t.*", true ) ); // transpose values in fields by natural sort order pipe = new Each( pipe, new SortElements( new Fields( "name1", "rate1" ), new Fields( "name2", "rate2" ) ) ); // unique the pipe pipe = new GroupBy( pipe, Fields.ALL ); pipe = new Every( pipe, Fields.ALL, new First(), Fields.RESULTS ); // calculate square of diff Function sqDiff = new Identity( new Fields( "score" ) ) { public void operate( FlowProcess flowProcess, FunctionCall functionCall ) { TupleEntry input = functionCall.getArguments(); functionCall.getOutputCollector().add( new Tuple( Math.pow( input.getTuple().getDouble( 0 ) - input.getTuple().getDouble( 1 ), 2 ) ) ); } }; // out: movie, name1, rate1, name2, rate2, score pipe = new Each( pipe, new Fields( "rate1", "rate2" ), sqDiff, Fields.ALL ); // sum and sqr for each name pair pipe = new GroupBy( pipe, new Fields( "name1", "name2" ) ); Sum distance = new Sum( new Fields( "distance" ) ) { public void complete( FlowProcess flowProcess, AggregatorCall aggregatorCall ) { Tuple tuple = super.getResult( aggregatorCall ); aggregatorCall.getOutputCollector().add( new Tuple( 1 / ( 1 + tuple.getDouble( 0 ) ) ) ); } }; pipe = new Every( pipe, new Fields( "score" ), distance, new Fields( "name1", "name2", "distance" ) ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); // flow.writeDOT( "graph.dot" ); flow.complete(); validateLength( flow, 21 ); TupleEntryIterator iterator = flow.openSink(); boolean found = false; while( iterator.hasNext() ) { if( iterator.next().get( 1 ).equals( "GeneSeymour\tLisaRose\t0.14814814814814814" ) ) { found = true; break; } } assertTrue( "did not calculate score", found ); } /** * Calculate the euclidean distance of the people in the critics.txt file * * @throws java.io.IOException */ public void testEuclideanDistanceShort() throws Exception { if( !new File( inputFileCritics ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileCritics ); Tap source = new Hfs( new TextLine(), inputFileCritics ); Tap sink = new Hfs( new TextLine(), outputPathEuclidean + "/short", true ); // unknown number of elements Pipe pipe = new Each( "euclidean", new Fields( "line" ), new RegexSplitter( "\t" ) ); // break not names and movies pipe = new Each( pipe, new UnGroup( new Fields( "name", "movie", "rate" ), Fields.FIRST, 2 ) ); // name and rate against others of same movie pipe = new CoGroup( pipe, new Fields( "movie" ), 1, new Fields( "name1", "movie", "rate1", "name2", "movie2", "rate2" ) ); // remove useless fields pipe = new Each( pipe, new Fields( "movie", "name1", "rate1", "name2", "rate2" ), new Identity() ); // remove lines if the names are the same pipe = new Each( pipe, new RegexFilter( "^[^\\t]*\\t([^\\t]*)\\t[^\\t]*\\t\\1\\t.*", true ) ); // transpose values in fields by natural sort order pipe = new Each( pipe, new SortElements( new Fields( "name1", "rate1" ), new Fields( "name2", "rate2" ) ) ); // unique the pipe pipe = new GroupBy( pipe, Fields.ALL ); pipe = new Every( pipe, Fields.ALL, new First(), Fields.RESULTS ); // calculate square of diff Function sqDiff = new Identity( new Fields( "score" ) ) { public void operate( FlowProcess flowProcess, FunctionCall functionCall ) { TupleEntry input = functionCall.getArguments(); functionCall.getOutputCollector().add( new Tuple( Math.pow( input.getTuple().getDouble( 0 ) - input.getTuple().getDouble( 1 ), 2 ) ) ); } }; // out: movie, name1, rate1, name2, rate2, score pipe = new Each( pipe, new Fields( "rate1", "rate2" ), sqDiff, Fields.ALL ); // sum and sqr for each name pair pipe = new GroupBy( pipe, new Fields( "name1", "name2" ) ); Sum distance = new Sum( new Fields( "distance" ) ) { public void complete( FlowProcess flowProcess, AggregatorCall aggregatorCall ) { Tuple tuple = super.getResult( aggregatorCall ); aggregatorCall.getOutputCollector().add( new Tuple( 1 / ( 1 + tuple.getDouble( 0 ) ) ) ); } }; pipe = new Every( pipe, new Fields( "score" ), distance, new Fields( "name1", "name2", "distance" ) ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); // flow.writeDOT( "graph.dot" ); flow.complete(); validateLength( flow, 21 ); TupleEntryIterator iterator = flow.openSink(); boolean found = false; while( iterator.hasNext() ) { if( iterator.next().get( 1 ).equals( "GeneSeymour\tLisaRose\t0.14814814814814814" ) ) { found = true; break; } } assertTrue( "did not calculate score", found ); } public void testEuclideanDistanceComposite() throws Exception { if( !new File( inputFileCritics ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileCritics ); Tap source = new Hfs( new TextLine(), inputFileCritics ); Tap sink = new Hfs( new TextLine(), outputPathEuclidean + "/composite", true ); // unknown number of elements Pipe pipe = new Each( "euclidean", new Fields( "line" ), new RegexSplitter( "\t" ) ); // break not names and movies pipe = new Each( pipe, new UnGroup( new Fields( "name", "movie", "rate" ), Fields.FIRST, 2 ) ); // name and rate against others of same movie pipe = new EuclideanDistance( pipe, new Fields( "name", "movie", "rate" ), new Fields( "name1", "name2", "distance" ) ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); // flow.writeDOT( "eucdist.dot" ); flow.complete(); validateLength( flow, 21 ); TupleEntryIterator iterator = flow.openSink(); boolean found = false; while( iterator.hasNext() ) { if( iterator.next().get( 1 ).equals( "GeneSeymour\tLisaRose\t0.14814814814814814" ) ) { found = true; break; } } assertTrue( "did not calculate score", found ); } public void testPearsonDistanceComposite() throws Exception { if( !new File( inputFileCritics ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileCritics ); Tap source = new Hfs( new TextLine(), inputFileCritics ); Tap sink = new Hfs( new TextLine(), outputPathPearson + "/composite", true ); // unknown number of elements Pipe pipe = new Each( "pearson", new Fields( "line" ), new RegexSplitter( "\t" ) ); // break not names and movies pipe = new Each( pipe, new UnGroup( new Fields( "name", "movie", "rate" ), Fields.FIRST, 2 ) ); // name and rate against others of same movie pipe = new PearsonDistance( pipe, new Fields( "name", "movie", "rate" ), new Fields( "name1", "name2", "distance" ) ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); // flow.writeDOT( "peardist.dot" ); flow.complete(); validateLength( flow, 21 ); TupleEntryIterator iterator = flow.openSink(); boolean found = false; while( iterator.hasNext() ) { if( iterator.next().get( 1 ).equals( "GeneSeymour\tLisaRose\t0.39605901719066977" ) ) { found = true; break; } } assertTrue( "did not calculate score", found ); } }