/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.assembly; import java.util.HashMap; import java.util.Map; import cascading.flow.FlowProcess; import cascading.operation.AggregatorCall; import cascading.pipe.Pipe; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; /** * Computes the Pearson distance between every unique set of first fields, using the label and value of each element. 
* <p/>
* Expects on input three values: item, label, value
* <p/>
* The single-argument constructor selects arguments with {@code Fields.size( 3 )} and declares the
* result fields "n1", "n2" and "pearson".
*/
public class PearsonDistance extends CrossTab {
  /**
   * Creates the assembly with the default argument selector and the default result fields
   * "n1", "n2" and "pearson".
   *
   * @param previous the pipe this assembly is attached to
   */
  public PearsonDistance( Pipe previous ) {
    this( previous, Fields.size( 3 ), new Fields( "n1", "n2", "pearson" ) );
  }

  /**
   * Creates the assembly with an explicit argument selector and result declaration.
   *
   * @param previous              the pipe this assembly is attached to
   * @param argumentFieldSelector the fields handed to the parent CrossTab as arguments
   * @param fieldDeclaration      the fields declared for the result
   */
  public PearsonDistance( Pipe previous, Fields argumentFieldSelector, Fields fieldDeclaration ) {
    super( previous, argumentFieldSelector, new Pearson(), fieldDeclaration );
  }

  /**
   * Aggregator that accumulates running sums over pairs of doubles and, on completion, emits
   * <pre>
   *   ( sum(xy) - sum(x) * sum(y) / n ) / sqrt( ( sum(x^2) - sum(x)^2 / n ) * ( sum(y^2) - sum(y)^2 / n ) )
   * </pre>
   * as a single value declared as "pearson". The accumulators live in a map kept as the
   * aggregator context, keyed by the constants below.
   */
  private static class Pearson extends CrossTabOperation<Map<String, Double>> {
    private static final String COUNT = "count";
    private static final String SUM1 = "sum1";
    private static final String SUM2 = "sum2";
    private static final String SUMSQRS1 = "sumsqrs1";
    private static final String SUMSQRS2 = "sumsqrs2";
    private static final String SUMPROD = "sumprod";

    public Pearson() {
      super( new Fields( "pearson" ) );
    }

    /**
     * Prepares the context for a new grouping: creates the map if none is set yet and resets
     * every accumulator to zero, so an existing map is reused rather than reallocated.
     */
    public void start( FlowProcess flowProcess, AggregatorCall<Map<String, Double>> aggregatorCall ) {
      if( aggregatorCall.getContext() == null ) {
        aggregatorCall.setContext( new HashMap<String, Double>() );
      }

      Map<String, Double> context = aggregatorCall.getContext();

      context.put( COUNT, 0d );
      context.put( SUM1, 0d );
      context.put( SUM2, 0d );
      context.put( SUMSQRS1, 0d );
      context.put( SUMSQRS2, 0d );
      context.put( SUMPROD, 0d );
    }

    /**
     * Folds one argument tuple into the accumulators. Positions 0 and 1 of the argument tuple
     * are read as doubles.
     * NOTE(review): presumably these are the two values being compared for one label, as
     * supplied by CrossTab - confirm against the parent class.
     */
    public void aggregate( FlowProcess flowProcess, AggregatorCall<Map<String, Double>> aggregatorCall ) {
      Map<String, Double> context = aggregatorCall.getContext();
      TupleEntry entry = aggregatorCall.getArguments();

      // Read each argument once instead of re-fetching it for every accumulator.
      double first = entry.getTuple().getDouble( 0 );
      double second = entry.getTuple().getDouble( 1 );

      context.put( COUNT, context.get( COUNT ) + 1d );
      context.put( SUM1, context.get( SUM1 ) + first );
      context.put( SUM2, context.get( SUM2 ) + second );
      context.put( SUMSQRS1, context.get( SUMSQRS1 ) + Math.pow( first, 2 ) );
      context.put( SUMSQRS2, context.get( SUMSQRS2 ) + Math.pow( second, 2 ) );
      context.put( SUMPROD, context.get( SUMPROD ) + ( first * second ) );
    }

    /**
     * Emits the coefficient for the current grouping as a single double.
     * <p/>
     * When either series has no spread the coefficient is undefined and 0.0 is emitted instead.
     * The result is always a double, so the output field has one type for every grouping.
     */
    public void complete( FlowProcess flowProcess, AggregatorCall<Map<String, Double>> aggregatorCall ) {
      Map<String, Double> context = aggregatorCall.getContext();

      double count = context.get( COUNT );
      double sum1 = context.get( SUM1 );
      double sum2 = context.get( SUM2 );

      double num = context.get( SUMPROD ) - ( sum1 * sum2 / count );

      // Sum of squared deviations of each series, in its one-pass form.
      double deviation1 = context.get( SUMSQRS1 ) - Math.pow( sum1, 2 ) / count;
      double deviation2 = context.get( SUMSQRS2 ) - Math.pow( sum2, 2 ) / count;
      double den = Math.sqrt( deviation1 * deviation2 );

      // Rounding in the one-pass form can leave a tiny negative term for constant input. Testing
      // each term separately keeps that case from becoming sqrt(negative) = NaN, or a bogus
      // ratio when both terms are negative. NaN terms (e.g. NaN input) still propagate as NaN.
      if( deviation1 <= 0 || deviation2 <= 0 || den == 0 ) {
        aggregatorCall.getOutputCollector().add( new Tuple( 0d ) );
        return;
      }

      aggregatorCall.getOutputCollector().add( new Tuple( num / den ) );
    }
  }
}