/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.math.stats.entropy; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.StringTuple; import org.apache.mahout.math.VarIntWritable; import java.io.IOException; /** * A Hadoop job to compute the conditional entropy H(Value|Key) for a sequence file. * <ul> * <li>-i The input sequence file</li> * <li>-o The output sequence file</li> * </ul> */ public final class ConditionalEntropy extends AbstractJob { private long numberItems; private Path keyValueCountPath; private Path specificConditionalEntropyPath; private static final String KEY_VALUE_COUNT_FILE = "key_value_count"; private static final String SPECIFIC_CONDITIONAL_ENTROPY_FILE = "specific_conditional_entropy"; static final String NUMBER_ITEMS_PARAM = "items.number"; public static void main(String[] args) throws Exception { ToolRunner.run(new Entropy(), args); } @Override public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException { prepareArguments(args); groupAndCountByKeyAndValue(); calculateSpecificConditionalEntropy(); calculateConditionalEntropy(); return 0; } /** * Prepares and sets the arguments. */ private void prepareArguments(String[] args) throws IOException { addInputOption(); addOutputOption(); parseArguments(args); keyValueCountPath = new Path(getTempPath(), KEY_VALUE_COUNT_FILE + '-' + System.currentTimeMillis()); specificConditionalEntropyPath = new Path(getTempPath(), SPECIFIC_CONDITIONAL_ENTROPY_FILE + '_' + System.currentTimeMillis()); } /** * Groups and counts by key and value. * SQL-like: SELECT key, value, COUNT(*) FROM x GROUP BY key, value */ private void groupAndCountByKeyAndValue() throws IOException, ClassNotFoundException, InterruptedException { Job job = prepareJob(getInputPath(), keyValueCountPath, SequenceFileInputFormat.class, GroupAndCountByKeyAndValueMapper.class, StringTuple.class, VarIntWritable.class, VarIntSumReducer.class, StringTuple.class, VarIntWritable.class, SequenceFileOutputFormat.class); job.setCombinerClass(VarIntSumReducer.class); job.waitForCompletion(true); numberItems = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue(); } /** * Calculates the specific conditional entropy which is H(Y|X). * Needs the number of all items for normalizing. */ private void calculateSpecificConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException { Job job = prepareJob(keyValueCountPath, specificConditionalEntropyPath, SequenceFileInputFormat.class, SpecificConditionalEntropyMapper.class, Text.class, VarIntWritable.class, SpecificConditionalEntropyReducer.class, Text.class, DoubleWritable.class, SequenceFileOutputFormat.class); job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems)); job.waitForCompletion(true); } /** * Sums the calculated specific conditional entropy. Output is in the value. */ private void calculateConditionalEntropy() throws IOException, ClassNotFoundException, InterruptedException { Job job = prepareJob(specificConditionalEntropyPath, getOutputPath(), SequenceFileInputFormat.class, CalculateSpecificConditionalEntropyMapper.class, NullWritable.class, DoubleWritable.class, DoubleSumReducer.class, NullWritable.class, DoubleWritable.class, SequenceFileOutputFormat.class); job.setCombinerClass(DoubleSumReducer.class); job.waitForCompletion(true); } }