/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.stats.entropy;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.math.VarIntWritable;
import java.io.IOException;
import java.util.Map;
/**
* A Hadoop job to compute the entropy of keys or values in a {@link SequenceFile}. Format has to be {@link Text} for
* key or value.
* <p/>
* <ul>
* <li>-i The input sequence file</li>
* <li>-o The output sequence file</li>
* <li>-s The source. Can be \<key\> or \<value\>. Default is \<key\></li>
* </ul>
*/
public final class Entropy extends AbstractJob {
private Path tempPath;
private long numberItems;
private String source;
private static final String TEMP_FILE = "temp";
static final String NUMBER_ITEMS_PARAM = "number.items";
public static void main(String[] args) throws Exception {
ToolRunner.run(new Entropy(), args);
}
/**
* Returns the number of elements in the file. Only works after run.
*
* @return The number of processed items
*/
public long getNumberItems() {
return numberItems;
}
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
prepareArguments(args);
groupAndCount();
calculateEntropy();
return 1;
}
/**
* Prepares and sets the arguments.
*
* @param args
* @throws IOException
*/
private void prepareArguments(String[] args) throws IOException {
addInputOption();
addOutputOption();
addOption("source", "s", "Sets, if the entropy is calculated for the keys or the values. Can be <key> or <value>"
, "key");
Map<String, String> arguments = parseArguments(args);
source = arguments.get("--source");
tempPath = new Path(getTempPath(), TEMP_FILE + '-' + System.currentTimeMillis());
}
/**
* Groups the items and counts the occur for each of them.
* SQL-like: SELECT item, COUNT(*) FROM x GROUP BY item
*
* @throws IOException
* @throws ClassNotFoundException
* @throws InterruptedException
*/
private void groupAndCount() throws IOException, ClassNotFoundException, InterruptedException {
Class<? extends Mapper> mapper = "key".equals(source) ? KeyCounterMapper.class : ValueCounterMapper.class;
Job job = prepareJob(getInputPath(), tempPath, SequenceFileInputFormat.class, mapper, Text.class,
VarIntWritable.class, VarIntSumReducer.class, Text.class, VarIntWritable.class,
SequenceFileOutputFormat.class);
job.setCombinerClass(VarIntSumReducer.class);
job.waitForCompletion(true);
numberItems =
job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();
}
/**
* Calculates the entropy with
* <p/>
* H(X) = -sum_i(x_i/n * log_2(x_i/n)) WITH n = sum_i(x_i)
* = -sum_i(x_i/n * (log_2(x_i) - log_2(n)))
* = -sum_i(x_i/n * log_2(x_i)) + sum_i(x_i/n * log_2(n))
* = (n * log_2(n) - sum_i(x_i * log_2(x_i)) / n
* = log_2(n) - sum_i(x_i * log_2(x_i)) / n
* = (log(n) - sum_i(x_i * log(x_i)) / n) / log(2)
*/
private void calculateEntropy() throws IOException, ClassNotFoundException, InterruptedException {
Job job = prepareJob(tempPath, getOutputPath(), SequenceFileInputFormat.class, CalculateEntropyMapper.class,
NullWritable.class, DoubleWritable.class, CalculateEntropyReducer.class, NullWritable.class,
DoubleWritable.class, SequenceFileOutputFormat.class);
job.getConfiguration().set(NUMBER_ITEMS_PARAM, String.valueOf(numberItems));
job.setCombinerClass(DoubleSumReducer.class);
job.waitForCompletion(true);
}
}