/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.ga.watchmaker.cd.hadoop;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.StringUtils;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.ga.watchmaker.OutputUtils;
import org.apache.mahout.ga.watchmaker.cd.CDFitness;
import org.apache.mahout.ga.watchmaker.cd.DataSet;
import org.apache.mahout.ga.watchmaker.cd.FileInfoParser;
import org.apache.mahout.ga.watchmaker.cd.Rule;
import org.apache.mahout.ga.watchmaker.cd.hadoop.DatasetSplit.DatasetTextInputFormat;
/**
 * Mahout distributed evaluator. Takes a list of classification rules and an
 * input path, launches a Hadoop job to evaluate the fitness of each rule and,
 * once the job has completed, loads the evaluations from the job output.
 */
public final class CDMahoutEvaluator {

  private CDMahoutEvaluator() {
  }

  /**
   * Uses Mahout to evaluate the classification rules using the given evaluator.
   * The input path contains the dataset.
   *
   * @param rules classification rules to evaluate
   * @param target label value to evaluate the rules for
   * @param inpath input path (the dataset); must be an existing directory
   * @param output output path of the Hadoop job; the evaluations are read back from it
   * @param evaluations collection that receives the evaluated fitness for each
   *        candidate from the input population, sorted in the same order as
   *        the candidates.
   * @param split DatasetSplit used to separate training and testing input
   * @throws IllegalArgumentException if {@code inpath} does not exist or is not a directory
   * @throws IllegalStateException if the Hadoop job reports that it did not succeed
   */
  public static void evaluate(List<? extends Rule> rules,
                              int target,
                              Path inpath,
                              Path output,
                              Collection<CDFitness> evaluations,
                              DatasetSplit split) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(inpath.toUri(), conf);
    Preconditions.checkArgument(fs.exists(inpath) && fs.getFileStatus(inpath).isDir(), "%s is not a directory", inpath);

    Job job = new Job(conf);
    configureJob(job, rules, target, inpath, output, split);

    // Do not try to read the output of a failed job: it may be missing or incomplete.
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }

    importEvaluations(fs, conf, output, evaluations);
  }

  /**
   * Initializes the dataset from the information parsed out of the input path.
   *
   * @param inpath input path (the dataset)
   */
  public static void initializeDataSet(Path inpath) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(inpath.toUri(), conf);

    // Initialize the dataset
    DataSet.initialize(FileInfoParser.parseFile(fs, inpath));
  }

  /**
   * Evaluate a single rule.
   *
   * @param rule classification rule to evaluate
   * @param target label value to evaluate the rules for
   * @param inpath input path (the dataset)
   * @param output output path of the Hadoop job
   * @param split DatasetSplit used to separate training and testing input
   * @return the evaluation
   */
  public static CDFitness evaluate(Rule rule, int target, Path inpath, Path output, DatasetSplit split)
    throws IOException, InterruptedException, ClassNotFoundException {
    List<CDFitness> evals = Lists.newArrayList();
    evaluate(Arrays.asList(rule), target, inpath, output, evals, split);
    // a single rule was submitted, so its fitness is the first entry read back
    return evals.get(0);
  }

  /**
   * Use all the dataset for training.
   *
   * @param rules classification rules to evaluate
   * @param target label value to evaluate the rules for
   * @param inpath input path (the dataset)
   * @param output output path of the Hadoop job
   * @param evaluations collection that receives the evaluated fitness for each
   *        candidate from the input population, sorted in the same order as
   *        the candidates.
   */
  public static void evaluate(List<? extends Rule> rules,
                              int target,
                              Path inpath,
                              Path output,
                              Collection<CDFitness> evaluations)
    throws IOException, InterruptedException, ClassNotFoundException {
    evaluate(rules, target, inpath, output, evaluations, new DatasetSplit(1));
  }

  /**
   * Configure the job: input/output paths, mapper, combiner, reducer, formats
   * and the parameters passed to the mappers through the job configuration.
   *
   * @param job Job to configure
   * @param rules classification rules to evaluate
   * @param target label value to evaluate the rules for
   * @param inpath input path (the dataset)
   * @param outpath output {@code Path}
   * @param split DatasetSplit used to separate training and testing input
   */
  private static void configureJob(Job job,
                                   List<? extends Rule> rules,
                                   int target,
                                   Path inpath,
                                   Path outpath,
                                   DatasetSplit split) throws IOException {
    Configuration conf = job.getConfiguration();
    split.storeJobParameters(conf);

    FileInputFormat.setInputPaths(job, inpath);
    FileOutputFormat.setOutputPath(job, outpath);

    job.setJarByClass(CDMahoutEvaluator.class);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(CDFitness.class);

    job.setMapperClass(CDMapper.class);
    // the same class is used as combiner and as reducer
    job.setCombinerClass(CDReducer.class);
    job.setReducerClass(CDReducer.class);

    job.setInputFormatClass(DatasetTextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // store the parameters
    conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
    // NOTE(review): presumably the DataSet must have been initialized beforehand
    // (e.g. through initializeDataSet) - confirm against callers.
    conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
    conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
  }

  /**
   * Reads back the evaluations. The job output files are first merged into a
   * single {@code output.sorted} file under {@code outpath}, whose values are
   * then appended to {@code evaluations}.
   *
   * @param fs File System
   * @param conf Job configuration
   * @param outpath output {@code Path}
   * @param evaluations collection that receives the evaluated fitness for each
   *        candidate from the input population, sorted in the same order as
   *        the candidates.
   */
  private static void importEvaluations(FileSystem fs,
                                        Configuration conf,
                                        Path outpath,
                                        Collection<CDFitness> evaluations)
    throws IOException {
    SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, LongWritable.class, CDFitness.class, conf);

    // merge and sort the outputs
    Path[] outfiles = OutputUtils.listOutputFiles(fs, outpath);
    Path output = new Path(outpath, "output.sorted");
    sorter.merge(outfiles, output);

    // import the evaluations
    for (CDFitness value : new SequenceFileValueIterable<CDFitness>(output, conf)) {
      evaluations.add(value);
    }
  }
}