package com.skp.experiment.cf.evaluate.hadoop;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.common.iterator.FileLineIterator;
public class EvaluatorUtil {
public static Integer RECORD_COUNT_SUM_INDEX = -1;
/**
* iterate files under output path and sums over each column indexs value as double .
* number of record count is will be on -1 column
* @param conf
* @param output
* @param indexs
* @return
* @throws IOException
*/
public static Map<Integer, Double> getResultSumPerColumns(Configuration conf, Path output, List<Integer> indexs, boolean exclude)
throws IOException {
FileSystem fs = FileSystem.get(conf);
FileStatus[] files = fs.globStatus(new Path(output.toString() + "/^[^_]*"));
Map<Integer, Double> stats = new HashMap<Integer, Double>();
long totalCount = 0;
for (FileStatus file : files) {
FileLineIterator iter = new FileLineIterator(fs.open(file.getPath()));
while (iter.hasNext()) {
totalCount++;
String[] tokens = TasteHadoopUtils.splitPrefTokens(iter.next());
if (exclude) {
for (int idx = 0; idx < tokens.length; idx++) {
if (indexs.contains(idx)) continue;
if (!stats.containsKey(idx)) {
stats.put(idx, 0.0);
}
stats.put(idx, stats.get(idx) + Double.parseDouble(tokens[idx]));
}
} else {
for (Integer idx : indexs) {
if (idx >= 0 && idx < tokens.length) {
if (!stats.containsKey(idx)) {
stats.put(idx, 0.0);
}
stats.put(idx, stats.get(idx) + Double.parseDouble(tokens[idx]));
}
}
}
}
}
stats.put(RECORD_COUNT_SUM_INDEX, (double) totalCount);
return stats;
}
}