/* * chombo: Hadoop Map Reduce utility * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.chombo.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Scanner; import org.apache.hadoop.conf.Configuration; /** * Loads numerical attribute stats from HDFS file and provides access methods * @author pranab * */ public class NumericalAttrStatsManager { private Map<Integer, List<Tuple>> stats = new HashMap<Integer, List<Tuple>>(); private Map<String, Map<Integer, List<Tuple>>> keyedStats = new HashMap<String, Map<Integer, List<Tuple>>>(); private static final String DEF_COND_ATTR_VAL = "$"; /** * Stats for data * @param config * @param statsFilePath * @param delim * @throws IOException */ public NumericalAttrStatsManager(Configuration config, String statsFilePathParam, String delim) throws IOException { InputStream fs = Utility.getFileStream(config, statsFilePathParam); initialize(fs, delim); } /** * @param statsFilePath * @param delim * @param fromFilePath * @throws IOException */ public NumericalAttrStatsManager(String statsFilePath, String delim, boolean fromFilePath) throws IOException { InputStream fs = Utility.getFileStream(statsFilePath); initialize(fs, delim); } /** * @param fs * @param delim * @throws NumberFormatException * @throws IOException */ private void initialize(InputStream fs, String delim) throws NumberFormatException, IOException { BufferedReader reader = new BufferedReader(new InputStreamReader(fs)); String line = null; String[] items = null; //(0)attr ord (1)cond attr (2)sum (3)sum square (4)count (5)mean (6)variance (7)std dev (8)min (9)max while((line = reader.readLine()) != null) { items = line.split(delim); Tuple tuple = new Tuple(); int i = 0; Integer attr = Integer.parseInt(items[0]); tuple.add(Tuple.STRING, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.INT, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); List<Tuple> statList = stats.get(attr); if (null == statList ) { statList = new ArrayList<Tuple>(); stats.put(attr, statList ); } statList.add( tuple); } } /** * @param statsContent * @param delim * @throws IOException */ public NumericalAttrStatsManager( String statsContent, String delim) throws IOException { String line = null; String[] items = null; Scanner scanner = new Scanner(statsContent); while (scanner.hasNextLine()) { line = scanner.nextLine(); items = line.split(delim); Tuple tuple = new Tuple(); int i = 0; Integer attr = Integer.parseInt(items[0]); tuple.add(Tuple.STRING, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.INT, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); List<Tuple> statList = stats.get(attr); if (null == statList ) { statList = new ArrayList<Tuple>(); stats.put(attr, statList ); } statList.add( tuple); } } /** * Stats for keyed data * @param config * @param statsFilePath * @param delim * @param idOrdinals * @throws IOException */ public NumericalAttrStatsManager(Configuration config, String statsFilePath, String delim, int[] idOrdinals) throws IOException { InputStream fs = Utility.getFileStream(config, statsFilePath); BufferedReader reader = new BufferedReader(new InputStreamReader(fs)); String line = null; String[] items = null; //(0)attr ord (1)cond attr (2)sum (3)sum square (4)count (5)mean (6)variance (7)std dev (8)min (9)max while((line = reader.readLine()) != null) { items = line.split(delim); Tuple tuple = new Tuple(); int i = 0; String compKey = Utility.join(items, 0, idOrdinals.length); i += idOrdinals.length; Integer attr = Integer.parseInt(items[i++]); tuple.add(Tuple.STRING, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.INT, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); tuple.add(Tuple.DOUBLE, items[i++]); //add to map Map<Integer, List<Tuple>> stats = keyedStats.get(compKey); if (null == stats) { stats = new HashMap<Integer, List<Tuple>>(); keyedStats.put(compKey, stats); } List<Tuple> statList = stats.get(attr); if (null == statList ) { statList = new ArrayList<Tuple>(); stats.put(attr, statList ); } statList.add( tuple); } } /** * @param attr * @param condAttrVal * @return */ private Tuple getStats(int attr, String condAttrVal) { Tuple foundTuple = null; //for all cond attribute values List<Tuple> statList = stats.get(attr); //search by cond attribute for (Tuple tuple : statList) { if (tuple.getString(0).equals(condAttrVal)) { foundTuple = tuple; break; } } return foundTuple; } /** * @param attr * @return */ public double getSum(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(1); } /** * @param attr * @return */ public double getSumSq(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(2); } /** * @param attr * @return */ public int getCount(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getInt(3); } /** * @param attr * @return */ public double getMean(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(4); } /** * @param attr * @return */ public double getVariance(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(5); } /** * @param attr * @return */ public double getStdDev(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(6); } /** * @param attr * @return */ public double getMin(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(7); } /** * @param attr * @return */ public double getMax(int attr) { Tuple tuple = getStats(attr, DEF_COND_ATTR_VAL); return tuple.getDouble(8); } public StatsParameters getStatsParameters(int attr) { StatsParameters stats = new StatsParameters(); stats.setMean(getMean(attr)); stats.setStdDev(getStdDev(attr)); stats.setMin(getMin(attr)); stats.setMin(getMax(attr)); return stats; } /** * @param attr * @param condAttrVal * @return */ private Tuple getKeyedStats(String compKey, int attr, String condAttrVal) { Tuple foundTuple = null; Map<Integer, List<Tuple>> stats = keyedStats.get(compKey); List<Tuple> statList = stats.get(attr); for (Tuple tuple : statList) { if (tuple.getString(0).equals(condAttrVal)) { foundTuple = tuple; break; } } return foundTuple; } /** * @param compKey * @param attr * @return */ public double getSum(String compKey,int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(1); } /** * @param compKey * @param attr * @return */ public double getSumSq(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(2); } /** * @param compKey * @param attr * @return */ public int getCount(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getInt(3); } /** * @param compKey * @param attr * @return */ public double getMean(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(4); } /** * @param compKey * @param attr * @return */ public double getVariance(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(5); } /** * @param compKey * @param attr * @return */ public double getStdDev(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(6); } /** * @param compKey * @param attr * @return */ public double getMin(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(7); } /** * @param compKey * @param attr * @return */ public double getMax(String compKey, int attr) { Tuple tuple = getKeyedStats(compKey, attr, DEF_COND_ATTR_VAL); return tuple.getDouble(8); } }