/* * chombo: Hadoop Map Reduce utility * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.chombo.util; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintWriter; import java.io.StringReader; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.codehaus.jackson.map.ObjectMapper; import com.amazonaws.auth.PropertiesCredentials; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; import 
com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; /** * Generic Utility * @author pranab * */ public class Utility { private static final String CONF_FILE_PROP_NAME = "conf.path"; private static final String FS_DEF_CONFIG_DIR = "/var/mawazo/"; private static final String HDFS_DEF_CONFIG_DIR = "/var/mawazo/"; private static final String HDFS_PREFIX = "hdfs:"; private static final int HDFS_PREFIX_LEN = 5; private static final String S3_PREFIX = "s3n:"; private static final String PROP_FILE_EXT = ".properties"; public static final Integer ZERO = 0; public static final Integer ONE = 1; public static final String DEF_FIELD_DELIM = ","; private static Pattern s3pattern = Pattern.compile("s3n:/+([^/]+)/+(.*)"); public static String configDelim = ","; public static String configSubFieldDelim = ":"; public static long MILISEC_PER_HOUR = 60L * 1000 * 1000; public static long MILISEC_PER_HALF_DAY = 12 * MILISEC_PER_HOUR; public static long MILISEC_PER_DAY = 24 * MILISEC_PER_HOUR; /** * sets configuration * @param conf * @throws Exception */ public static void setConfiguration(Configuration conf) throws Exception{ String confFilePath = conf.get("conf.path"); if (null != confFilePath){ FileInputStream fis = new FileInputStream(confFilePath); Properties configProps = new Properties(); configProps.load(fis); for (Object key : configProps.keySet()){ String keySt = key.toString(); conf.set(keySt, configProps.getProperty(keySt)); } } } /** * sets configuration and defaults to project name based config file * @param conf * @param project * @throws Exception */ public static void setConfiguration(Configuration conf, String project) throws Exception{ boolean found = false; String confFilePath = conf.get(CONF_FILE_PROP_NAME); //user provided config file path if (null != confFilePath){ if (confFilePath.startsWith(S3_PREFIX)) { loadConfigS3(conf, confFilePath); 
System.out.println("config found in user specified Amazon S3 file"); } else if (confFilePath.startsWith(HDFS_PREFIX)) { loadConfigHdfs( conf, confFilePath.substring(HDFS_PREFIX_LEN)); System.out.println("config found in user specified HDFS file"); } else { loadConfig( conf, confFilePath, false); System.out.println("config found in user specified FS file"); } } else { //default file system path confFilePath = FS_DEF_CONFIG_DIR + project + PROP_FILE_EXT; found = loadConfig( conf, confFilePath, true); //default HDFS path if (!found) { confFilePath = HDFS_DEF_CONFIG_DIR + project + PROP_FILE_EXT; loadConfigHdfs( conf, confFilePath); System.out.println("config found in default HDFS location"); } else { System.out.println("config found in default FS location"); } } } /** * @param conf * @param confFilePath * @param handleErr * @return * @throws IOException */ private static boolean loadConfig(Configuration conf, String confFilePath, boolean handleErr ) throws IOException { boolean found = false; try { FileInputStream fis = new FileInputStream(confFilePath); Properties configProps = new Properties(); configProps.load(fis); for (Object key : configProps.keySet()){ String keySt = key.toString(); conf.set(keySt, configProps.getProperty(keySt)); } found = true; } catch (FileNotFoundException ex) { if (!handleErr) { throw ex; } } return found; } /** * @param conf * @param confFilePath * @return * @throws IOException */ private static boolean loadConfigHdfs(Configuration conf, String confFilePath) throws IOException { boolean found = false; FileSystem dfs = FileSystem.get(conf); Path src = new Path(confFilePath); FSDataInputStream fis = dfs.open(src); Properties configProps = new Properties(); configProps.load(fis); for (Object key : configProps.keySet()){ String keySt = key.toString(); conf.set(keySt, configProps.getProperty(keySt)); } found = true; return found; } private static boolean loadConfigS3(Configuration conf, String confFilePath) throws IOException { Matcher matcher 
= s3pattern.matcher(confFilePath); matcher.matches(); String bucket = matcher.group(1); String key = matcher.group(2); AmazonS3 s3 = new AmazonS3Client(new PropertiesCredentials(Utility.class.getResourceAsStream("AwsCredentials.properties"))); S3Object object = s3.getObject(new GetObjectRequest(bucket, key)); InputStream is = object.getObjectContent(); Properties configProps = new Properties(); configProps.load(is); for (Object keyObj : configProps.keySet()){ String keySt = keyObj.toString(); conf.set(keySt, configProps.getProperty(keySt)); } return true; } /** * sets configuration and defaults to project name based config file * @param conf * @param project * @param filterByGroup * @throws Exception */ public static void setConfiguration(Configuration conf, String project, boolean filterByGroup) throws Exception{ if (filterByGroup) { ConfigurationLoader configLoader = new ConfigurationLoader(conf, project); configLoader.set(); } else { setConfiguration(conf, project); } } /** * @param vec * @param val */ public static <T> void initializeArray(T[] vec, T val) { for(int i = 0; i < vec.length; ++i) { vec[i] = val; } } /** * @param list * @param array */ public static <T> void toList(List<T> list, T[] array) { for (T val : array) { list.add(val); } } /** * @param map * @param itemDelim * @param keyDelim * @return */ public static <K,V> String serializeMap(Map<K, V> map, String itemDelim, String keyDelim) { StringBuilder stBld = new StringBuilder(); for (K key : map.keySet()) { stBld.append(key).append(keyDelim).append(map.get(key)).append(itemDelim); } return stBld.substring(0, stBld.length() -1); } /** * @param data * @param itemDelim * @param keyDelim * @return */ public static Map<String,String> deserializeMap(String data, String itemDelim, String keyDelim) { Map<String,String> map = new HashMap<String,String>(); String[] items = data.split(itemDelim); for (String item : items) { String[] fields = item.split(keyDelim) ; map.put(fields[0], fields[1]); } return map; 
} /** * @param conf * @param pathConfig * @return * @throws IOException */ public static InputStream getFileStream(Configuration conf, String pathConfig) throws IOException { String filePath = conf.get(pathConfig); FSDataInputStream fs = null; if (null != filePath) { FileSystem dfs = FileSystem.get(conf); Path src = new Path(filePath); fs = dfs.open(src); } return fs; } /** * @param conf * @param pathConfig * @return * @throws IOException */ public static InputStream getFileStream(String filePath) throws IOException { Configuration conf = new Configuration(); FSDataInputStream fs = null; if (null != filePath) { FileSystem dfs = FileSystem.get(conf); Path src = new Path(filePath); fs = dfs.open(src); } return fs; } /** * @param conf * @param pathConfig * @return * @throws IOException */ public static OutputStream getCreateFileOutputStream(Configuration conf, String pathConfig) throws IOException { String filePath = conf.get(pathConfig); FSDataOutputStream fs = null; if (null != filePath) { FileSystem dfs = FileSystem.get(conf); Path src = new Path(filePath); fs = dfs.create(src, true); } return fs; } /** * @param conf * @param pathConfig * @param data * @throws IOException */ public static void writeToFile(Configuration conf, String pathConfig, String data) throws IOException { OutputStream os = Utility.getCreateFileOutputStream(conf, pathConfig); PrintWriter writer = new PrintWriter(os); writer.write(data); writer.close(); os.close(); } /** * @param conf * @param pathConfig * @return * @throws IOException */ public static OutputStream getAppendFileOutputStream(Configuration conf, String pathConfig) throws IOException { String filePath = conf.get(pathConfig); FSDataOutputStream fs = null; if (null != filePath) { FileSystem dfs = FileSystem.get(conf); Path src = new Path(filePath); fs = dfs.append(src); } return fs; } /** * @param conf * @param pathConfig * @param data * @throws IOException */ public static void appendToFile(Configuration conf, String pathConfig, 
String data) throws IOException { OutputStream os = Utility.getAppendFileOutputStream(conf, pathConfig); PrintWriter writer = new PrintWriter(os); writer.write(data); writer.close(); os.close(); } /** * @param conf * @param filePathParam * @param fieldDelimRegex * @return * @throws IOException */ public static List<String[]> parseFileLines(Configuration conf, String filePathParam, String fieldDelimRegex) throws IOException { List<String[]> lines = new ArrayList<String[]>(); InputStream fs = getFileStream(conf, filePathParam); if (null != fs) { BufferedReader reader = new BufferedReader(new InputStreamReader(fs)); String line = null; String[] items = null; while((line = reader.readLine()) != null) { items = line.split(fieldDelimRegex); lines.add(items); } } return lines; } /** * @param conf * @param filePathParam * @return * @throws IOException */ public static List<String> getFileLines(Configuration conf, String filePathParam) throws IOException { List<String> lines = new ArrayList<String>(); InputStream fs = getFileStream(conf, filePathParam); if (null != fs) { BufferedReader reader = new BufferedReader(new InputStreamReader(fs)); String line = null; while((line = reader.readLine()) != null) { lines.add(line); } } return lines; } /** * @param filePath * @return * @throws IOException */ public static List<String> getFileLines(String filePath) throws IOException { List<String> lines = new ArrayList<String>(); InputStream fs = getFileStream(filePath); if (null != fs) { BufferedReader reader = new BufferedReader(new InputStreamReader(fs)); String line = null; while((line = reader.readLine()) != null) { lines.add(line); } } return lines; } /** * @param text * @param analyzer * @return * @throws IOException */ public static List<String> tokenize(String text, Analyzer analyzer) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); List<String> tokens = new ArrayList<String>(); CharTermAttribute termAttribute = 
(CharTermAttribute)stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { String token = termAttribute.toString(); tokens.add(token); } return tokens; } /** * @param data * @return */ public static String normalize(String data) { String[] items = data.toLowerCase().split("\\s+"); return items.length > 0 ? StringUtils.join(items, " ") : items[0]; } /** * @param record * @param remFieldOrdinal * @param delim * @return */ public static String removeField(String record, int[] remFieldOrdinal, String delimRegex, String delim) { StringBuilder stBld = new StringBuilder(); String[] items = record.split(delimRegex); boolean first = true; for (int i = 0; i < items.length; ++i) { if (!ArrayUtils.contains(remFieldOrdinal, i)) { if (first) { stBld.append(items[i]); first = false; } else { stBld.append(delim).append(items[i]); } } } return stBld.toString(); } /** creates tuple * @param recordItems record fields * @param remFieldOrdinal record fields to be excluded * @return */ public static void createTuple(String[] recordItems, int[] remFieldOrdinal, Tuple tuple) { tuple.initialize(); for (int i = 0; i < recordItems.length; ++i) { if (!ArrayUtils.contains(remFieldOrdinal, i)) { tuple.add(recordItems[i]); } } } /** * @param recordItems * @param filterFieldOrdinal * @param tuple * @param toInclude */ public static void createStringTuple(String[] recordItems, int[] filterFieldOrdinal, Tuple tuple, boolean toInclude) { tuple.initialize(); for (int i = 0; i < recordItems.length; ++i) { if (!toInclude && !ArrayUtils.contains(filterFieldOrdinal, i) || toInclude && ArrayUtils.contains(filterFieldOrdinal, i)) { tuple.add(recordItems[i]); } } } /** * @param recordItems * @param filterFieldOrdinal * @param tuple */ public static void createStringTuple(String[] recordItems, int[] filterFieldOrdinal, Tuple tuple) { createStringTuple(recordItems, filterFieldOrdinal, tuple, true); } /** * @param recordItems * @param filterFieldOrdinal * @param tuple * @param toInclude 
*/ public static void createIntTuple(String[] recordItems, int[] filterFieldOrdinal, Tuple tuple, boolean toInclude) { tuple.initialize(); for (int i = 0; i < recordItems.length; ++i) { if (!toInclude && !ArrayUtils.contains(filterFieldOrdinal, i) || toInclude && ArrayUtils.contains(filterFieldOrdinal, i)) { tuple.add(Integer.parseInt(recordItems[i])); } } } /** * @param recordItems * @param filterFieldOrdinal * @param tuple */ public static void createIntTuple(String[] recordItems, int[] filterFieldOrdinal, Tuple tuple) { createIntTuple(recordItems, filterFieldOrdinal, tuple, true); } /** creates tuple * @param record coma separated fields * @param tuple */ public static void createTuple(String record, Tuple tuple) { String[] items = record.split(","); createStringTuple(items, 0, items.length, tuple); } /** * @param record * @param offset * @param tuple */ public static void createStringTupleFromBegining(String[] record, int offset, Tuple tuple) { createStringTuple(record, 0, offset, tuple); } /** * @param record * @param offset * @param tuple */ public static void createStringTupleFromEnd(String[] record, int offset, Tuple tuple) { createStringTuple(record, offset, record.length, tuple); } /** * @param record * @param beg * @param end * @param tuple */ public static void createStringTuple(String[] record, int beg, int end, Tuple tuple) { tuple.initialize(); for (int i = beg; i < end; ++i) { tuple.add(record[i]); } } /** * @param record * @param delimRegex * @return */ public static int[] intArrayFromString(String record, String delimRegex ) { int[] data = null; if (null != record) { String[] items = record.split(delimRegex); data = new int[items.length]; for (int i = 0; i < items.length; ++i) { data[i] = Integer.parseInt(items[i]); } } return data; } /** * @param record * @return */ public static int[] intArrayFromString(String record) { return intArrayFromString(record, DEF_FIELD_DELIM); } /** * @param record * @param delimRegex * @return */ public static 
double[] doubleArrayFromString(String record, String delimRegex ) { String[] items = record.split(delimRegex); double[] data = new double[items.length]; for (int i = 0; i < items.length; ++i) { data[i] = Double.parseDouble(items[i]); } return data; } /** * @param record * @return */ public static double[] doubleArrayFromString(String record) { return doubleArrayFromString(record, DEF_FIELD_DELIM); } /** * @param items * @param fields * @return */ public static String[] extractFieldsAsStringArray(String[] items , int[] fields) { String[] fieldValues = new String[fields.length]; for (int i = 0; i < fields.length; ++i) { fieldValues[i] = items[fields[i]]; } return fieldValues; } /** * @param items * @param fields * @return */ public static int[] extractFieldsAsIntArray(String[] items , int[] fields) { int[] fieldValues = new int[fields.length]; for (int i = 0; i < fields.length; ++i) { fieldValues[i] = Integer.parseInt((items[fields[i]])); } return fieldValues; } /** * @param items * @param fields * @param delim * @param sortKeyFields * @return */ public static String extractFields(String[] items , int[] fields, String delim, boolean sortKeyFields) { StringBuilder stBld = new StringBuilder(); List<String> keyFields = new ArrayList<String>(); for (int i = 0; i < fields.length; ++i) { keyFields.add(items[fields[i]]); } if (sortKeyFields) { Collections.sort(keyFields); } boolean first = true; for (String key : keyFields) { if (first) { stBld.append(key); first = false; } else { stBld.append(delim).append(key); } } return stBld.toString(); } /** * @param items * @param filteredFields * @return */ public static String[] filterOutFields(String[] items , int[] filteredFields) { String[] extractedFields = new String[items.length - filteredFields.length ]; for (int i = 0, j=0; i < items.length; ++i) { if (!ArrayUtils.contains(filteredFields, i)) { extractedFields[j++] = items[i]; } } return extractedFields; } /** * @param from * @param toBeRemoved * @return */ public static 
int[] removeItems(int[] from, int[] toBeRemoved) { int[] subtracted = null; List<Integer> subtractedList = new ArrayList<Integer>(); for (int i = 0; i < from.length; ++i) { int item = from[i]; if (!ArrayUtils.contains(toBeRemoved, item)) { subtractedList.add(item); } } subtracted = fromListToIntArray(subtractedList); return subtracted; } /** * @param valueList * @return */ public static int[] fromListToIntArray(List<Integer> valueList) { int[] values = new int[valueList.size()]; for (int i = 0; i < valueList.size(); ++i) { values[i] = valueList.get(i); } return values; } /** * @param values * @return */ public static List<Integer> fromIntArrayToList( int[] values) { List<Integer> valueList = new ArrayList<Integer>(); for (int value : values) { valueList.add(value);; } return valueList; } /** * @param list * @return */ public static <T> String join(List<T> list, String delim) { String joined = null; if (list.size() == 1) { joined = list.get(0).toString(); } else { StringBuilder stBld = new StringBuilder(); for (T obj : list) { stBld.append(obj).append(delim); } joined = stBld.substring(0, stBld.length() -1); } return joined; } /** * @param list * @param begIndex * @param endIndex * @param delim * @return */ public static <T> String join(List<T> list, int begIndex, int endIndex, String delim) { StringBuilder stBld = new StringBuilder(); for (int i = begIndex; i < endIndex; ++i) { stBld.append(list.get(i)).append(delim); } return stBld.substring(0, stBld.length() -1); } /** * @param list * @return */ public static <T> String join(List<T> list) { return join(list, ","); } /** * @param arr * @param delim * @return */ public static <T> String join(T[] arr, String delim) { StringBuilder stBld = new StringBuilder(); for (T obj : arr) { stBld.append(obj).append(delim); } return stBld.substring(0, stBld.length() -1); } /** * @param arr * @param begIndex * @param endIndex * @param delim * @return */ public static <T> String join(T[] arr, int begIndex, int endIndex, String 
delim) { StringBuilder stBld = new StringBuilder(); for (int i = begIndex; i < endIndex; ++i) { stBld.append(arr[i]).append(delim); } return stBld.substring(0, stBld.length() -1); } /** * @param arr * @param obj * @return */ public static <T> int getIndex(T[] arr, T obj) { int i = 0; boolean found = false; for (T thisObj : arr) { if (thisObj.equals(obj)) { found = true; break; } ++i; } if (!found) { throw new IllegalArgumentException("object not found in array"); } return i; } /** * @param arr * @return */ public static <T> String join(T[] arr) { return join(arr, ","); } /** * @param arr * @param begIndex * @param endIndex * @return */ public static <T> String join(T[] arr, int begIndex, int endIndex) { return join(arr, begIndex, endIndex, ","); } /** * @param arr * @param indexes * @param delim * @return */ public static <T> String join(T[] arr, int[] indexes, String delim) { StringBuilder stBld = new StringBuilder(); for (int index : indexes) { stBld.append(arr[index]).append(delim); } return stBld.substring(0, stBld.length() -1); } /** * @param arr * @param indexes * @return */ public static <T> String join(T[] arr, int[] indexes) { return join(arr, indexes, ","); } /** * @param table * @param data * @param delim * @param row * @param numCol */ public static void deseralizeTableRow(double[][] table, String data, String delim, int row, int numCol) { String[] items = data.split(delim); if (items.length != numCol) { throw new IllegalArgumentException( "Row serialization failed, number of tokens in string does not match with number of columns"); } for (int c = 0; c < numCol; ++c) { table[row][c] = Double.parseDouble(items[c]); } } /** * @param table * @param data * @param delim * @param row * @param numCol */ public static void deseralizeTableRow(int[][] table, String data, String delim, int row, int numCol) { String[] items = data.split(delim); int k = 0; for (int c = 0; c < numCol; ++c) { table[row][c] = Integer.parseInt(items[k++]); } } /** * Returns sibling path 
* @param path * @param sibling * @return */ public static String getSiblingPath(String path, String sibling) { int pos = path.lastIndexOf('/'); return path.substring(0, pos + 1) + sibling; } /** * @param data * @return */ public static boolean isBlank(String data) { return data == null || data.isEmpty(); } /** * @param record * @param fieldDelim * @param subFieldDelim * @return */ public static List<Pair<Integer, Integer>> getIntPairList(String record, String fieldDelim, String subFieldDelim) { List<Pair<Integer, Integer>> intPairs = new ArrayList<Pair<Integer, Integer>>(); String[] items = record.split(fieldDelim); for (String item : items) { String[] subItems = item.split(subFieldDelim); Pair<Integer, Integer> pair = new Pair<Integer, Integer>(Integer.parseInt(subItems[0]), Integer.parseInt(subItems[1])); intPairs.add(pair); } return intPairs; } /** * @param record * @param fieldDelim * @param subFieldDelim * @return */ public static List<Pair<Integer, String>> getIntStringList(String record, String fieldDelim, String subFieldDelim) { List<Pair<Integer, String>> intStringPairs = new ArrayList<Pair<Integer, String>>(); String[] items = record.split(fieldDelim); for (String item : items) { String[] subItems = item.split(subFieldDelim); Pair<Integer, String> pair = new Pair<Integer, String>(Integer.parseInt(subItems[0]), subItems[1]); intStringPairs.add(pair); } return intStringPairs; } /** * @param record * @param fieldDelim * @param subFieldDelim * @return */ public static List<Pair<String, String>> getStringPairList(String record, String fieldDelim, String subFieldDelim) { List<Pair<String, String>> stringStringPairs = new ArrayList<Pair<String, String>>(); String[] items = record.split(fieldDelim); for (String item : items) { String[] subItems = item.split(subFieldDelim); Pair<String, String> pair = new Pair<String, String>(subItems[0], subItems[1]); stringStringPairs.add(pair); } return stringStringPairs; } /** * @return */ public static String generateId() { 
return UUID.randomUUID().toString().replaceAll("-", "");
    }

    /**
     * Returns the value of a mandatory configuration parameter. Delegates to
     * {@link #assertStringConfigParam(Configuration, String, String)}.
     * @param config configuration to read from
     * @param param parameter name
     * @param msg message of the exception thrown when the parameter is not set
     * @return parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static String assertConfigParam(Configuration config, String param, String msg) {
        return assertStringConfigParam(config, param, msg);
    }

    /**
     * Returns the value of a mandatory string configuration parameter.
     * @param config configuration to read from
     * @param param parameter name
     * @param msg message of the exception thrown when the parameter is not set
     * @return parameter value, never null
     * @throws IllegalStateException if the parameter is not set
     */
    public static String assertStringConfigParam(Configuration config, String param, String msg) {
        String value = config.get(param);
        if (value == null) {
            throw new IllegalStateException(msg);
        }
        return value;
    }

    /**
     * Returns the value of a mandatory int configuration parameter.
     * @param config configuration to read from
     * @param param parameter name
     * @param msg message of the exception thrown when the parameter is not set
     * @return value as returned by {@code config.getInt(param, Integer.MIN_VALUE)}
     * @throws IllegalStateException if the parameter is not set
     */
    public static int assertIntConfigParam(Configuration config, String param, String msg) {
        //existence check only, the value itself is read through getInt
        assertStringConfigParam(config, param, msg);
        return config.getInt(param, Integer.MIN_VALUE);
    }

    /**
     * Returns the value of a mandatory double configuration parameter.
     * @param config configuration to read from
     * @param param parameter name
     * @param msg message of the exception thrown when the parameter is not set
     * @return value parsed with {@link Double#parseDouble(String)}
     * @throws IllegalStateException if the parameter is not set
     */
    public static double assertDoubleConfigParam(Configuration config, String param, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        return Double.parseDouble(stParamValue);
    }

    /**
     * Returns the value of a mandatory boolean configuration parameter.
     * @param config configuration to read from
     * @param param parameter name
     * @param msg message of the exception thrown when the parameter is not set
     * @return value as returned by {@code config.getBoolean(param, false)}
     * @throws IllegalStateException if the parameter is not set
     */
    public static boolean assertBooleanConfigParam(Configuration config, String param, String msg) {
        //existence check only, the value itself is read through getBoolean
        assertStringConfigParam(config, param, msg);
        return config.getBoolean(param, false);
    }

    /**
     * Returns a mandatory configuration parameter converted to an int array.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex delimiter passed to intArrayFromString along with the value
     * @param msg message of the exception thrown when the parameter is not set
     * @return result of intArrayFromString for the parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static int[] assertIntArrayConfigParam(Configuration config, String param, String delimRegex, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        return intArrayFromString(stParamValue, delimRegex);
    }

    /**
     * Returns a mandatory configuration parameter split into a string array.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression used to split the value
     * @param msg message of the exception thrown when the parameter is not set
     * @return items of the parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static String[] assertStringArrayConfigParam(Configuration config, String param, String delimRegex, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        return stParamValue.split(delimRegex);
    }

    /**
     * Returns a mandatory configuration parameter converted to a double array.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex delimiter passed to doubleArrayFromString along with the value
     * @param msg message of the exception thrown when the parameter is not set
     * @return result of doubleArrayFromString for the parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static double[] assertDoubleArrayConfigParam(Configuration config, String param, String delimRegex, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        return doubleArrayFromString(stParamValue, delimRegex);
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of string key to
     * integer value. Each item must have a key and a value separated by subFieldDelim.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<String, Integer> assertStringIntegerMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        String[] items = stParamValue.split(delimRegex);
        Map<String, Integer> data = new HashMap<String, Integer>();
        for (String item : items) {
            String[] parts = item.split(subFieldDelim);
            data.put(parts[0], Integer.parseInt(parts[1]));
        }
        return data;
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of string key to
     * double value. Each item must have a key and a value separated by subFieldDelim.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<String, Double> assertStringDoubleMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        String[] items = stParamValue.split(delimRegex);
        Map<String, Double> data = new HashMap<String, Double>();
        for (String item : items) {
            String[] parts = item.split(subFieldDelim);
            data.put(parts[0], Double.parseDouble(parts[1]));
        }
        return data;
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of integer key to
     * integer value, with range expansion of keys enabled. Delegates to
     * {@link #assertIntegerIntegerMapConfigParam(Configuration, String, String, String, String, boolean)}
     * with rangeInKey set to true.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<Integer, Integer> assertIntIntegerIntegerMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg) {
        return assertIntegerIntegerMapConfigParam(config, param, delimRegex, subFieldDelim, msg, true);
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of integer key to
     * integer value. When rangeInKey is true, a key of the form beg-end is expanded
     * so that every integer from beg to end inclusive maps to the item value.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @param rangeInKey true if keys may be ranges of the form beg-end
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<Integer, Integer> assertIntegerIntegerMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg, boolean rangeInKey) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        String[] items = stParamValue.split(delimRegex);
        Map<Integer, Integer> data = new HashMap<Integer, Integer>();
        if (rangeInKey) {
            for (String item : items) {
                String[] parts = item.split(subFieldDelim);
                String[] rangeLimits = parts[0].split("\\-");
                if (rangeLimits.length == 1) {
                    //plain key
                    data.put(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
                } else {
                    int rangeBeg = Integer.parseInt(rangeLimits[0]);
                    int rangeEnd = Integer.parseInt(rangeLimits[1]);
                    int val = Integer.parseInt(parts[1]);
                    for (int r = rangeBeg; r <= rangeEnd; ++r) {
                        //key:hour value:hour group
                        data.put(r, val);
                    }
                }
            }
        } else {
            for (String item : items) {
                String[] parts = item.split(subFieldDelim);
                data.put(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
            }
        }
        return data;
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of integer key to
     * double value.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<Integer, Double> assertIntegerDoubleMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        String[] items = stParamValue.split(delimRegex);
        Map<Integer, Double> data = new HashMap<Integer, Double>();
        for (String item : items) {
            String[] parts = item.split(subFieldDelim);
            data.put(Integer.parseInt(parts[0]), Double.parseDouble(parts[1]));
        }
        return data;
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of integer key to
     * string value.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<Integer, String> assertIntegerStringMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg) {
        String stParamValue = assertStringConfigParam(config, param, msg);
        String[] items = stParamValue.split(delimRegex);
        Map<Integer, String> data = new HashMap<Integer, String>();
        for (String item : items) {
            String[] parts = item.split(subFieldDelim);
            data.put(Integer.parseInt(parts[0]), parts[1]);
        }
        return data;
    }

    /**
     * Returns a mandatory configuration parameter parsed as a map of string key to
     * double value. Kept for existing callers; delegates to
     * {@link #assertStringDoubleMapConfigParam(Configuration, String, String, String, String)}.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @param msg message of the exception thrown when the parameter is not set
     * @return map built from the items
     * @throws IllegalStateException if the parameter is not set
     */
    public static Map<String, Double> assertDoubleMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim, String msg) {
        return assertStringDoubleMapConfigParam(config, param, delimRegex, subFieldDelim, msg);
    }

    /**
     * Returns a mandatory configuration parameter converted by getStringPairList.
     * @param config configuration to read from
     * @param param parameter name
     * @param fieldDelim field delimiter passed to getStringPairList
     * @param subFieldDelim sub field delimiter passed to getStringPairList
     * @param msg message of the exception thrown when the parameter is not set
     * @return result of getStringPairList for the parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static List<Pair<String, String>> assertStringPairListConfigParam(Configuration config, String param,
            String fieldDelim, String subFieldDelim, String msg) {
        String record = assertStringConfigParam(config, param, msg);
        return getStringPairList(record, fieldDelim, subFieldDelim);
    }

    /**
     * Returns a mandatory configuration parameter converted by getIntStringList.
     * @param config configuration to read from
     * @param param parameter name
     * @param fieldDelim field delimiter passed to getIntStringList
     * @param subFieldDelim sub field delimiter passed to getIntStringList
     * @param msg message of the exception thrown when the parameter is not set
     * @return result of getIntStringList for the parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static List<Pair<Integer, String>> assertIntStringListConfigParam(Configuration config, String param,
            String fieldDelim, String subFieldDelim, String msg) {
        String record = assertStringConfigParam(config, param, msg);
        return getIntStringList(record, fieldDelim, subFieldDelim);
    }

    /**
     * Returns a mandatory configuration parameter converted by getIntPairList.
     * @param config configuration to read from
     * @param param parameter name
     * @param fieldDelim field delimiter passed to getIntPairList
     * @param subFieldDelim sub field delimiter passed to getIntPairList
     * @param msg message of the exception thrown when the parameter is not set
     * @return result of getIntPairList for the parameter value
     * @throws IllegalStateException if the parameter is not set
     */
    public static List<Pair<Integer, Integer>> assertIntPairListConfigParam(Configuration config, String param,
            String fieldDelim, String subFieldDelim, String msg) {
        String record = assertStringConfigParam(config, param, msg);
        return getIntPairList(record, fieldDelim, subFieldDelim);
    }

    /**
     * Returns an optional configuration parameter split into a string array.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression used to split the value
     * @return items of the parameter value, or null if the parameter is not set
     */
    public static String[] optionalStringArrayConfigParam(Configuration config, String param, String delimRegex) {
        String[] data = null;
        String stParamValue = config.get(param);
        if (null != stParamValue) {
            data = stParamValue.split(delimRegex);
        }
        return data;
    }

    /**
     * Returns an optional configuration parameter converted to an int array.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex delimiter passed to intArrayFromString along with the value
     * @return result of intArrayFromString, or null if the parameter is not set
     */
    public static int[] optionalIntArrayConfigParam(Configuration config, String param, String delimRegex) {
        int[] data = null;
        String stParamValue = config.get(param);
        if (null != stParamValue) {
            data = intArrayFromString(stParamValue, delimRegex);
        }
        return data;
    }

    /**
     * Returns an optional configuration parameter converted to a double array.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex delimiter passed to doubleArrayFromString along with the value
     * @return result of doubleArrayFromString, or null if the parameter is not set
     */
    public static double[] optionalDoubleArrayConfigParam(Configuration config, String param, String delimRegex) {
        double[] data = null;
        String stParamValue = config.get(param);
        if (null != stParamValue) {
            data = doubleArrayFromString(stParamValue, delimRegex);
        }
        return data;
    }

    /**
     * Returns an optional configuration parameter parsed as a map of string key to
     * double value.
     * @param config configuration to read from
     * @param param parameter name
     * @param delimRegex regular expression separating the items
     * @param subFieldDelim regular expression separating key and value within an item
     * @return map built from the items, or null if the parameter is not set
     */
    public static Map<String, Double> OptionalStringDoubleMapConfigParam(Configuration config, String param,
            String delimRegex, String subFieldDelim) {
        Map<String, Double> data = null;
        String stParamValue = config.get(param);
        if (null != stParamValue) {
            String[] items = stParamValue.split(delimRegex);
            data = new HashMap<String, Double>();
            for (String item : items) {
                String[] parts = item.split(subFieldDelim);
                data.put(parts[0], Double.parseDouble(parts[1]));
            }
        }
        return data;
    }

    /**
     * Picks one element of a list at a random position.
     * @param list list to pick from
     * @return randomly selected element
     */
    public static <T> T selectRandom(List<T> list) {
        int index = (int)(Math.random() * list.size());
        return list.get(index);
    }

    /**
     * Checks that a record has the expected number of fields.
     * @param record fields of the record
     * @param numFields expected number of fields
     * @param failOnInvalid if true an exception is thrown instead of returning false
     * @return true if the field count matches
     * @throws IllegalArgumentException if the count does not match and failOnInvalid is true
     */
    public static boolean isFieldCountValid(String[] record, int numFields, boolean failOnInvalid) {
        boolean valid = true;
        if (record.length != numFields) {
            valid = false;
            if (failOnInvalid) {
                throw new IllegalArgumentException("invalid field count expected " + numFields + " found " + record.length);
            }
        }
        return valid;
    }

    /**
     * Splits a record into fields, keeping trailing empty fields, and validates the
     * field count.
     * @param record record to split
     * @param fieldDelim field delimiter, used as regular expression for the split
     * @param numFields expected number of fields
     * @param failOnInvalid if true an exception is thrown for a wrong field count
     * @return fields, or null if the field count is wrong and failOnInvalid is false
     * @throws IllegalArgumentException if the field count is wrong and failOnInvalid is true
     */
    public static String[] splitFields(String record, String fieldDelim, int numFields, boolean failOnInvalid) {
        String[] items = record.split(fieldDelim, -1);
        if (items.length != numFields) {
            //captured up front because items is set to null below for invalid records
            int foundCount = items.length;
            if (items.length < numFields) {
                //check if trailing blank fields
                int delimCount = StringUtils.countMatches(record, fieldDelim);
                if (delimCount == numFields - 1) {
                    //trailing blank fields
                    String[] extItems = new String[numFields];
                    for (int i = 0; i < numFields; ++i) {
                        if (i < items.length) {
                            extItems[i] = items[i];
                        } else {
                            //fill trailing fields with blanks
                            extItems[i] = "";
                        }
                    }
                    items = extItems;
                } else {
                    //got too few fields
                    items = null;
                }
            } else {
                //got too many fields
                items = null;
            }
            if (null == items && failOnInvalid) {
                throw new IllegalArgumentException("invalid field count expected " + numFields + " found " + foundCount);
            }
        }
        return items;
    }

    /**
     * Splits a record into fields and validates the field count. Unlike
     * {@link #splitFields(String, String, int, boolean)} the split is done without a
     * limit argument.
     * @param record record to split
     * @param fieldDelem field delimiter, used as regular expression for the split
     * @param numFields expected number of fields
     * @param failOnInvalid if true an exception is thrown for a wrong field count
     * @return fields, or null if the field count is wrong and failOnInvalid is false
     * @throws IllegalArgumentException if the field count is wrong and failOnInvalid is true
     */
    public static String[] getFields(String record, String fieldDelem, int numFields, boolean failOnInvalid) {
        String[] fields = record.split(fieldDelem);
        if (fields.length != numFields) {
            if (failOnInvalid) {
                throw new IllegalArgumentException("invalid field count expected " + numFields + " found " + fields.length);
            }
            fields = null;
        }
        return fields;
    }

    /**
     * Checks whether any of the fields is an empty string.
     * @param items fields to check
     * @return true if at least one field is empty
     */
    public static boolean anyEmptyField(String[] items) {
        boolean isEmpty = false;
        for (String item : items) {
            if (item.isEmpty()) {
                isEmpty = true;
                break;
            }
        }
        return isEmpty;
    }

    /**
     * Loads a HOCON configuration from the file whose path is held by a configuration
     * parameter.
     * @param conf configuration holding the path parameter
     * @param pathConfig name of the parameter holding the file path
     * @return parsed config object, or null if the parameter is not set
     * @throws IOException
     */
    public static Config getHoconConfig(Configuration conf, String pathConfig) throws IOException {
        Config config = null;
        if (null != conf.get(pathConfig)) {
            InputStream is = getFileStream(conf, pathConfig);
            BufferedReader bufRead = new BufferedReader(new InputStreamReader(is));
            config = ConfigFactory.parseReader(bufRead);
        }
        return config;
    }

    /**
     * Loads a HOCON configuration from a file.
     * @param filePath path of the file
     * @return parsed config object, or null if filePath is null
     * @throws IOException
     */
    public static Config getHoconConfig(String filePath) throws IOException {
        Config config = null;
        if (null != filePath) {
            InputStream is = getFileStream(filePath);
            BufferedReader bufRead = new BufferedReader(new InputStreamReader(is));
            config = ConfigFactory.parseReader(bufRead);
        }
        return config;
    }

    /**
     * Loads a rich attribute schema from a JSON file opened through the Hadoop file
     * system of the configuration.
     * @param conf configuration holding the path parameter
     * @param pathParam name of the parameter holding the file path
     * @return schema read from the file
     * @throws IOException
     */
    public static RichAttributeSchema getRichAttributeSchema(Configuration conf, String pathParam) throws IOException {
        String filePath = conf.get(pathParam);
        FileSystem dfs = FileSystem.get(conf);
        Path src = new Path(filePath);
        FSDataInputStream fs = dfs.open(src);
        ObjectMapper mapper = new ObjectMapper();
        return mapper.readValue(fs, RichAttributeSchema.class);
    }

    /**
     * Loads a generic attribute schema from a JSON file.
     * @param conf configuration holding the path parameter
     * @param pathParam name of the parameter holding the file path
     * @return schema read from the file, or null if getFileStream returns no stream
     * @throws IOException
     */
    public static GenericAttributeSchema getGenericAttributeSchema(Configuration conf, String pathParam) throws IOException {
        GenericAttributeSchema schema = null;
        InputStream is = Utility.getFileStream(conf, pathParam);
        if (null != is) {
            ObjectMapper mapper = new ObjectMapper();
            schema = mapper.readValue(is, GenericAttributeSchema.class);
        }
        return schema;
    }

    /**
     * Loads a feature schema from a JSON file.
     * @param conf configuration holding the path parameter
     * @param pathParam name of the parameter holding the file path
     * @return schema read from the file, or null if getFileStream returns no stream
     * @throws IOException
     */
    public static FeatureSchema getFeatureSchema(Configuration conf, String pathParam) throws IOException {
        FeatureSchema schema = null;
        InputStream is = Utility.getFileStream(conf, pathParam);
        if (null != is) {
            ObjectMapper mapper = new ObjectMapper();
            schema = mapper.readValue(is, FeatureSchema.class);
        }
        return schema;
    }

    /**
     * Loads a processing schema from a JSON file whose path is held by a configuration
     * parameter.
     * @param conf configuration holding the path parameter
     * @param pathParam name of the parameter holding the file path
     * @return schema read from the file
     * @throws IOException
     */
    public static ProcessorAttributeSchema getProcessingSchema(Configuration conf, String pathParam) throws IOException {
        InputStream is = Utility.getFileStream(conf, pathParam);
        ObjectMapper mapper = new ObjectMapper();
        return mapper.readValue(is, ProcessorAttributeSchema.class);
    }

    /**
     * Loads a processing schema from a JSON file.
     * @param filePath path of the file
     * @return schema read from the file
     * @throws IOException
     */
    public static ProcessorAttributeSchema getProcessingSchema(String filePath) throws IOException {
        InputStream is = Utility.getFileStream(filePath);
        ObjectMapper mapper = new ObjectMapper();
        return mapper.readValue(is, ProcessorAttributeSchema.class);
    }

    /**
     * Collects the values of the given configuration parameters into a map.
     * @param config configuration to read from
     * @param params names of the parameters to collect
     * @return map of parameter name to the value returned by config.get for it
     */
    public static Map<String, String> collectConfiguration(Configuration config, String... params) {
        Map<String, String> collectedConfig = new HashMap<String, String>();
        for (String param : params) {
            collectedConfig.put(param, config.get(param));
        }
        return collectedConfig;
    }

    /**
     * Randomly selects a given number of distinct items from a list.
     * @param list source list
     * @param count number of items to select
     * @return the source list itself if count equals its size, otherwise a new list of
     * count distinct items
     * @throws IllegalArgumentException if count is larger than the list size, or larger
     * than the number of distinct items in the list
     */
    public static <T> List<T> selectRandomFromList(List<T> list, int count) {
        List<T> selList = null;
        if (count > list.size()) {
            throw new IllegalArgumentException("new list size is larget than source list size");
        } else if (count == list.size()) {
            selList = list;
        } else {
            //selection is by distinct value, so without this check the loop below would
            //never terminate for a list with too many duplicates
            if (count > new HashSet<T>(list).size()) {
                throw new IllegalArgumentException("source list has fewer distinct items than requested count");
            }
            selList = new ArrayList<T>();
            Set<T> selSet = new HashSet<T>();
            while (selSet.size() != count) {
                int index = (int)(Math.random() * list.size());
                selSet.add(list.get(index));
            }
            selList.addAll(selSet);
        }
        return selList;
    }

    /**
     * Makes a shallow copy of a list.
     * @param curList list to copy
     * @return new list with the same elements
     */
    public static <T> List<T> cloneList(List<T> curList) {
        List<T> newList = new ArrayList<T>();
        newList.addAll(curList);
        return newList;
    }

    /**
     * Finds the items of a list that are not contained in another list.
     * @param list source list
     * @param subList items to exclude
     * @return new list with the items of list not found in subList, in original order
     */
    public static <T> List<T> listDifference(List<T> list, List<T> subList) {
        List<T> diff = new ArrayList<T>();
        for (T item : list) {
            if (!subList.contains(item)) {
                diff.add(item);
            }
        }
        return diff;
    }

    /**
     * Generates all sub lists, up to a maximum size, that preserve the item order of
     * the source list.
     * @param list source list
     * @param maxSubListSize maximum size of a generated sub list
     * @return generated sub lists
     */
    public static <T> List<List<T>> generateSublists(List<T> list, int maxSubListSize) {
        List<List<T>> subLists = new ArrayList<List<T>>();

        //for each item in list generate sublists up to max length
        for (int i = 0; i < list.size(); ++i) {
            List<T> subList = new ArrayList<T>();
            subList.add(list.get(i));
            subLists.add(subList);
            //the helper always adds lists one item bigger than the one passed, so only
            //grow while below max size
            if (subList.size() < maxSubListSize) {
                generateSublists(list, subList, i, subLists, maxSubListSize);
            }
        }
        return subLists;
    }

    /**
     * generates sub lists of varying size from a list. Every list made of subList plus
     * one later item of the source list is added to subLists, then grown recursively
     * while it is smaller than maxSubListSize.
     * @param list source list
     * @param subList sub list to grow
     * @param lastIndex index in list of the last item of subList
     * @param subLists collector for the generated sub lists
     * @param maxSubListSize size below which a generated sub list is grown further
     */
    public static <T> void generateSublists(List<T> list, List<T> subList, int lastIndex,
            List<List<T>> subLists, int maxSubListSize) {
        for (int i = lastIndex + 1; i < list.size(); ++i) {
            List<T> biggerSubList = new ArrayList<T>();
            biggerSubList.addAll(subList);
            biggerSubList.add(list.get(i));
            subLists.add(biggerSubList);
            if (biggerSubList.size() < maxSubListSize) {
                //recurse
                generateSublists(list, biggerSubList, i, subLists, maxSubListSize);
            }
        }
    }

    /**
     * Takes user specified attributes or builds list of attributes of right type from schema
     * @param attrListParam name of the configuration parameter holding the attribute ordinals
     * @param configDelim delimiter of the attribute ordinal list
     * @param schema schema, may be null
     * @param config configuration to read from
     * @param includeTypes types passed to schema.getQuantAttributes
     * @return attribute ordinals
     * @throws IllegalStateException if neither the parameter nor the schema is available
     * @throws IllegalArgumentException if a user specified ordinal is not among the
     * attributes returned by the schema
     */
    public static int[] getAttributes(String attrListParam, String configDelim, GenericAttributeSchema schema,
            Configuration config, String... includeTypes) {
        //a missing parameter must reach the schema fallback below as null
        String attrList = config.get(attrListParam);
        int[] attributes = null != attrList ? Utility.intArrayFromString(attrList, configDelim) : null;
        List<Attribute> attrsMetaData = schema != null ? schema.getQuantAttributes(includeTypes) : null;

        if (null == attributes) {
            //use schema and pick all attributes of right type
            if (null == attrsMetaData) {
                throw new IllegalStateException("Neither attribute ordinal list ot schema available");
            }
            attributes = new int[attrsMetaData.size()];
            for (int i = 0; i < attrsMetaData.size(); ++i) {
                attributes[i] = attrsMetaData.get(i).getOrdinal();
            }
        } else {
            //use user provided but verify type
            if (null != attrsMetaData) {
                //if schema is available
                for (int ord : attributes) {
                    boolean found = false;
                    for (Attribute attr : attrsMetaData) {
                        if (attr.getOrdinal() == ord) {
                            found = true;
                            break;
                        }
                    }
                    if (!found) {
                        throw new IllegalArgumentException("attribute not found in metada");
                    }
                }
            }
        }
        return attributes;
    }

    /**
     * Re-initializes a tuple and fills it with the selected fields of a record. Fields
     * of int and long data type are added as numbers, all others as string.
     * @param record fields of the record
     * @param attributes ordinals of the fields to add, also used as index into record
     * @param schema schema used to look up the data type of each attribute
     * @param tuple tuple to fill
     */
    public static void intializeTuple(String[] record, int[] attributes, GenericAttributeSchema schema, Tuple tuple) {
        tuple.initialize();
        for (int attr : attributes) {
            String dataType = schema.findAttribute(attr).getDataType();
            if (dataType.equals(Attribute.DATA_TYPE_INT)) {
                tuple.add(Integer.parseInt(record[attr]));
            } else if (dataType.equals(Attribute.DATA_TYPE_LONG)) {
                tuple.add(Long.parseLong(record[attr]));
            } else {
                tuple.add(record[attr]);
            }
        }
    }

    /**
     * Formats a double with a fixed number of digits after the decimal point.
     * @param val value to format
     * @param prec number of digits after the decimal point
     * @return formatted value
     */
    public static String formatDouble(double val, int prec) {
        String formatter = "%." + prec + "f";
        return String.format(formatter, val);
    }

    /**
     * Formats an int zero padded to a minimum width.
     * @param val value to format
     * @param size minimum width
     * @return formatted value
     */
    public static String formatInt(int val, int size) {
        String formatter = "%0" + size + "d";
        return String.format(formatter, val);
    }

    /**
     * Formats a long zero padded to a minimum width.
     * @param val value to format
     * @param size minimum width
     * @return formatted value
     */
    public static String formatLong(long val, int size) {
        String formatter = "%0" + size + "d";
        return String.format(formatter, val);
    }

    /**
     * Analyzes text and return analyzed text
     * @param text text to analyze
     * @param analyzer analyzer producing the tokens
     * @return tokens, each followed by a space
     * @throws IOException
     */
    public static String analyze(String text, Analyzer analyzer) throws IOException {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        StringBuilder stBld = new StringBuilder();
        try {
            stream.reset();
            CharTermAttribute termAttribute = (CharTermAttribute)stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                String token = termAttribute.toString();
                stBld.append(token).append(" ");
            }
            stream.end();
        } finally {
            //release the stream even if tokenizing fails
            stream.close();
        }
        return stBld.toString();
    }

    /**
     * Converts a formatted date time string to epoch time.
     * @param dateTimeStamp formatted date time
     * @param dateFormat format used to parse dateTimeStamp
     * @return epoch time in the unit returned by Date.getTime
     * @throws ParseException
     */
    public static long getEpochTime(String dateTimeStamp, SimpleDateFormat dateFormat) throws ParseException {
        return getEpochTime(dateTimeStamp, false, dateFormat, 0);
    }

    /**
     * Converts a date time string to epoch time.
     * @param dateTimeStamp epoch time as string or formatted date time
     * @param isEpochTime true if dateTimeStamp is already an epoch time
     * @param dateFormat format used to parse dateTimeStamp when it is not an epoch time
     * @return epoch time
     * @throws ParseException
     */
    public static long getEpochTime(String dateTimeStamp, boolean isEpochTime, SimpleDateFormat dateFormat) throws ParseException {
        return getEpochTime(dateTimeStamp, isEpochTime, dateFormat, 0);
    }

    /**
     * Converts a date time string to epoch time, optionally shifted by a number of
     * hours. The shift is applied only when the input is a formatted date time.
     * @param dateTimeStamp epoch time as string or formatted date time
     * @param isEpochTime true if dateTimeStamp is already an epoch time
     * @param dateFormat format used to parse dateTimeStamp when it is not an epoch time
     * @param timeZoneShiftHour number of hours to add, multiplied by MILISEC_PER_HOUR
     * @return epoch time
     * @throws ParseException
     */
    public static long getEpochTime(String dateTimeStamp, boolean isEpochTime, SimpleDateFormat dateFormat,
            int timeZoneShiftHour) throws ParseException {
        long epochTime = 0;
        if (isEpochTime) {
            epochTime = Long.parseLong(dateTimeStamp);
        } else {
            epochTime = dateFormat.parse(dateTimeStamp).getTime();
            //NOTE(review): MILISEC_PER_HOUR is declared as 60L * 1000 * 1000 in this class,
            //which is not 3,600,000 - confirm the constant
            epochTime += timeZoneShiftHour * MILISEC_PER_HOUR;
        }
        return epochTime;
    }

    /**
     * Finds the field delimiter, falling back to a default parameter and then to a
     * default value.
     * @param config configuration to read from
     * @param fieldDelimParam name of the parameter holding the delimiter
     * @param defFieldDelimParam name of the parameter holding the default delimiter
     * @param defFieldDelim delimiter used when neither parameter is set
     * @return field delimiter
     */
    public static String getFieldDelimiter(Configuration config, String fieldDelimParam,
            String defFieldDelimParam, String defFieldDelim) {
        String fieldDelim = config.get(fieldDelimParam);
        if (null == fieldDelim) {
            //get default
            fieldDelim = config.get(defFieldDelimParam, defFieldDelim);
        }
        return fieldDelim;
    }

    /**
     * Converts an epoch time to a coarser time unit by integer division with
     * MILISEC_PER_HOUR or MILISEC_PER_DAY.
     * @param epochTime epoch time
     * @param timeUnit "hour" or "day"
     * @return epoch time in the requested unit
     * @throws IllegalArgumentException for any other time unit
     */
    public static long convertTimeUnit(long epochTime, String timeUnit) {
        //NOTE(review): MILISEC_PER_HOUR is declared as 60L * 1000 * 1000 in this class and
        //MILISEC_PER_DAY is derived from it - confirm the constants
        long modTime = epochTime;
        if (timeUnit.equals("hour")) {
            modTime /= MILISEC_PER_HOUR;
        } else if (timeUnit.equals("day")) {
            modTime /= MILISEC_PER_DAY;
        } else {
            throw new IllegalArgumentException("invalid time unit");
        }
        return modTime;
    }

    /**
     * Calculates the dot product of two vectors.
     * @param thisVector first vector
     * @param thatVector second vector
     * @return sum of the products of corresponding elements
     * @throws IllegalArgumentException if the vectors differ in length
     */
    public static double dotProduct(double[] thisVector, double[] thatVector) {
        double product = 0;
        if (thisVector.length != thatVector.length) {
            throw new IllegalArgumentException("mismatched size for vector dot product");
        }

        for (int i = 0; i < thisVector.length; ++i) {
            product += thisVector[i] * thatVector[i];
        }
        return product;
    }

    /**
     * Sets the tuple pair grouping comparator and partitioner of SecondarySort on a job.
     * @param job job to configure
     */
    public static void setTuplePairSecondarySorting(Job job) {
        job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
        job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);
    }
}