package org.apache.blur.mapreduce.lib; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeSet; import org.apache.blur.mapreduce.lib.BlurMutate.MUTATE_TYPE; import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import com.google.common.base.Splitter; /** * This will parse a standard csv file into a {@link BlurMutate} object. Use the * static addColumns, and setSeparator methods to configure the class. */ public class CsvBlurMapper extends BaseBlurMapper<Writable, Text> { public static final String UTF_8 = "UTF-8"; public static final String BLUR_CSV_AUTO_GENERATE_RECORD_ID_AS_HASH_OF_DATA = "blur.csv.auto.generate.record.id.as.hash.of.data"; public static final String BLUR_CSV_AUTO_GENERATE_ROW_ID_AS_HASH_OF_DATA = "blur.csv.auto.generate.row.id.as.hash.of.data"; public static final String BLUR_CSV_FAMILY_PATH_MAPPINGS_FAMILIES = "blur.csv.family.path.mappings.families"; public static final String BLUR_CSV_FAMILY_PATH_MAPPINGS_FAMILY_PREFIX = "blur.csv.family.path.mappings.family."; public static final String BLUR_CSV_SEPARATOR_BASE64 = "blur.csv.separator.base64"; public static final String BLUR_CSV_FAMILY_COLUMN_PREFIX = "blur.csv.family."; public static final String BLUR_CSV_FAMILIES = "blur.csv.families"; public static final String HIVE_NULL = "\\N"; protected Map<String, List<String>> _columnNameMap; protected String _separator = Base64.encodeBase64String(",".getBytes()); protected Splitter _splitter; protected boolean _familyNotInFile; protected String _familyFromPath; protected boolean _autoGenerateRecordIdAsHashOfData; protected MessageDigest _digest; protected boolean _autoGenerateRowIdAsHashOfData; /** * Add a mapping for a family to a path. This is to be used when an entire * path is to be processed as a single family and the data itself does not * contain the family.<br/> * <br/> * * NOTE: the familyNotInFile property must be set before this method can be * called. * * @param job * the job to setup. * @param family * the family. * @param path * the path. */ public static void addFamilyPath(Job job, String family, Path path) { addFamilyPath(job.getConfiguration(), family, path); } /** * Add a mapping for a family to a path. This is to be used when an entire * path is to be processed as a single family and the data itself does not * contain the family.<br/> * <br/> * * NOTE: the familyNotInFile property must be set before this method can be * called. * * @param configuration * the configuration to setup. * @param family * the family. * @param path * the path. */ public static void addFamilyPath(Configuration configuration, String family, Path path) { append(configuration, BLUR_CSV_FAMILY_PATH_MAPPINGS_FAMILIES, family); append(configuration, BLUR_CSV_FAMILY_PATH_MAPPINGS_FAMILY_PREFIX + family, path.toString()); } protected static void append(Configuration configuration, String name, String value) { Collection<String> set = configuration.getStringCollection(name); if (set == null) { set = new TreeSet<String>(); } set.add(value); configuration.setStrings(name, set.toArray(new String[set.size()])); } /** * If set to true the record id will be automatically generated as a hash of * the data that the record contains. * * @param job * the job to setup. * @param autoGenerateRecordIdAsHashOfData * boolean. */ public static void setAutoGenerateRecordIdAsHashOfData(Job job, boolean autoGenerateRecordIdAsHashOfData) { setAutoGenerateRecordIdAsHashOfData(job.getConfiguration(), autoGenerateRecordIdAsHashOfData); } /** * If set to true the record id will be automatically generated as a hash of * the data that the record contains. * * @param configuration * the configuration to setup. * @param autoGenerateRecordIdAsHashOfData * boolean. */ public static void setAutoGenerateRecordIdAsHashOfData(Configuration configuration, boolean autoGenerateRecordIdAsHashOfData) { configuration.setBoolean(BLUR_CSV_AUTO_GENERATE_RECORD_ID_AS_HASH_OF_DATA, autoGenerateRecordIdAsHashOfData); } /** * Gets whether or not to generate a recordid for the record based on the * data. * * @param configuration * the configuration. * @return boolean. */ public static boolean isAutoGenerateRecordIdAsHashOfData(Configuration configuration) { return configuration.getBoolean(BLUR_CSV_AUTO_GENERATE_RECORD_ID_AS_HASH_OF_DATA, false); } /** * If set to true the record id will be automatically generated as a hash of * the data that the record contains. * * @param job * the job to setup. * @param autoGenerateRecordIdAsHashOfData * boolean. */ public static void setAutoGenerateRowIdAsHashOfData(Job job, boolean autoGenerateRowIdAsHashOfData) { setAutoGenerateRowIdAsHashOfData(job.getConfiguration(), autoGenerateRowIdAsHashOfData); } /** * If set to true the record id will be automatically generated as a hash of * the data that the record contains. * * @param configuration * the configuration to setup. * @param autoGenerateRecordIdAsHashOfData * boolean. */ public static void setAutoGenerateRowIdAsHashOfData(Configuration configuration, boolean autoGenerateRowIdAsHashOfData) { configuration.setBoolean(BLUR_CSV_AUTO_GENERATE_ROW_ID_AS_HASH_OF_DATA, autoGenerateRowIdAsHashOfData); } /** * Gets whether or not to generate a recordid for the record based on the * data. * * @param configuration * the configuration. * @return boolean. */ public static boolean isAutoGenerateRowIdAsHashOfData(Configuration configuration) { return configuration.getBoolean(BLUR_CSV_AUTO_GENERATE_ROW_ID_AS_HASH_OF_DATA, false); } /** * Sets all the family and column definitions. * * @param job * the job to setup. * @param strDefinition * the string definition. <br/> * <br/> * Example:<br/> * "cf1:col1,col2,col3|cf2:col1,col2,col3"<br/> * Where "cf1" is a family name that contains columns "col1", "col2" * and "col3" and a second family of "cf2" with columns "col1", * "col2", and "col3". */ public static void setColumns(Job job, String strDefinition) { setColumns(job.getConfiguration(), strDefinition); } /** * Sets all the family and column definitions. * * @param configuration * the configuration to setup. * @param strDefinition * the string definition. <br/> * <br/> * Example:<br/> * "cf1:col1,col2,col3|cf2:col1,col2,col3"<br/> * Where "cf1" is a family name that contains columns "col1", "col2" * and "col3" and a second family of "cf2" with columns "col1", * "col2", and "col3". */ public static void setColumns(Configuration configuration, String strDefinition) { Iterable<String> familyDefs = Splitter.on('|').split(strDefinition); for (String familyDef : familyDefs) { int indexOf = familyDef.indexOf(':'); if (indexOf < 0) { throwMalformedDefinition(strDefinition); } String family = familyDef.substring(0, indexOf); Iterable<String> cols = Splitter.on(',').split(familyDef.substring(indexOf + 1)); List<String> colnames = new ArrayList<String>(); for (String columnName : cols) { colnames.add(columnName); } if (family.trim().isEmpty() || colnames.isEmpty()) { throwMalformedDefinition(strDefinition); } addColumns(configuration, family, colnames.toArray(new String[colnames.size()])); } } protected static void throwMalformedDefinition(String strDefinition) { throw new RuntimeException("Family and column definition string not valid [" + strDefinition + "] should look like \"family1:colname1,colname2|family2:colname1,colname2,colname3\""); } /** * Adds the column layout for the given family. * * @param job * the job to apply the layout. * @param family * the family name. * @param columns * the column names. */ public static void addColumns(Job job, String family, String... columns) { addColumns(job.getConfiguration(), family, columns); } /** * Adds the column layout for the given family. * * @param configuration * the configuration to apply the layout. * @param family * the family name. * @param columns * the column names. */ public static void addColumns(Configuration configuration, String family, String... columns) { Collection<String> families = new TreeSet<String>(configuration.getStringCollection(BLUR_CSV_FAMILIES)); families.add(family); configuration.setStrings(BLUR_CSV_FAMILIES, families.toArray(new String[] {})); configuration.setStrings(BLUR_CSV_FAMILY_COLUMN_PREFIX + family, columns); } public static Collection<String> getFamilyNames(Configuration configuration) { return configuration.getStringCollection(BLUR_CSV_FAMILIES); } public static Map<String, List<String>> getFamilyAndColumnNameMap(Configuration configuration) { Map<String, List<String>> columnNameMap = new HashMap<String, List<String>>(); for (String family : getFamilyNames(configuration)) { String[] columnsNames = configuration.getStrings(BLUR_CSV_FAMILY_COLUMN_PREFIX + family); columnNameMap.put(family, Arrays.asList(columnsNames)); } return columnNameMap; } /** * Sets the separator of the file, by default it is ",". * * @param job * the job to apply the separator change. * @param separator * the separator. */ public static void setSeparator(Job job, String separator) { setSeparator(job.getConfiguration(), separator); } /** * Sets the separator of the file, by default it is ",". * * @param configuration * the configuration to apply the separator change. * @param separator * the separator. */ public static void setSeparator(Configuration configuration, String separator) { try { configuration.set(BLUR_CSV_SEPARATOR_BASE64, Base64.encodeBase64String(separator.getBytes(UTF_8))); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration configuration = context.getConfiguration(); _autoGenerateRecordIdAsHashOfData = isAutoGenerateRecordIdAsHashOfData(configuration); _autoGenerateRowIdAsHashOfData = isAutoGenerateRowIdAsHashOfData(configuration); if (_autoGenerateRecordIdAsHashOfData || _autoGenerateRowIdAsHashOfData) { try { _digest = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { throw new IOException(e); } } _columnNameMap = getFamilyAndColumnNameMap(configuration); _separator = new String(Base64.decodeBase64(configuration.get(BLUR_CSV_SEPARATOR_BASE64, _separator)), UTF_8); _splitter = Splitter.on(_separator); Path fileCurrentlyProcessing = getCurrentFile(context); Collection<String> families = configuration.getStringCollection(BLUR_CSV_FAMILY_PATH_MAPPINGS_FAMILIES); OUTER: for (String family : families) { Collection<String> pathStrCollection = configuration .getStringCollection(BLUR_CSV_FAMILY_PATH_MAPPINGS_FAMILY_PREFIX + family); for (String pathStr : pathStrCollection) { Path path = new Path(pathStr); FileSystem fileSystem = path.getFileSystem(configuration); path = path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory()); if (isParent(path, fileCurrentlyProcessing)) { _familyFromPath = family; _familyNotInFile = true; break OUTER; } } } } protected boolean isParent(Path possibleParent, Path child) { if (child == null) { return false; } if (possibleParent.equals(child.getParent())) { return true; } return isParent(possibleParent, child.getParent()); } protected Path getCurrentFile(Context context) throws IOException { InputSplit split = context.getInputSplit(); if (split != null && split instanceof FileSplit) { FileSplit inputSplit = (FileSplit) split; Path path = inputSplit.getPath(); FileSystem fileSystem = path.getFileSystem(context.getConfiguration()); return path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory()); } return null; } @Override protected void map(Writable k, Text value, Context context) throws IOException, InterruptedException { BlurRecord record = _mutate.getRecord(); record.clearColumns(); String str = value.toString(); Iterable<String> split = _splitter.split(str); List<String> list = toList(split); int offset = 0; boolean gen = false; if (!_autoGenerateRowIdAsHashOfData) { record.setRowId(list.get(offset++)); } else { _digest.reset(); byte[] bs = value.getBytes(); int length = value.getLength(); _digest.update(bs, 0, length); record.setRowId(new BigInteger(_digest.digest()).toString(Character.MAX_RADIX)); gen = true; } if (!_autoGenerateRecordIdAsHashOfData) { record.setRecordId(list.get(offset++)); } else { if (gen) { record.setRecordId(record.getRowId()); } else { _digest.reset(); byte[] bs = value.getBytes(); int length = value.getLength(); _digest.update(bs, 0, length); record.setRecordId(new BigInteger(_digest.digest()).toString(Character.MAX_RADIX)); } } String family; if (_familyNotInFile) { family = _familyFromPath; } else { family = list.get(offset++); } record.setFamily(family); List<String> columnNames = _columnNameMap.get(family); if (columnNames == null) { throw new IOException("Family [" + family + "] is missing in the definition."); } if (list.size() - offset != columnNames.size()) { String options = ""; if (!_autoGenerateRowIdAsHashOfData) { options += "rowid,"; } if (!_autoGenerateRecordIdAsHashOfData) { options += "recordid,"; } if (!_familyNotInFile) { options += "family,"; } String msg = "Record [" + str + "] does not match defined record [" + options + getColumnNames(columnNames) + "]."; throw new IOException(msg); } for (int i = 0; i < columnNames.size(); i++) { String val = handleHiveNulls(list.get(i + offset)); if (val != null) { record.addColumn(columnNames.get(i), val); _columnCounter.increment(1); } } _key.set(record.getRowId()); _mutate.setMutateType(MUTATE_TYPE.REPLACE); context.write(_key, _mutate); _recordCounter.increment(1); context.progress(); } protected String handleHiveNulls(String value) { if (value.equals(HIVE_NULL)) { return null; } return value; } public void setFamilyFromPath(String familyFromPath) { this._familyFromPath = familyFromPath; } protected String getColumnNames(List<String> columnNames) { StringBuilder builder = new StringBuilder(); for (String c : columnNames) { if (builder.length() != 0) { builder.append(','); } builder.append(c); } return builder.toString(); } protected List<String> toList(Iterable<String> split) { List<String> lst = new ArrayList<String>(); for (String s : split) { lst.add(s); } return lst; } }