/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.apache.pig.backend.hadoop.accumulo; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.SortedMap; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.PartialKey; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.user.WholeRowIterator; import org.apache.commons.cli.ParseException; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.log4j.Logger; import org.apache.pig.ResourceSchema.ResourceFieldSchema; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; /** * Basic PigStorage implementation that uses Accumulo as the backing store. 
*
 * <p>
 * When writing data, the first entry in the {@link Tuple} is treated as the row
 * in the Accumulo key, while subsequent entries in the tuple are handled as
 * columns in that row. {@link Map}s are expanded, placing the map key in the
 * column family and the map value in the Accumulo value. Scalars are placed
 * directly into the value with an empty column qualifier. If the columns
 * argument on the constructor is omitted, null or the empty String, no column
 * family is provided on the Keys created for Accumulo
 * </p>
 *
 * <p>
 * When reading data, if aggregateColfams is true, elements in the same row and
 * column family are aggregated into a single {@link Map}. This will result in a
 * {@link Tuple} of length (unique_column_families + 1) for the given row. If
 * aggregateColfams is false, column family and column qualifier are
 * concatenated (separated by a colon), and placed into a {@link Map}. This will
 * result in a {@link Tuple} with two entries, where the latter element has a
 * number of elements equal to the number of columns in the given row.
 * </p>
 */
public class AccumuloStorage extends AbstractAccumuloStorage {
    private static final Logger log = Logger.getLogger(AccumuloStorage.class);

    private static final String COLON = ":", EMPTY = "";
    private static final Text EMPTY_TEXT = new Text(new byte[0]);
    private static final DataByteArray EMPTY_DATA_BYTE_ARRAY = new DataByteArray(
            new byte[0]);

    // Scratch holders reused across addColumn() calls to avoid per-column
    // allocation. NOTE(review): not thread-safe if one instance is shared
    // across threads — original author left the same open question.
    final Text _cfHolder = new Text(), _cqHolder = new Text();

    /**
     * Creates an AccumuloStorage which writes all values in a {@link Tuple}
     * with an empty column family and doesn't group column families together on
     * read (creates one {@link Map} for all columns)
     */
    public AccumuloStorage() throws ParseException, IOException {
        this(EMPTY, EMPTY);
    }

    /**
     * Create an AccumuloStorage with a CSV of column-families to use on write,
     * with no extra arguments on read.
     *
     * @param columns
     *            A comma-separated list of column families to use when writing
     *            data, aligned to the n'th entry in the tuple
     */
    public AccumuloStorage(String columns) throws ParseException, IOException {
        this(columns, EMPTY);
    }

    /**
     * Create an AccumuloStorage with a CSV of column-families to use on write
     * and an argument string parsed by the superclass.
     *
     * @param columnStr
     *            A comma-separated list of column families
     * @param args
     *            Additional options forwarded to
     *            {@link AbstractAccumuloStorage}
     */
    public AccumuloStorage(String columnStr, String args)
            throws ParseException, IOException {
        super(columnStr, args);
    }

    /**
     * Converts one whole-row-encoded Accumulo entry into a Pig {@link Tuple}.
     * The first tuple element is the row id; each configured {@link Column}
     * fills one subsequent slot: a LITERAL yields a {@link DataByteArray}
     * (empty when the column is absent), while COLFAM_PREFIX/COLQUAL_PREFIX
     * yield a Map of "colfam[:colqual]" to value for matching entries.
     *
     * @param key
     *            Key produced by the {@link WholeRowIterator}
     * @param value
     *            Encoded row produced by the {@link WholeRowIterator}
     * @return A tuple of length (columns.size() + 1)
     * @throws IOException
     *             If the encoded row cannot be decoded
     */
    @Override
    protected Tuple getTuple(Key key, Value value) throws IOException {
        SortedMap<Key, Value> rowKVs = WholeRowIterator.decodeRow(key, value);
        Tuple tuple = TupleFactory.getInstance().newTuple(columns.size() + 1);

        final Text cfHolder = new Text();
        final Text cqHolder = new Text();
        final Text row = key.getRow();
        int tupleOffset = 0;

        tuple.set(
                tupleOffset,
                new DataByteArray(Text.decode(row.getBytes(), 0,
                        row.getLength())));

        for (Column column : this.columns) {
            tupleOffset++;

            switch (column.getType()) {
            case LITERAL:
                cfHolder.set(column.getColumnFamily());
                if (null != column.getColumnQualifier()) {
                    cqHolder.set(column.getColumnQualifier());
                } else {
                    cqHolder.set(EMPTY_TEXT);
                }

                // Get the key where our literal would exist (accounting for
                // "colf:colq" or "colf:" empty colq)
                Key literalStartKey = new Key(row, cfHolder, cqHolder);

                SortedMap<Key, Value> tailMap = rowKVs.tailMap(literalStartKey);

                // Find the element
                if (tailMap.isEmpty()) {
                    tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                } else {
                    Key actualKey = tailMap.firstKey();

                    // Only place it in the tuple if it matches the user
                    // request, avoid using a value from a key with the wrong
                    // colqual
                    if (0 == literalStartKey.compareTo(actualKey,
                            PartialKey.ROW_COLFAM_COLQUAL)) {
                        tuple.set(tupleOffset,
                                new DataByteArray(tailMap.get(actualKey).get()));
                    } else {
                        // This row doesn't have the column we were looking for
                        tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                    }
                }

                break;
            case COLFAM_PREFIX:
                cfHolder.set(column.getColumnFamily());
                Range colfamPrefixRange = Range.prefix(row, cfHolder);
                Key colfamPrefixStartKey = new Key(row, cfHolder);
                SortedMap<Key, Value> cfTailMap = rowKVs
                        .tailMap(colfamPrefixStartKey);

                // Find the element
                if (cfTailMap.isEmpty()) {
                    tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                } else {
                    HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                    // Build up a map for all the entries in this row that match
                    // the colfam prefix
                    for (Entry<Key, Value> entry : cfTailMap.entrySet()) {
                        if (colfamPrefixRange.contains(entry.getKey())) {
                            entry.getKey().getColumnFamily(cfHolder);
                            entry.getKey().getColumnQualifier(cqHolder);
                            DataByteArray val = new DataByteArray(entry
                                    .getValue().get());

                            // Avoid adding an extra ':' when colqual is empty
                            if (0 == cqHolder.getLength()) {
                                tupleMap.put(cfHolder.toString(), val);
                            } else {
                                tupleMap.put(cfHolder.toString() + COLON
                                        + cqHolder.toString(), val);
                            }
                        } else {
                            // tailMap is sorted; first non-matching key ends
                            // the prefix run
                            break;
                        }
                    }

                    // NOTE(review): when no entry matched the prefix, this
                    // slot is left null (not EMPTY_DATA_BYTE_ARRAY) —
                    // preserved as-is; confirm downstream handles null
                    if (!tupleMap.isEmpty()) {
                        tuple.set(tupleOffset, tupleMap);
                    }
                }

                break;
            case COLQUAL_PREFIX:
                cfHolder.set(column.getColumnFamily());
                cqHolder.set(column.getColumnQualifier());
                Range colqualPrefixRange = Range
                        .prefix(row, cfHolder, cqHolder);
                Key colqualPrefixStartKey = new Key(row, cfHolder, cqHolder);
                SortedMap<Key, Value> cqTailMap = rowKVs
                        .tailMap(colqualPrefixStartKey);

                if (cqTailMap.isEmpty()) {
                    tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                } else {
                    HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                    // Build up a map for all the entries in this row that match
                    // the colqual prefix
                    for (Entry<Key, Value> entry : cqTailMap.entrySet()) {
                        if (colqualPrefixRange.contains(entry.getKey())) {
                            entry.getKey().getColumnFamily(cfHolder);
                            entry.getKey().getColumnQualifier(cqHolder);
                            DataByteArray val = new DataByteArray(entry
                                    .getValue().get());

                            // Avoid the extra ':' on empty colqual
                            if (0 == cqHolder.getLength()) {
                                tupleMap.put(cfHolder.toString(), val);
                            } else {
                                tupleMap.put(cfHolder.toString() + COLON
                                        + cqHolder.toString(), val);
                            }
                        } else {
                            // Sorted tailMap: past the prefix, nothing else
                            // can match
                            break;
                        }
                    }

                    if (!tupleMap.isEmpty()) {
                        tuple.set(tupleOffset, tupleMap);
                    }
                }

                break;
            default:
                break;
            }
        }

        return tuple;
    }

    /**
     * Attaches the {@link WholeRowIterator} so that each mapper input pair is
     * one complete, encoded row (required by {@link #getTuple(Key, Value)}).
     *
     * @param job
     *            Job being configured for input
     */
    @Override
    protected void configureInputFormat(Job job) {
        AccumuloInputFormat.addIterator(job, new IteratorSetting(100,
                WholeRowIterator.class));
    }

    /**
     * Converts a Pig {@link Tuple} into a single Accumulo {@link Mutation}.
     * The first tuple element is the row; subsequent elements are paired
     * positionally with the configured {@link Column}s. LITERAL columns write
     * the scalar value; *_PREFIX columns expect a {@link Map} and write one
     * column per map entry with the map key appended to the configured
     * family/qualifier.
     *
     * @param tuple
     *            Tuple to serialize; tuples of size <= 1 are ignored
     * @return A singleton list with the mutation, or an empty list when
     *         nothing was written
     * @throws ExecException
     *             If a tuple element cannot be read
     * @throws IOException
     *             If a value cannot be serialized
     */
    @Override
    protected Collection<Mutation> getMutations(Tuple tuple)
            throws ExecException, IOException {
        final ResourceFieldSchema[] fieldSchemas = (schema == null) ? null
                : schema.getFields();

        Iterator<Object> tupleIter = tuple.iterator();

        if (1 >= tuple.size()) {
            log.debug("Ignoring tuple of size " + tuple.size());
            return Collections.emptyList();
        }

        Mutation mutation = new Mutation(objectToText(tupleIter.next(),
                (null == fieldSchemas) ? null : fieldSchemas[0]));

        int tupleOffset = 1;
        Iterator<Column> columnIter = columns.iterator();
        while (tupleIter.hasNext() && columnIter.hasNext()) {
            Object o = tupleIter.next();
            Column column = columnIter.next();

            // Grab the type for this field
            final byte type = schemaToType(o, (null == fieldSchemas) ? null
                    : fieldSchemas[tupleOffset]);

            switch (column.getType()) {
            case LITERAL:
                byte[] bytes = objToBytes(o, type);

                if (null != bytes) {
                    Value value = new Value(bytes);

                    // We don't have any column name from non-Maps
                    addColumn(mutation, column.getColumnFamily(),
                            column.getColumnQualifier(), value);
                }
                break;
            case COLFAM_PREFIX:
            case COLQUAL_PREFIX:
                Map<String, Object> map;
                try {
                    // Runtime-checked cast: a non-Map element throws CCE here
                    @SuppressWarnings("unchecked")
                    Map<String, Object> castMap = (Map<String, Object>) o;
                    map = castMap;
                } catch (ClassCastException e) {
                    log.error("Expected Map at tuple offset " + tupleOffset
                            + " but was " + o.getClass().getSimpleName());
                    throw e;
                }

                for (Entry<String, Object> entry : map.entrySet()) {
                    String key = entry.getKey();
                    Object objValue = entry.getValue();

                    byte valueType = DataType.findType(objValue);
                    byte[] mapValue = objToBytes(objValue, valueType);

                    // Fix: objToBytes may return null (e.g. null map value);
                    // previously this was passed straight to new Value(...)
                    // and NPE'd. Skip, consistent with the LITERAL branch.
                    if (null == mapValue) {
                        continue;
                    }

                    if (Column.Type.COLFAM_PREFIX == column.getType()) {
                        addColumn(mutation, column.getColumnFamily() + key,
                                null, new Value(mapValue));
                    } else if (Column.Type.COLQUAL_PREFIX == column.getType()) {
                        addColumn(mutation, column.getColumnFamily(),
                                column.getColumnQualifier() + key, new Value(
                                        mapValue));
                    } else {
                        throw new IOException("Unknown column type");
                    }
                }
                break;
            default:
                // Fix: was `continue`, which skipped the tupleOffset++ below
                // and misaligned fieldSchemas for every subsequent column
                log.info("Ignoring unhandled column type");
                break;
            }

            tupleOffset++;
        }

        if (0 == mutation.size()) {
            return Collections.emptyList();
        }

        return Collections.singletonList(mutation);
    }

    /**
     * Adds the given column family, column qualifier and value to the given
     * mutation. A null family or qualifier is written as empty bytes.
     *
     * @param mutation
     *            Mutation receiving the new column
     * @param colfam
     *            Column family, may be null
     * @param colqual
     *            Column qualifier, may be null
     * @param columnValue
     *            Value to store
     */
    protected void addColumn(Mutation mutation, String colfam, String colqual,
            Value columnValue) {
        if (null != colfam) {
            _cfHolder.set(colfam);
        } else {
            _cfHolder.clear();
        }

        if (null != colqual) {
            _cqHolder.set(colqual);
        } else {
            _cqHolder.clear();
        }

        mutation.put(_cfHolder, _cqHolder, columnValue);
    }
}