/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.block;

import com.linkedin.cubert.io.BlockSerializationType;
import com.linkedin.cubert.io.rubix.RubixFile;
import com.linkedin.cubert.io.rubix.RubixFile.KeyData;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.pig.data.Tuple;

/**
 * Represents the index for a relation stored in the rubix format.
 *
 * @author Maneesh Varshney
 */
public class Index implements Serializable
{
    private static final long serialVersionUID = -7149581368638305803L;

    // Index entries of each reducer (hash partition), sorted by key.
    private final Map<Integer, List<IndexEntry>> entryMap =
            new HashMap<Integer, List<IndexEntry>>();
    private int numHashPartitions;
    private BlockSerializationType serializationType;

    // Reusable probe entry for binary search in getBlockId(); created lazily and not serialized.
    private transient IndexEntry cachedEntry;
    // Lazily built map from block id to its index entry; not serialized.
    private transient Map<Long, IndexEntry> blockIdMap;

    public static Index extractFromRelation(Configuration conf, String rootdir) throws IOException,
            InstantiationException,
            IllegalAccessException,
            ClassNotFoundException
    {
        Index index = new Index();

        FileStatus[] allFiles =
                RubixFile.getRubixFiles(new Path(rootdir), FileSystem.get(conf));

        boolean first = true;
        for (FileStatus status : allFiles)
        {
            Path path = status.getPath();
            RubixFile<Tuple, Void> rubixFile = new RubixFile<Tuple, Void>(conf, path);
            final List<KeyData<Tuple>> keyDataList = rubixFile.getKeyData();

            if (first)
            {
                index.serializationType = rubixFile.getBlockSerializationType();
                first = false;
            }

            if (keyDataList == null || keyDataList.size() == 0)
            {
                // create an index entry with length = 0. how do we get
                // the hash partitionId?
                continue;
            }

            for (KeyData<Tuple> keyData : keyDataList)
            {
                final int reducerId = keyData.getReducerId();
                List<IndexEntry> entries = index.entryMap.get(reducerId);
                if (entries == null)
                {
                    entries = new ArrayList<IndexEntry>();
                    index.entryMap.put(reducerId, entries);
                }
                entries.add(new IndexEntry(path.toString(),
                                           keyData.getKey(),
                                           keyData.getOffset(),
                                           keyData.getLength(),
                                           keyData.getBlockId(),
                                           keyData.getNumRecords()));
            }
        }

        index.numHashPartitions = index.entryMap.size();

        // Sort the entries of each partition by key so that getBlockId() can binary search.
        for (Integer i : index.entryMap.keySet())
            Collections.sort(index.entryMap.get(i));

        return index;
    }

    public BlockSerializationType getSerializationType()
    {
        return serializationType;
    }

    /**
     * Returns the id of the block, within the key's hash partition, that covers the given key:
     * the entry with the largest start key not greater than the key, or the first entry if the
     * key precedes all start keys.
     */
    public long getBlockId(Tuple key)
    {
        final int reducerId = getReducerId(key);

        List<IndexEntry> list = entryMap.get(reducerId);
        if (list == null)
            throw new RuntimeException("Cannot find blockid for " + key);

        if (cachedEntry == null)
        {
            cachedEntry = new IndexEntry(null, null, 0, 0, -1, -1);
        }
        cachedEntry.setKey(key);

        int idx = Collections.binarySearch(list, cachedEntry);
        if (idx >= 0)
        {
            return list.get(idx).getBlockId();
        }
        else
        {
            // if idx is negative, it is (-insertionPoint - 1); the covering entry is the one
            // just before the insertion point, hence -idx - 2.
            int insertPoint = -idx - 2;
            if (insertPoint == -1)
                insertPoint = 0;
            return list.get(insertPoint).getBlockId();
        }
    }

    public IndexEntry getEntry(long blockId)
    {
        if (blockIdMap == null)
            buildBlockIdMap();

        return blockIdMap.get(blockId);
    }

    public Set<Long> getAllBlockIds()
    {
        if (blockIdMap == null)
            buildBlockIdMap();

        return blockIdMap.keySet();
    }

    public IndexEntry getNextEntry(long blockId)
    {
        for (List<IndexEntry> list : entryMap.values())
        {
            for (int i = 0; i < list.size(); i++)
            {
                IndexEntry entry = list.get(i);
                if (blockId == entry.getBlockId())
                {
                    return i + 1 >= list.size() ? null : list.get(i + 1);
                }
            }
        }
        throw new RuntimeException("Couldn't locate block ID in Index entries.");
    }

    public int getReducerId(Tuple key)
    {
        long hashcode = BlockUtils.getBlockId(key);
        return (int) (hashcode % numHashPartitions);
    }

    private void buildBlockIdMap()
    {
        blockIdMap = new HashMap<Long, IndexEntry>();
        for (List<IndexEntry> list : entryMap.values())
        {
            for (IndexEntry entry : list)
            {
                blockIdMap.put(entry.getBlockId(), entry);
            }
        }
    }

    @Override
    public String toString()
    {
        int sum = 0;
        for (List<IndexEntry> entries : entryMap.values())
        {
            sum += entries.size();
        }
        return String.format("Index [entries=%d, numHashPartitions=%s, mapEntries:%s]",
                             sum,
                             numHashPartitions,
                             entryMap.toString());
    }

    public void print()
    {
        int sum = 0;
        for (List<IndexEntry> entries : entryMap.values())
        {
            sum += entries.size();
        }
        System.out.format("Index [entries=%d, numHashPartitions=%s]\n", sum, numHashPartitions);
        for (Integer key : entryMap.keySet())
        {
            System.out.println("Key: " + key);
            for (IndexEntry e : entryMap.get(key))
            {
                System.out.println("\t" + e);
            }
        }
    }

    /**
     * Distributes the blocks across reducers in a round-robin fashion.
     *
     * The algorithm is as follows:
     * Step 1: Flatten the blocks of every reducer bucket into a single list.
     * Step 2: Sort the list by block ID.
     * Step 3: The ith element in the sorted list goes to the (i % nReducers)th reducer.
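     *
     * For example, with sorted block IDs {3, 7, 12, 20} and nReducers = 2, the resulting
     * assignment would be 3 -> 0, 7 -> 1, 12 -> 0, 20 -> 1.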
     *
     * @param nReducers the configured number of reducers
     * @return a map from block ID to the reducer to which that block should be redirected
     */
    public Map<Long, Integer> getBlockIdPartitionMap(int nReducers)
    {
        Map<Long, Integer> blockIdReducerMap = new HashMap<Long, Integer>();

        List<Long> blockIds = new ArrayList<Long>();
        for (List<IndexEntry> indexEntries : entryMap.values())
        {
            for (IndexEntry indexEntry : indexEntries)
            {
                blockIds.add(indexEntry.getBlockId());
            }
        }

        Collections.sort(blockIds);

        for (int i = 0; i < blockIds.size(); ++i)
        {
            blockIdReducerMap.put(blockIds.get(i), i % nReducers);
        }

        return blockIdReducerMap;
    }
}
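
/*
 * Minimal usage sketch (illustrative only): how a caller might load this index and look up the
 * block covering a key. "rootDir" and "partitionKey" are hypothetical placeholders, and the
 * relation at rootDir is assumed to already exist in rubix format.
 *
 *   Configuration conf = new Configuration();
 *   Index index = Index.extractFromRelation(conf, rootDir);
 *
 *   long blockId = index.getBlockId(partitionKey); // partitionKey is a Tuple of partition keys
 *   IndexEntry entry = index.getEntry(blockId);    // holds file path, offset, length, etc.
 */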