/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.ignite.igfs; import org.apache.ignite.internal.processors.cache.GridCacheDefaultAffinityKeyMapper; import org.apache.ignite.internal.processors.igfs.IgfsBaseBlockKey; import org.apache.ignite.internal.util.typedef.internal.A; import org.apache.ignite.internal.util.typedef.internal.S; import org.apache.ignite.lang.IgniteUuid; /** * {@code IGFS} class providing ability to group file's data blocks together on one node. * All blocks within the same group are guaranteed to be cached together on the same node. * Group size parameter controls how many sequential blocks will be cached together on the same node. * <p> * For example, if block size is {@code 64kb} and group size is {@code 256}, then each group will contain * {@code 64kb * 256 = 16Mb}. Larger group sizes would reduce number of splits required to run map-reduce * tasks, but will increase inequality of data size being stored on different nodes. * <p> * Note that {@link #getGroupSize()} parameter must correlate to Hadoop split size parameter defined * in Hadoop via {@code mapred.max.split.size} property. Ideally you want all blocks accessed * within one split to be mapped to {@code 1} group, so they can be located on the same grid node. * For example, default Hadoop split size is {@code 64mb} and default {@code IGFS} block size * is {@code 64kb}. This means that to make sure that each split goes only through blocks on * the same node (without hopping between nodes over network), we have to make the {@link #getGroupSize()} * value be equal to {@code 64mb / 64kb = 1024}. * <p> * It is required for {@code IGFS} data cache to be configured with this mapper. Here is an * example of how it can be specified in XML configuration: * <pre name="code" class="xml"> * <bean id="cacheCfgBase" class="org.apache.ignite.cache.CacheConfiguration" abstract="true"> * ... * <property name="affinityMapper"> * <bean class="org.apache.ignite.igfs.IgfsGroupDataBlocksKeyMapper"> * <!-- How many sequential blocks will be stored on the same node. --> * <property name="groupSize" value="512"/> * </bean> * </property> * ... * </bean> * </pre> */ public class IgfsGroupDataBlocksKeyMapper extends GridCacheDefaultAffinityKeyMapper { /** */ private static final long serialVersionUID = 0L; /** Default group size.*/ public static final int DFLT_GRP_SIZE = 1024; /** Size of the group. */ private int grpSize = DFLT_GRP_SIZE; /** * Default constructor. */ public IgfsGroupDataBlocksKeyMapper() { // No-op. } /*** * Constructs affinity mapper to group several data blocks with the same key. * * @param grpSize Size of the group in blocks. */ public IgfsGroupDataBlocksKeyMapper(int grpSize) { A.ensure(grpSize >= 1, "grpSize >= 1"); this.grpSize = grpSize; } /** {@inheritDoc} */ @Override public Object affinityKey(Object key) { if (key instanceof IgfsBaseBlockKey) { IgfsBaseBlockKey blockKey = (IgfsBaseBlockKey)key; IgniteUuid affKey = blockKey.affinityKey(); if (affKey != null) return affKey; long grpId = blockKey.blockId() / grpSize; return blockKey.fileHash() + (int)(grpId ^ (grpId >>> 32)); } return super.affinityKey(key); } /** * Get group size. * <p> * Group size defines how many sequential file blocks will reside on the same node. This parameter * must correlate to Hadoop split size parameter defined in Hadoop via {@code mapred.max.split.size} * property. Ideally you want all blocks accessed within one split to be mapped to {@code 1} group, * so they can be located on the same grid node. For example, default Hadoop split size is {@code 64mb} * and default {@code IGFS} block size is {@code 64kb}. This means that to make sure that each split * goes only through blocks on the same node (without hopping between nodes over network), we have to * make the group size be equal to {@code 64mb / 64kb = 1024}. * <p> * Defaults to {@link #DFLT_GRP_SIZE}. * * @return Group size. */ public int getGroupSize() { return grpSize; } /** * Set group size. See {@link #getGroupSize()} for more information. * * @param grpSize Group size. * @return {@code this} for chaining. */ public IgfsGroupDataBlocksKeyMapper setGroupSize(int grpSize) { this.grpSize = grpSize; return this; } /** {@inheritDoc} */ @Override public String toString() { return S.toString(IgfsGroupDataBlocksKeyMapper.class, this); } }