/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kylin.engine.mr.steps;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.ByteArray;
import org.apache.kylin.common.util.BytesUtil;
import org.apache.kylin.common.util.Dictionary;
import org.apache.kylin.common.util.Pair;
import org.apache.kylin.common.util.SplittedBytes;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;
import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.common.RowKeySplitter;
import org.apache.kylin.cube.cuboid.Cuboid;
import org.apache.kylin.cube.kv.RowConstants;
import org.apache.kylin.cube.kv.RowKeyEncoder;
import org.apache.kylin.cube.kv.RowKeyEncoderProvider;
import org.apache.kylin.cube.model.CubeDesc;
import org.apache.kylin.dict.DictionaryManager;
import org.apache.kylin.engine.mr.IMROutput2;
import org.apache.kylin.engine.mr.KylinMapper;
import org.apache.kylin.engine.mr.MRUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.measure.BufferedMeasureCodec;
import org.apache.kylin.measure.MeasureIngester;
import org.apache.kylin.measure.MeasureType;
import org.apache.kylin.metadata.model.MeasureDesc;
import org.apache.kylin.metadata.model.TableRef;
import org.apache.kylin.metadata.model.TblColRef;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Mapper used when merging cube segments: reads pre-aggregated cuboid rows
 * from one source segment, translates dictionary-encoded dimension values
 * (and dictionary-backed measure values) into the merged segment's
 * dictionaries, and emits the rows re-keyed for the merged segment.
 *
 * @author ysong1, honma
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public class MergeCuboidMapper extends KylinMapper<Text, Text, Text, Text> {
private KylinConfig config;
private String cubeName;
private String segmentID;
private CubeManager cubeManager;
private CubeInstance cube;
private CubeDesc cubeDesc;
private CubeSegment mergedCubeSegment;
private CubeSegment sourceCubeSegment; // each mapper reads a single file split, so there is exactly one source segment per mapper's life cycle
private Text outputKey = new Text();
private byte[] newKeyBodyBuf;
private ByteArray newKeyBuf;
private RowKeySplitter rowKeySplitter;
private RowKeyEncoderProvider rowKeyEncoderProvider;
private HashMap<TblColRef, Boolean> dimensionsNeedDict = new HashMap<>();
// for re-encoding measures that use dictionaries
private List<Pair<Integer, MeasureIngester>> dictMeasures;
private Map<TblColRef, Dictionary<String>> oldDicts;
private Map<TblColRef, Dictionary<String>> newDicts;
private List<MeasureDesc> measureDescs;
private BufferedMeasureCodec codec;
private Object[] measureObjs;
private Text outputValue;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.bindCurrentConfiguration(context.getConfiguration());
cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase();
segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);
config = AbstractHadoopJob.loadKylinPropsAndMetadata();
cubeManager = CubeManager.getInstance(config);
cube = cubeManager.getCube(cubeName);
cubeDesc = cube.getDescriptor();
mergedCubeSegment = cube.getSegmentById(segmentID);
newKeyBodyBuf = new byte[RowConstants.ROWKEY_BUFFER_SIZE]; // grown on demand in doMap
newKeyBuf = ByteArray.allocate(RowConstants.ROWKEY_BUFFER_SIZE);
// decide which source segment this mapper reads, based on the input split
FileSplit fileSplit = (FileSplit) context.getInputSplit();
IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(mergedCubeSegment).getOuputFormat();
sourceCubeSegment = outputFormat.findSourceSegment(fileSplit, cube);
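// split buffers: up to 65 parts (key header plus rowkey columns), at most 255 bytes each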
rowKeySplitter = new RowKeySplitter(sourceCubeSegment, 65, 255);
rowKeyEncoderProvider = new RowKeyEncoderProvider(mergedCubeSegment);
measureDescs = cubeDesc.getMeasures();
codec = new BufferedMeasureCodec(measureDescs);
measureObjs = new Object[measureDescs.size()];
outputValue = new Text();
dictMeasures = Lists.newArrayList();
oldDicts = Maps.newHashMap();
newDicts = Maps.newHashMap();
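// collect measures whose dictionaries differ between the source and merged segment; their values are re-encoded in doMap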
for (int i = 0; i < measureDescs.size(); i++) {
MeasureDesc measureDesc = measureDescs.get(i);
MeasureType measureType = measureDesc.getFunction().getMeasureType();
List<TblColRef> columns = measureType.getColumnsNeedDictionary(measureDesc.getFunction());
boolean needReEncode = false;
for (TblColRef col : columns) {
// skip columns with no source dictionary (all records were null in the source segment)
if (sourceCubeSegment.getDictionary(col) == null) {
continue;
}
if (!sourceCubeSegment.getDictionary(col).equals(mergedCubeSegment.getDictionary(col))) {
oldDicts.put(col, sourceCubeSegment.getDictionary(col));
newDicts.put(col, mergedCubeSegment.getDictionary(col));
needReEncode = true;
}
}
if (needReEncode) {
dictMeasures.add(Pair.newPair(i, measureType.newIngester()));
}
}
}
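
/**
 * Re-encodes one cuboid row from the source segment into the merged
 * segment's encoding and writes it out.
 */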
@Override
public void doMap(Text key, Text value, Context context) throws IOException, InterruptedException {
long cuboidID = rowKeySplitter.split(key.getBytes());
Cuboid cuboid = Cuboid.findById(cubeDesc, cuboidID);
RowKeyEncoder rowkeyEncoder = rowKeyEncoderProvider.getRowkeyEncoder(cuboid);
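// split buffers hold the key header first (shard id if sharding is enabled, then the cuboid id), followed by one buffer per dimension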
SplittedBytes[] splittedByteses = rowKeySplitter.getSplitBuffers();
int bufOffset = 0;
int bodySplitOffset = rowKeySplitter.getBodySplitOffset();
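// rewrite each dimension: dictionary-encoded fact-table columns get their ids translated to the merged dictionary; other columns are copied as-is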
for (int i = 0; i < cuboid.getColumns().size(); ++i) {
int useSplit = i + bodySplitOffset;
TblColRef col = cuboid.getColumns().get(i);
if (this.checkNeedMerging(col)) {
// dictionary-encoded fact-table column: its ids must be rewritten against the merged dictionary
DictionaryManager dictMgr = DictionaryManager.getInstance(config);
Dictionary<String> mergedDict = dictMgr.getDictionary(mergedCubeSegment.getDictResPath(col));
Dictionary<String> sourceDict;
// no source dictionary for this column (all records were null): write the merged dictionary's null id
if (sourceCubeSegment.getDictionary(col) == null) {
BytesUtil.writeUnsigned(mergedDict.nullId(), newKeyBodyBuf, bufOffset, mergedDict.getSizeOfId());
bufOffset += mergedDict.getSizeOfId();
continue;
} else {
sourceDict = dictMgr.getDictionary(sourceCubeSegment.getDictResPath(col));
}
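// grow the key body buffer until the largest possible encoded value or id fits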
while (sourceDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || //
mergedDict.getSizeOfValue() > newKeyBodyBuf.length - bufOffset || //
mergedDict.getSizeOfId() > newKeyBodyBuf.length - bufOffset) {
byte[] oldBuf = newKeyBodyBuf;
newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length];
System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length);
}
int idInSourceDict = BytesUtil.readUnsigned(splittedByteses[useSplit].value, 0, splittedByteses[useSplit].length);
int idInMergedDict;
// translate: decode the value via the source dictionary, then look up its id in the merged dictionary
String v = sourceDict.getValueFromId(idInSourceDict);
if (v == null) {
idInMergedDict = mergedDict.nullId();
} else {
idInMergedDict = mergedDict.getIdFromValue(v);
}
BytesUtil.writeUnsigned(idInMergedDict, newKeyBodyBuf, bufOffset, mergedDict.getSizeOfId());
bufOffset += mergedDict.getSizeOfId();
} else {
// no per-segment dictionary involved: copy the encoded bytes unchanged
while (splittedByteses[useSplit].length > newKeyBodyBuf.length - bufOffset) {
byte[] oldBuf = newKeyBodyBuf;
newKeyBodyBuf = new byte[2 * newKeyBodyBuf.length];
System.arraycopy(oldBuf, 0, newKeyBodyBuf, 0, oldBuf.length);
}
System.arraycopy(splittedByteses[useSplit].value, 0, newKeyBodyBuf, bufOffset, splittedByteses[useSplit].length);
bufOffset += splittedByteses[useSplit].length;
}
}
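// assemble the full rowkey: the encoder writes the header and copies the re-encoded body from newKeyBodyBuf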
int fullKeySize = rowkeyEncoder.getBytesLength();
while (newKeyBuf.array().length < fullKeySize) {
newKeyBuf.set(new byte[newKeyBuf.length() * 2]);
}
newKeyBuf.set(0, fullKeySize);
rowkeyEncoder.encode(new ByteArray(newKeyBodyBuf, 0, bufOffset), newKeyBuf);
outputKey.set(newKeyBuf.array(), 0, fullKeySize);
// re-encode measures that embed dictionary ids, using the merged segment's dictionaries
if (dictMeasures.size() > 0) {
codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs);
for (Pair<Integer, MeasureIngester> pair : dictMeasures) {
int i = pair.getFirst();
MeasureIngester ingester = pair.getSecond();
measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts, newDicts);
}
ByteBuffer valueBuf = codec.encode(measureObjs);
outputValue.set(valueBuf.array(), 0, valueBuf.position());
value = outputValue;
}
context.write(outputKey, value);
}
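
/**
 * A dimension needs id translation iff it is dictionary-encoded in the rowkey
 * and its dictionary is built from the fact table, i.e. per segment, so the
 * same value may map to different ids across segments. Results are cached in
 * dimensionsNeedDict.
 */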
private Boolean checkNeedMerging(TblColRef col) throws IOException {
Boolean ret = dimensionsNeedDict.get(col);
if (ret != null)
return ret;
ret = cubeDesc.getRowkey().isUseDictionary(col);
if (ret) {
TableRef srcTable = DictionaryManager.getInstance(config).decideSourceData(cubeDesc.getModel(), col).getTableRef();
ret = cubeDesc.getModel().isFactTable(srcTable);
}
dimensionsNeedDict.put(col, ret);
return ret;
}
}