/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.aliyun.odps.mapred.bridge;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.aliyun.odps.Column;
import com.aliyun.odps.OdpsType;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.data.VolumeInfo;
import com.aliyun.odps.mapred.bridge.utils.TypeUtils;
import com.aliyun.odps.mapred.conf.BridgeJobConf;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.pipeline.Pipeline;
import com.aliyun.odps.pipeline.Pipeline.TransformNode;
import apsara.odps.ExpressionProtos.Constant;
import apsara.odps.ExpressionProtos.Reference;
import apsara.odps.ExpressionProtos.Null;
import apsara.odps.LanguageProtos.Language;
import apsara.odps.OrderProtos.Order;
import apsara.odps.PartitionSpecProtos.PartitionSpec;
import apsara.odps.TypesProtos;
import apsara.odps.lot.DataSinkProtos.DataSink;
import apsara.odps.lot.DataSourceProtos.DataSource;
import apsara.odps.lot.DistributeByProtos.DistributeBy;
import apsara.odps.lot.ExpressionProtos.ScalarExpression;
import apsara.odps.lot.ExpressionProtos.ScalarFunction;
import apsara.odps.lot.FakeSinkProtos;
import apsara.odps.lot.FilterProtos.Filter;
import apsara.odps.lot.LanguageSourceProtos.LanguageSource;
import apsara.odps.lot.LanguageTransformProtos.LanguageTransform;
import apsara.odps.lot.Lot.LogicalOperator;
import apsara.odps.lot.Lot.LogicalOperatorTree;
import apsara.odps.lot.Lottask.LotTask;
import apsara.odps.lot.SchemaProtos.Schema;
import apsara.odps.lot.SelectProtos.Select;
import apsara.odps.lot.SortByProtos.SortBy;
import apsara.odps.lot.StreamingTransformProtos.StreamingTransform;
import apsara.odps.lot.TableScanProtos;
import apsara.odps.lot.TableSinkProtos.TableSink;
import apsara.odps.lot.TransformProtos.Transform;
import apsara.odps.lot.UnionAllProtos.UnionAll;
import apsara.odps.lot.VolumeProtos.Volume;
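/**
 * Translates a bridged MapReduce job (a {@link JobConf}, optionally with a {@link Pipeline})
 * into a LOT (logical operator tree) task serialized as a protobuf {@link LotTask}.
 * The generator wires together data sources, map/reduce transforms, shuffle and sort
 * operators, and table/fake sinks, handling multi-insert, inner output, streaming,
 * and pipeline-mode jobs.
 */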
public class LOTGenerator {
private static final Log LOG = LogFactory.getLog(LOTGenerator.class);
private static final String NO_OUTPUT_DUMMY_COLUMN = "__no_output__";
private static final String MULTI_INSERT_SELECTOR = "__multiins_selector__";
private static final String PARTITION_ID = "__partition_id__";
private static final String MAP_OUT_KEY_PREFIX = "k_";
private static final String MAP_OUT_VAL_PREFIX = "v_";
private static final String INNER_OUTPUT_SELECTOR = "__inner_output_selector__";
private final String project;
private final BridgeJobConf job;
private Pipeline pipeline;
private boolean pipeMode; // use pipeline mapreduce
private List<ResourceItem> resourceItems;
private int opId = 0;
private boolean isStreamingMap;
private boolean isStreamingReduce;
private boolean hasReducer;
private boolean hasPartitioner;
private boolean isMultiInsert;
private boolean isNoOutput;
private boolean isInnerOutput;
private boolean isTableOverwrite;
private TableInfo[] inputTableInfos;
private VolumeInfo[] inputVolumeInfos;
private TableInfo[] outputTableInfos;
private VolumeInfo[] outputVolumeInfos;
private List<Column> outputColumns = new ArrayList<Column>();
private Map<String, Integer> outputIndexes = new HashMap<String, Integer>();
public LOTGenerator(String project, JobConf job, Pipeline pipeline) {
this.project = project;
this.job = new BridgeJobConf(job);
this.pipeline = pipeline;
if (this.pipeline != null) {
this.pipeMode = true;
}
hasReducer = this.pipeMode ? pipeline.getNodeNum() > 1 : job.getNumReduceTasks() > 0;
hasPartitioner = this.pipeMode ? pipeline.getFirstNode().getPartitionerClass() != null :
job.getPartitionerClass() != null;
isStreamingMap = job.get("stream.map.streamprocessor", null) != null;
isStreamingReduce = job.get("stream.reduce.streamprocessor", null) != null;
isInnerOutput = job.getInnerOutputEnable();
isTableOverwrite = !isInnerOutput && job.getOutputOverwrite();
}
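/**
 * Builds the {@link LotTask} wrapping the generated logical operator tree and
 * returns its serialized protobuf bytes.
 */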
public byte[] generate() {
LotTask.Builder builder = LotTask.newBuilder();
builder.setLot(genTree());
LotTask lotTask = builder.build();
if (LOG.isDebugEnabled()) {
LOG.debug(lotTask.toString());
}
return lotTask.toByteArray();
}
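/**
 * Builds the whole logical operator tree: resolves input tables/volumes and the
 * output schema (including multi-insert column concatenation and the dummy column
 * for no-output jobs), derives the intermediate key/value columns, then emits the
 * map block and, if reducers exist, the reduce block.
 */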
LogicalOperatorTree genTree() {
resourceItems = buildResourceList();
LogicalOperatorTree.Builder builder = LogicalOperatorTree.newBuilder();
// prepare input
inputTableInfos = InputUtils.getTables(job);
inputVolumeInfos = InputUtils.getVolumes(job);
// FIXME multi-mapper
Map<TableInfoKey, List<LinkedHashMap<String, String>>> inputTables =
mergeInputTableInfos(inputTableInfos);
// prepare output
outputTableInfos = OutputUtils.getTables(job);
outputVolumeInfos = OutputUtils.getVolumes(job);
// FIXME multi-insert from m-r's mapper
isNoOutput = outputTableInfos == null;
isMultiInsert = !isNoOutput && outputTableInfos.length > 1;
// streaming jobs produce string-typed output columns
boolean isStreamingOutput = job.getNumReduceTasks() > 0 ?
isStreamingReduce :
isStreamingMap;
List<OdpsType> outputColumnTypes = new ArrayList<OdpsType>();
if (isMultiInsert) {
// concat output columns for multi-insert
for (TableInfo ti : outputTableInfos) {
List<OdpsType> tbColumnTypes = new ArrayList<OdpsType>();
for (Column col : job.getOutputSchema(ti.getLabel())) {
tbColumnTypes.add(col.getType());
}
// check whether the same sequence of column types already exists
int idx = Collections.indexOfSubList(outputColumnTypes, tbColumnTypes);
if (idx >= 0) {
// merge columns for tableinfos with the same schema
outputIndexes.put(ti.getLabel(), idx);
continue;
}
idx = outputColumns.size();
outputIndexes.put(ti.getLabel(), idx);
for (Column col : job.getOutputSchema(ti.getLabel())) {
String colName = "multiins" + idx + "_" + col.getName();
if (isStreamingOutput) {
outputColumns.add(new Column(colName, OdpsType.STRING));
outputColumnTypes.add(OdpsType.STRING);
} else {
outputColumns.add(TypeUtils.createColumnWithNewName(colName, col));
outputColumnTypes.add(col.getType());
}
}
}
outputColumns.add(new Column(MULTI_INSERT_SELECTOR, OdpsType.STRING));
} else if (isNoOutput) {
// FIXME currently the UDTF needs an output column
outputColumns.add(new Column(NO_OUTPUT_DUMMY_COLUMN, OdpsType.STRING));
} else {
for (Column col : job.getOutputSchema(outputTableInfos[0].getLabel())) {
if (isStreamingOutput) {
outputColumns.add(new Column(col.getName(), OdpsType.STRING));
} else {
outputColumns.add(TypeUtils.cloneColumn(col));
}
}
}
// prepare intermediate key/value
// FIXME types/signature
// FIXME use col name or not?
List<Column> mapOutColumns;
List<Column> firstReduceInColumns = null;
int innerOutputIndex = 0;
if (hasReducer) {
mapOutColumns = new ArrayList<Column>();
firstReduceInColumns = new ArrayList<Column>();
if (hasPartitioner) {
mapOutColumns.add(new Column(PARTITION_ID, OdpsType.BIGINT));
}
Column[] keys = this.pipeMode ?
pipeline.getFirstNode().getOutputKeySchema() :
job.getMapOutputKeySchema();
for (Column col : keys) {
Column keyCol = TypeUtils.createColumnWithNewName(MAP_OUT_KEY_PREFIX + col.getName(), col);
mapOutColumns.add(keyCol);
firstReduceInColumns.add(keyCol);
}
Column[] values = this.pipeMode ?
pipeline.getFirstNode().getOutputValueSchema() :
job.getMapOutputValueSchema();
for (Column col : values) {
Column valCol = TypeUtils.createColumnWithNewName(MAP_OUT_VAL_PREFIX + col.getName(), col);
mapOutColumns.add(valCol);
firstReduceInColumns.add(valCol);
}
} else {
mapOutColumns = outputColumns;
}
// XXX: LOT does not support multiple inputs combined with inner output
String mapperId = genMapBlock(builder, inputTables, mapOutColumns, innerOutputIndex,
hasReducer && isInnerOutput && (inputTables.size() <= 1));
if (hasReducer) {
genReduceBlock(builder, firstReduceInColumns, mapperId);
} else {
// map-only output
handleOutput(builder, false, outputColumns, mapperId, isTableOverwrite, innerOutputIndex);
}
return builder.build();
}
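/**
 * Appends renamed copies of the final output columns (plus the inner-output
 * selector column) to the given intermediate schema, and returns the index at
 * which the appended columns start.
 */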
private int appendInnerOutputColumns(List<Column> mapOutColumns) {
int innerOutputIndex;
innerOutputIndex = mapOutColumns.size();
for (Column col : outputColumns) {
if (col.getName().equals(MULTI_INSERT_SELECTOR)) {
// do not rename the multi-insert selector column
mapOutColumns.add(col);
continue;
}
Column valCol =
TypeUtils.createColumnWithNewName(
"inneroutputs" + innerOutputIndex + "_" + col.getName(), col);
mapOutColumns.add(valCol);
}
mapOutColumns.add(new Column(INNER_OUTPUT_SELECTOR, OdpsType.STRING));
return innerOutputIndex;
}
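/**
 * Emits the map-side operators: one data source (table scan, Java source, or
 * empty streaming source) plus a mapper transform per input table, an optional
 * partition filter, and a UnionAll when there are multiple mappers. Returns the
 * id of the operator producing the map output.
 */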
private String genMapBlock(LogicalOperatorTree.Builder builder,
Map<TableInfoKey, List<LinkedHashMap<String, String>>> inputTables,
List<Column> mapOutColumns, int innerOutputIndex, boolean innerOutput) {
if (innerOutput) {
innerOutputIndex = appendInnerOutputColumns(mapOutColumns);
}
// XXX one mapper only processes one table
List<String> mappers = new ArrayList<String>();
List<Column> mapCols = mapOutColumns;
if (inputTables.isEmpty()) {
int numMapTasks = job.getNumMapTasks();
if (isStreamingMap) {
DataSource emptySource = genEmptyStreamingSource(builder, numMapTasks);
String mapperId =
genMapper(builder, emptySource.getId(), new Column[0], mapOutColumns,
emptySource.getId());
mappers.add(mapperId);
} else {
// no input, implement mapper as Java DataSource
DataSource mapper = genJavaSource(builder, numMapTasks, mapOutColumns);
String mapperId = mapper.getId();
if (hasReducer && innerOutput) {
mapCols = mapOutColumns.subList(0, innerOutputIndex);
mapperId =
this.genInnerOutputBlock(builder, mapOutColumns, innerOutputIndex, mapperId, mapperId);
}
mappers.add(mapperId);
}
} else {
for (Map.Entry<TableInfoKey, List<LinkedHashMap<String, String>>> e : inputTables.entrySet()) {
TableInfo inputTable = e.getKey().getTableInfo();
List<LinkedHashMap<String, String>> partList = e.getValue();
DataSource tableSource = genTableSource(builder, inputTable);
String mapParentId = tableSource.getId();
if (!partList.isEmpty()) {
Filter partFilter =
genPartitionFilter(builder, tableSource.getId(), partList, tableSource.getId());
mapParentId = partFilter.getId();
}
String mapperId =
genMapper(builder, tableSource.getId(), job.getInputSchema(inputTable), mapOutColumns,
mapParentId);
if (hasReducer && innerOutput) {
mapCols = mapOutColumns.subList(0, innerOutputIndex);
mapperId =
this.genInnerOutputBlock(builder, mapOutColumns, innerOutputIndex, mapperId, mapperId);
}
mappers.add(mapperId);
}
}
String mapperId;
if (mappers.size() > 1) {
UnionAll.Builder unionAllBuilder = UnionAll.newBuilder();
for (String mid : mappers) {
Select mapOutSelect = genSelect(builder, mid, mapCols, mid);
unionAllBuilder.addParents(mapOutSelect.getId());
}
unionAllBuilder.setId("UNION_" + opId++);
UnionAll mapper = unionAllBuilder.build();
LogicalOperator.Builder b = LogicalOperator.newBuilder();
b.setUnionAll(mapper);
builder.addOperators(b.build());
mapperId = mapper.getId();
} else {
mapperId = mappers.get(0);
}
return mapperId;
}
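/**
 * Emits one shuffle/sort/reduce stage per reduce node (a single stage for plain
 * MapReduce, one stage per pipeline reducer in pipe mode), wiring each stage's
 * output into the next and attaching the final or inner output operators.
 */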
private String genReduceBlock(LogicalOperatorTree.Builder tree,
List<Column> firstReduceInColumns, String finalId) {
List<List<Column>> reduceInColumnsList = new ArrayList<List<Column>>();
List<List<Column>> reduceOutColumnsList = new ArrayList<List<Column>>();
reduceInColumnsList.add(firstReduceInColumns);
int innerOutputIndex = 0;
if (this.pipeMode) {
for (int i = 1; i < pipeline.getNodeNum() - 1; i++) {
List<Column> reduceOut = new ArrayList<Column>();
List<Column> reduceIn = new ArrayList<Column>();
if (pipeline.getNode(i).getPartitionerClass() != null) {
reduceOut.add(new Column(PARTITION_ID, OdpsType.BIGINT));
}
for (Column col : pipeline.getNode(i).getOutputKeySchema()) {
Column keyCol =
TypeUtils.createColumnWithNewName(MAP_OUT_KEY_PREFIX + col.getName(), col);
reduceOut.add(keyCol);
reduceIn.add(keyCol);
}
for (Column col : pipeline.getNode(i).getOutputValueSchema()) {
Column valCol =
TypeUtils.createColumnWithNewName(MAP_OUT_VAL_PREFIX + col.getName(), col);
reduceOut.add(valCol);
reduceIn.add(valCol);
}
if (isInnerOutput) {
innerOutputIndex = appendInnerOutputColumns(reduceOut);
}
reduceOutColumnsList.add(reduceOut);
reduceInColumnsList.add(reduceIn);
}
}
reduceOutColumnsList.add(outputColumns);
// TODO combiner (currently runs in the mapper)
TransformNode node = null;
for (int i = 0; i < reduceInColumnsList.size(); i++) {
List<Column> reduceInCols = reduceInColumnsList.get(i);
List<Column> reduceOutCols = reduceOutColumnsList.get(i);
String sourceId = finalId;
String parentId = finalId;
if (this.pipeMode) {
// reduce block index runs from 0 to max; in the node list the reducers sit at
// indexes 1 to max [Mapper, Reducer, Reducer, ...]. A reduce block's shuffle
// settings are set by the preceding map/reduce node, hence node (i + 1) - 1 = i
node = pipeline.getNode(i);
}
// recomputed per stage, since each pipeline node may have its own partitioner;
// named stageHasPartitioner to avoid shadowing the hasPartitioner field
boolean stageHasPartitioner =
this.pipeMode ? node.getPartitionerClass() != null : job.getPartitionerClass() != null;
// partitioner
String[] partitionColumns =
stageHasPartitioner ? new String[] {PARTITION_ID} : transformKeyColumnNames(pipeMode ? node
.getPartitionColumns() : job.getPartitionColumns());
DistributeBy shuffle = genShuffle(tree, sourceId, Arrays.asList(partitionColumns), parentId);
String[] sortColumns =
transformKeyColumnNames(pipeMode ? node.getOutputKeySortColumns() : job
.getOutputKeySortColumns());
JobConf.SortOrder[] order =
pipeMode ? node.getOutputKeySortOrder() : job.getOutputKeySortOrder();
SortBy sort = genSort(tree, sourceId, sortColumns, order, shuffle.getId());
// XXX the group comparator is only used inside the reduce UDTF, so it has no corresponding LOT operator
parentId = sort.getId();
Transform reducer = genReducer(tree, sourceId, reduceInCols, reduceOutCols, parentId);
sourceId = parentId = reducer.getId();
// output
if (i == reduceInColumnsList.size() - 1) {
handleOutput(tree, false, outputColumns, sourceId, isTableOverwrite, 0);
} else if (isInnerOutput) {
finalId = genInnerOutputBlock(tree, reduceOutCols, innerOutputIndex, sourceId, parentId);
} else {
finalId = reducer.getId();
}
}
return finalId;
}
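/**
 * Emits the operators for an intermediate (inner) output: a sink for the
 * inner-output columns, a filter selecting the rows destined for the next
 * stage, and a select projecting away the inner-output columns. Returns the
 * id of that select.
 */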
private String genInnerOutputBlock(LogicalOperatorTree.Builder builder,
List<Column> reduceOutCols, int innerOutputIndex, String sourceId, String parentId) {
String finalId;
handleOutput(builder, true, reduceOutCols, sourceId, false, innerOutputIndex);
Filter innerOutputSelector =
genInnerOutputSelector(builder, sourceId, parentId, TableInfo.DEFAULT_LABEL);
parentId = innerOutputSelector.getId();
Select reduceOutSelect =
genSelect(builder, sourceId, reduceOutCols.subList(0, innerOutputIndex), parentId);
finalId = reduceOutSelect.getId();
return finalId;
}
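/**
 * Emits the sink operators for the job output: per-label filter/select/sink
 * chains for multi-insert, a fake sink when the job has no output table, or a
 * single select/sink otherwise. Returns the id of the last select emitted, or
 * null when only a fake sink was generated.
 */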
private String handleOutput(LogicalOperatorTree.Builder tree, boolean innerOutput,
List<Column> outputColumns, String finalTaskId,
boolean overwrite, int innerOutputIndex) {
String parentId = null;
if (isMultiInsert) {
for (TableInfo ti : outputTableInfos) {
// multi-insert filter
Filter multiInsertSelector =
genMultiInsertSelector(tree, finalTaskId, ti.getLabel(), finalTaskId);
parentId = multiInsertSelector.getId();
if (innerOutput) {
Filter innerOutputSelector =
genInnerOutputSelector(tree, finalTaskId, parentId, TableInfo.INNER_OUTPUT_LABEL);
parentId = innerOutputSelector.getId();
}
int idx = innerOutputIndex + outputIndexes.get(ti.getLabel());
List<Column> columns =
outputColumns.subList(idx, idx + job.getOutputSchema(ti.getLabel()).length);
Select outputSelect = genSelect(tree, finalTaskId, columns, parentId);
parentId = outputSelect.getId();
genTableSink(tree, ti, parentId, overwrite);
}
} else if (isNoOutput) {
genFakeSink(tree, finalTaskId);
} else {
List<Column> columns = outputColumns;
parentId = finalTaskId;
if (innerOutput) {
Filter innerOutputSelector =
genInnerOutputSelector(tree, finalTaskId, finalTaskId, TableInfo.INNER_OUTPUT_LABEL);
parentId = innerOutputSelector.getId();
columns = outputColumns.subList(innerOutputIndex, outputColumns.size() - 1);
}
Select outputSelect = genSelect(tree, finalTaskId, columns, parentId);
parentId = outputSelect.getId();
genTableSink(tree, outputTableInfos[0], parentId, overwrite);
}
return parentId;
}
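/** Emits a TableScan data source for the given input table. */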
private DataSource genTableSource(LogicalOperatorTree.Builder tree, TableInfo tableInfo) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
DataSource.Builder db = DataSource.newBuilder();
TableScanProtos.TableScan.Builder tb = TableScanProtos.TableScan.newBuilder();
tb.setTable(tableInfo.getTableName());
tb.setProject(tableInfo.getProjectName() == null ? project : tableInfo.getProjectName());
db.setTableScan(tb.build());
db.setId("DataSource_" + opId++);
DataSource dataSource = db.build();
builder.setDataSource(dataSource);
tree.addOperators(builder.build());
return dataSource;
}
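/**
 * Emits a filter restricting a table scan to the requested partitions. Each
 * partition spec becomes a chain of EQ/AND expressions; the per-partition
 * predicates are then OR-ed together as a balanced binary tree to keep the
 * expression nesting depth logarithmic in the number of partitions.
 */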
private Filter genPartitionFilter(LogicalOperatorTree.Builder tree, String sourceId,
List<LinkedHashMap<String, String>> partList, String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
Filter.Builder fb = Filter.newBuilder();
List<ScalarExpression> parts = new ArrayList<ScalarExpression>();
for (LinkedHashMap<String, String> partSpec : partList) {
ScalarExpression lastCol = null;
for (Map.Entry<String, String> e : partSpec.entrySet()) {
ScalarExpression.Builder colBuilder = ScalarExpression.newBuilder();
ScalarFunction.Builder eqBuilder = ScalarFunction.newBuilder();
eqBuilder.setProject(project);
eqBuilder.setName("EQ");
ScalarExpression.Builder keyBuilder = ScalarExpression.newBuilder();
Reference.Builder keyReference = Reference.newBuilder();
keyReference.setName(e.getKey());
keyReference.setFrom(sourceId);
keyBuilder.setReference(keyReference.build());
eqBuilder.addParameters(keyBuilder.build());
ScalarExpression.Builder valueBuilder = ScalarExpression.newBuilder();
Constant.Builder valueConstant = Constant.newBuilder();
valueConstant.setString(e.getValue());
valueBuilder.setConstant(valueConstant.build());
eqBuilder.addParameters(valueBuilder.build());
colBuilder.setExpression(eqBuilder.build());
if (lastCol == null) {
lastCol = colBuilder.build();
} else {
ScalarExpression.Builder newColBuilder = ScalarExpression.newBuilder();
ScalarFunction.Builder andBuilder = ScalarFunction.newBuilder();
andBuilder.setProject(project);
andBuilder.setName("AND");
andBuilder.addParameters(lastCol);
andBuilder.addParameters(colBuilder.build());
newColBuilder.setExpression(andBuilder.build());
lastCol = newColBuilder.build();
}
}
parts.add(lastCol);
}
// generate condition expression as binary tree, to limit nesting depth
List<ScalarExpression> children = parts;
while (children.size() > 1) {
List<ScalarExpression> parents = new ArrayList<ScalarExpression>(children.size() / 2 + 1);
for (int i = 0; i < children.size(); i += 2) {
ScalarExpression eLeft = children.get(i);
if (i + 1 >= children.size()) {
parents.add(eLeft);
} else {
ScalarExpression eRight = children.get(i + 1);
ScalarExpression.Builder parentBuilder = ScalarExpression.newBuilder();
ScalarFunction.Builder orBuilder = ScalarFunction.newBuilder();
orBuilder.setProject(project);
orBuilder.setName("OR");
orBuilder.addParameters(eLeft);
orBuilder.addParameters(eRight);
parentBuilder.setExpression(orBuilder.build());
parents.add(parentBuilder.build());
}
}
children = parents;
}
ScalarExpression partCond = children.get(0);
fb.setCondition(partCond);
fb.setParentId(parentId);
fb.setId("FIL_" + opId++);
Filter filter = fb.build();
builder.setFilter(filter);
tree.addOperators(builder.build());
return filter;
}
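/**
 * Emits the mapper transform (a Java UDTF or a streaming command), wiring in
 * resources, input column references (cast to string for streaming jobs), the
 * output schema, and any input/output volumes. For streaming jobs with
 * reducers, a trailing select casts non-string columns back from string to
 * their declared types. Returns the id of the operator producing the mapper
 * output.
 */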
private String genMapper(LogicalOperatorTree.Builder tree, String sourceId, Column[] inColumns,
List<Column> outColumns, String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
boolean isStreaming = isStreamingMap;
Transform.Builder ab = Transform.newBuilder();
for (ResourceItem item : resourceItems) {
Transform.Resources.Builder rb = Transform.Resources.newBuilder();
rb.setProject(item.projectName);
rb.setResourceName(item.resourceName);
ab.addResources(rb.build());
}
if (!isStreaming) {
LanguageTransform.Builder tb = LanguageTransform.newBuilder();
tb.setClassName(LotMapperUDTF.class.getName());
tb.setLanguage(Language.Java);
ab.setLanguageTransform(tb.build());
} else {
StreamingTransform.Builder sb = StreamingTransform.newBuilder();
sb.setCmd(job.get("stream.map.streamprocessor", null));
// TODO properties to pb fields
fillStreamingMapProperties(sb);
ab.setStreamingTransform(sb.build());
}
for (Column col : inColumns) {
ScalarExpression.Builder exprBuilder = ScalarExpression.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(col.getName());
refBuilder.setFrom(sourceId);
if (isStreaming) {
if (col.getType().equals(OdpsType.BOOLEAN)) {
exprBuilder.setExpression(castBooleanAsStreamingString(refBuilder.build()));
} else if (!col.getType().equals(OdpsType.STRING)) {
// cast as string
ScalarFunction.Builder castBuilder = ScalarFunction.newBuilder();
castBuilder.setProject(project);
castBuilder.setName("TOSTRING");
castBuilder.addParameters(ScalarExpression.newBuilder().setReference(refBuilder.build()).build());
exprBuilder.setExpression(castBuilder.build());
} else {
exprBuilder.setReference(refBuilder.build());
}
} else {
exprBuilder.setReference(refBuilder.build());
}
ab.addParameters(exprBuilder.build());
}
Schema.Builder schemaBuilder = Schema.newBuilder();
for (Column col : outColumns) {
Schema.Columns.Builder scb = Schema.Columns.newBuilder();
scb.setName(col.getName());
scb.setType((isStreaming && !col.getName().equals(PARTITION_ID)) ? TypesProtos.Type.String
: TypeUtils.getLotTypeFromColumn(col));
schemaBuilder.addColumns(scb.build());
}
ab.setSchema(schemaBuilder.build());
ab.setParentId(parentId);
ab.setId("MapTransform_" + opId++);
//volume related
if (inputVolumeInfos != null && inputVolumeInfos.length > 0) {
for (VolumeInfo vol : inputVolumeInfos) {
Volume.Builder volumeBuilder = Volume.newBuilder();
volumeBuilder.setProject(vol.getProjectName());
volumeBuilder.setVolumeName(vol.getVolumeName());
volumeBuilder.setPartition(vol.getPartSpec());
volumeBuilder.setLabel(vol.getLabel());
volumeBuilder.setIsInput(true);
ab.addVolumes(volumeBuilder.build());
}
}
if (outputVolumeInfos != null && outputVolumeInfos.length > 0) {
for (VolumeInfo vol : outputVolumeInfos) {
Volume.Builder volumeBuilder = Volume.newBuilder();
volumeBuilder.setProject(vol.getProjectName());
volumeBuilder.setVolumeName(vol.getVolumeName());
volumeBuilder.setPartition(vol.getPartSpec());
volumeBuilder.setLabel(vol.getLabel());
volumeBuilder.setIsInput(false);
ab.addVolumes(volumeBuilder.build());
}
}
Transform mapper = ab.build();
builder.setTransform(mapper);
tree.addOperators(builder.build());
String mapperId = mapper.getId();
if (job.getNumReduceTasks() > 0 && isStreaming) {
// convert key type for shuffle keys
boolean hasNonStringOutput = false;
for (Column col : outColumns) {
if (!col.getName().equals(PARTITION_ID) && !col.getType().equals(OdpsType.STRING)) {
hasNonStringOutput = true;
}
}
if (hasNonStringOutput) {
// add select for converting from string
Select.Builder sb = Select.newBuilder();
for (Column col : outColumns) {
Select.Expressions.Builder seb = Select.Expressions.newBuilder();
ScalarExpression.Builder eb = ScalarExpression.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(col.getName());
refBuilder.setFrom(mapper.getId());
if (col.getName().equals(PARTITION_ID) || col.getType().equals(OdpsType.STRING)) {
eb.setReference(refBuilder.build());
} else {
// cast from string
ScalarFunction.Builder castBuilder = ScalarFunction.newBuilder();
castBuilder.setProject(project);
castBuilder.setName("TO" + col.getType().toString());
castBuilder.addParameters(
ScalarExpression.newBuilder().setReference(refBuilder.build()).build());
eb.setExpression(castBuilder.build());
}
seb.setExpression(eb.build());
seb.setAlias(col.getName());
sb.addExpressions(seb.build());
}
sb.setParentId(mapper.getId());
sb.setId("SEL_" + opId++);
Select select = sb.build();
tree.addOperators(LogicalOperator.newBuilder().setSelect(select).build());
mapperId = select.getId();
}
}
return mapperId;
}
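/**
 * Emits a Java language source that runs the mapper UDTF directly, used when
 * the job has no input tables.
 */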
private DataSource genJavaSource(LogicalOperatorTree.Builder tree, int instanceCount,
List<Column> outColumns) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
DataSource.Builder db = DataSource.newBuilder();
LanguageSource.Builder jb = LanguageSource.newBuilder();
jb.setClassName(LotMapperUDTF.class.getName());
jb.setLanguage(Language.Java);
for (ResourceItem item : resourceItems) {
LanguageSource.Resources.Builder rb = LanguageSource.Resources.newBuilder();
rb.setProject(item.projectName);
rb.setResourceName(item.resourceName);
jb.addResources(rb.build());
}
jb.setInstanceCount(instanceCount);
Schema.Builder schemaBuilder = Schema.newBuilder();
for (Column col : outColumns) {
Schema.Columns.Builder scb = Schema.Columns.newBuilder();
scb.setName(col.getName());
scb.setType(TypeUtils.getLotTypeFromColumn(col));
schemaBuilder.addColumns(scb.build());
}
jb.setSchema(schemaBuilder.build());
//volume related
if (inputVolumeInfos != null && inputVolumeInfos.length > 0) {
for (VolumeInfo vol : inputVolumeInfos) {
Volume.Builder volumeBuilder = Volume.newBuilder();
volumeBuilder.setProject(vol.getProjectName());
volumeBuilder.setVolumeName(vol.getVolumeName());
volumeBuilder.setPartition(vol.getPartSpec());
volumeBuilder.setLabel(vol.getLabel());
volumeBuilder.setIsInput(true);
jb.addVolumes(volumeBuilder.build());
}
}
if (outputVolumeInfos != null && outputVolumeInfos.length > 0) {
for (VolumeInfo vol : outputVolumeInfos) {
Volume.Builder volumeBuilder = Volume.newBuilder();
volumeBuilder.setProject(vol.getProjectName());
volumeBuilder.setVolumeName(vol.getVolumeName());
volumeBuilder.setPartition(vol.getPartSpec());
volumeBuilder.setLabel(vol.getLabel());
volumeBuilder.setIsInput(false);
jb.addVolumes(volumeBuilder.build());
}
}
db.setLanguageSource(jb.build());
db.setId("MapJavaSource_" + opId++);
DataSource mapper = db.build();
builder.setDataSource(mapper);
tree.addOperators(builder.build());
return mapper;
}
// HACK for no-input streaming
private DataSource genEmptyStreamingSource(LogicalOperatorTree.Builder tree, int instanceCount) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
DataSource.Builder db = DataSource.newBuilder();
LanguageSource.Builder jb = LanguageSource.newBuilder();
jb.setClassName(EmptyDataSource.class.getName());
jb.setLanguage(Language.Java);
jb.setInstanceCount(instanceCount);
jb.setSchema(Schema.newBuilder().build());
db.setLanguageSource(jb.build());
db.setId("EmptySource_" + opId++);
DataSource emptySource = db.build();
builder.setDataSource(emptySource);
tree.addOperators(builder.build());
return emptySource;
}
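/**
 * Emits the reducer transform (a Java UDTF or a streaming command) with its
 * resources, input column references (cast to string for streaming jobs), and
 * output schema.
 */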
private Transform genReducer(LogicalOperatorTree.Builder tree, String sourceId,
List<Column> mapOutColumns, List<Column> outputColumns,
String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
Transform.Builder ab = Transform.newBuilder();
boolean isStreaming = isStreamingReduce;
for (ResourceItem item : resourceItems) {
Transform.Resources.Builder rb = Transform.Resources.newBuilder();
rb.setProject(item.projectName);
rb.setResourceName(item.resourceName);
ab.addResources(rb.build());
}
if (!isStreaming) {
LanguageTransform.Builder tb = LanguageTransform.newBuilder();
tb.setClassName(LotReducerUDTF.class.getName());
tb.setLanguage(Language.Java);
ab.setLanguageTransform(tb.build());
} else {
StreamingTransform.Builder sb = StreamingTransform.newBuilder();
sb.setCmd(job.get("stream.reduce.streamprocessor", null));
fillStreamingReduceProperties(sb);
ab.setStreamingTransform(sb.build());
}
for (Column col : mapOutColumns) {
ScalarExpression.Builder exprBuilder = ScalarExpression.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(col.getName());
refBuilder.setFrom(sourceId);
if (isStreaming && !col.getType().equals(OdpsType.STRING)) {
// cast as string
ScalarFunction.Builder castBuilder = ScalarFunction.newBuilder();
castBuilder.setProject(project);
castBuilder.setName("TOSTRING");
castBuilder
.addParameters(ScalarExpression.newBuilder().setReference(refBuilder.build()).build());
exprBuilder.setExpression(castBuilder.build());
} else {
exprBuilder.setReference(refBuilder.build());
}
ab.addParameters(exprBuilder.build());
}
Schema.Builder schemaBuilder = Schema.newBuilder();
for (Column col : outputColumns) {
Schema.Columns.Builder scb = Schema.Columns.newBuilder();
scb.setName(col.getName());
scb.setType(TypeUtils.getLotTypeFromColumn(col));
schemaBuilder.addColumns(scb.build());
}
ab.setSchema(schemaBuilder.build());
ab.setParentId(parentId);
ab.setId("ReduceTransform_" + opId++);
Transform reducer = ab.build();
builder.setTransform(reducer);
tree.addOperators(builder.build());
return reducer;
}
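/** Emits a DistributeBy operator partitioning rows by the given columns. */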
private DistributeBy genShuffle(LogicalOperatorTree.Builder tree, String sourceId,
List<String> columns, String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
DistributeBy.Builder db = DistributeBy.newBuilder();
for (String col : columns) {
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(col);
refBuilder.setFrom(sourceId);
db.addColumns(refBuilder.build());
}
db.setParentId(parentId);
db.setId("DIS_" + opId++);
DistributeBy shuffle = db.build();
builder.setDistributeBy(shuffle);
tree.addOperators(builder.build());
return shuffle;
}
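/** Emits a SortBy operator ordering rows by the given columns and sort orders. */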
private SortBy genSort(LogicalOperatorTree.Builder tree, String sourceId, String[] sortColumns,
JobConf.SortOrder[] order, String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
SortBy.Builder sb = SortBy.newBuilder();
sb.setIsPartial(false);
assert sortColumns.length == order.length;
for (int i = 0; i < sortColumns.length; i++) {
Order.Builder o = Order.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(sortColumns[i]);
refBuilder.setFrom(sourceId);
o.setColumn(refBuilder.build());
o.setAsc(order[i] == JobConf.SortOrder.ASC);
sb.addOrders(o.build());
}
sb.setParentId(parentId);
sb.setId("SORT_" + opId++);
SortBy sort = sb.build();
builder.setSortBy(sort);
tree.addOperators(builder.build());
return sort;
}
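/** Emits a Select operator projecting the given columns, each aliased by its own name. */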
private Select genSelect(LogicalOperatorTree.Builder tree, String sourceId, List<Column> columns,
String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
Select.Builder sb = Select.newBuilder();
for (Column col : columns) {
Select.Expressions.Builder seb = Select.Expressions.newBuilder();
ScalarExpression.Builder eb = ScalarExpression.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(col.getName());
refBuilder.setFrom(sourceId);
eb.setReference(refBuilder.build());
seb.setExpression(eb.build());
seb.setAlias(col.getName());
sb.addExpressions(seb.build());
}
sb.setParentId(parentId);
sb.setId("SEL_" + opId++);
Select select = sb.build();
builder.setSelect(select);
tree.addOperators(builder.build());
return select;
}
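/** Emits a TableSink writing to the given table (and static partition, if any). */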
private DataSink genTableSink(LogicalOperatorTree.Builder tree, TableInfo tableInfo, String parentId, boolean overwrite) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
DataSink.Builder db = DataSink.newBuilder();
TableSink.Builder tw = TableSink.newBuilder();
tw.setProject(tableInfo.getProjectName() == null ? project : tableInfo.getProjectName());
tw.setTable(tableInfo.getTableName());
tw.setIsOverwrite(overwrite);
LinkedHashMap<String, String> partSpec = tableInfo.getPartSpec();
if (!partSpec.isEmpty()) {
PartitionSpec.Builder pb = PartitionSpec.newBuilder();
for (Map.Entry<String, String> e : partSpec.entrySet()) {
PartitionSpec.Items.Builder ib = PartitionSpec.Items.newBuilder();
ib.setKey(e.getKey());
Constant.Builder cb = Constant.newBuilder();
cb.setString(e.getValue());
ib.setValue(cb.build());
pb.addItems(ib.build());
}
tw.setPartition(pb.build());
}
db.setTableSink(tw.build());
db.setParentId(parentId);
db.setId("DataSink_" + opId++);
DataSink dataSink = db.build();
builder.setDataSink(dataSink);
tree.addOperators(builder.build());
return dataSink;
}
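/**
 * Emits a filter keeping only the rows whose multi-insert selector column
 * equals the given output label.
 */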
private Filter genMultiInsertSelector(LogicalOperatorTree.Builder tree, String sourceId,
String label, String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
Filter.Builder fb = Filter.newBuilder();
ScalarExpression.Builder condBuilder = ScalarExpression.newBuilder();
ScalarFunction.Builder opBuilder = ScalarFunction.newBuilder();
opBuilder.setProject(project);
opBuilder.setName("EQ");
{
ScalarExpression.Builder eb = ScalarExpression.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(MULTI_INSERT_SELECTOR);
refBuilder.setFrom(sourceId);
eb.setReference(refBuilder.build());
opBuilder.addParameters(eb.build());
}
{
ScalarExpression.Builder eb = ScalarExpression.newBuilder();
Constant.Builder cBuilder = Constant.newBuilder();
cBuilder.setString(label);
eb.setConstant(cBuilder.build());
opBuilder.addParameters(eb.build());
}
condBuilder.setExpression(opBuilder.build());
fb.setCondition(condBuilder.build());
fb.setParentId(parentId);
fb.setId("FIL_" + opId++);
Filter selector = fb.build();
builder.setFilter(selector);
tree.addOperators(builder.build());
return selector;
}
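/**
 * Emits a filter keeping only the rows whose inner-output selector column
 * equals the given label.
 */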
private Filter genInnerOutputSelector(LogicalOperatorTree.Builder tree, String sourceId, String parentId, String label) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
Filter.Builder fb = Filter.newBuilder();
ScalarExpression.Builder condBuilder = ScalarExpression.newBuilder();
ScalarFunction.Builder opBuilder = ScalarFunction.newBuilder();
opBuilder.setProject(project);
opBuilder.setName("EQ");
{
ScalarExpression.Builder eb = ScalarExpression.newBuilder();
Reference.Builder refBuilder = Reference.newBuilder();
refBuilder.setName(INNER_OUTPUT_SELECTOR);
refBuilder.setFrom(sourceId);
eb.setReference(refBuilder.build());
opBuilder.addParameters(eb.build());
}
{
ScalarExpression.Builder eb = ScalarExpression.newBuilder();
Constant.Builder cBuilder = Constant.newBuilder();
cBuilder.setString(label);
eb.setConstant(cBuilder.build());
opBuilder.addParameters(eb.build());
}
condBuilder.setExpression(opBuilder.build());
fb.setCondition(condBuilder.build());
fb.setParentId(parentId);
fb.setId("FIL_" + opId++);
Filter selector = fb.build();
builder.setFilter(selector);
tree.addOperators(builder.build());
return selector;
}
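/** Emits a fake sink for jobs that produce no table output. */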
private DataSink genFakeSink(LogicalOperatorTree.Builder tree, String parentId) {
LogicalOperator.Builder builder = LogicalOperator.newBuilder();
DataSink.Builder db = DataSink.newBuilder();
db.setFakeSink(FakeSinkProtos.FakeSink.newBuilder().build());
db.setParentId(parentId);
db.setId("DataSink_" + opId++);
DataSink dataSink = db.build();
builder.setDataSink(dataSink);
tree.addOperators(builder.build());
return dataSink;
}
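/**
 * Hash key identifying an input table by project and table name, ignoring the
 * partition spec, so that partitions of the same table can be grouped together.
 */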
private static class TableInfoKey {
private TableInfo tableInfo;
public TableInfoKey(TableInfo tableInfo) {
this.tableInfo = tableInfo;
}
public TableInfo getTableInfo() {
return tableInfo;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof TableInfoKey)) {
return false;
}
TableInfoKey other = (TableInfoKey) o;
// XXX default project issue
return StringUtils.equals(this.tableInfo.getProjectName(), other.tableInfo.getProjectName())
&& StringUtils.equals(this.tableInfo.getTableName(), other.tableInfo.getTableName());
}
@Override
public int hashCode() {
int code = this.tableInfo.getTableName().hashCode();
String prj = this.tableInfo.getProjectName();
if (prj != null) {
code = code * 71 + prj.hashCode();
}
return code;
}
}
/**
 * Groups table infos by project and table name, gathering their partition specs.
 * XXX a non-partitioned table will have an empty partition list
 * XXX should one mapper only process one table?
 * XXX should one table/partition only be processed by one mapper?
 */
private Map<TableInfoKey, List<LinkedHashMap<String, String>>> mergeInputTableInfos(
TableInfo[] inputTableInfos) {
Map<TableInfoKey, List<LinkedHashMap<String, String>>> inputTables =
new HashMap<TableInfoKey, List<LinkedHashMap<String, String>>>();
if (inputTableInfos == null) {
return inputTables;
}
for (TableInfo ti : inputTableInfos) {
LinkedHashMap<String, String> partSpec = null;
if (ti.getPartSpec() != null && !ti.getPartSpec().isEmpty()) {
partSpec = ti.getPartSpec();
}
TableInfoKey key = new TableInfoKey(ti);
List<LinkedHashMap<String, String>> partList = inputTables.get(key);
if (partList == null) {
// new table
partList = new ArrayList<LinkedHashMap<String, String>>();
if (partSpec != null) {
partList.add(partSpec);
}
inputTables.put(key, partList);
} else {
// detect conflict
if (partList.isEmpty()) {
if (partSpec != null) {
throw new IllegalArgumentException("conflicting input for table: " + ti.getTableName());
} else {
throw new IllegalArgumentException("duplicate input for table: " + ti.getTableName());
}
}
if (partSpec == null) {
throw new IllegalArgumentException("conflicting input for table: " + ti.getTableName());
}
partList.add(partSpec);
}
}
return inputTables;
}
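/** A parsed resource reference: owning project, resource name, and optional link alias. */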
private static class ResourceItem {
public String projectName;
public String resourceName;
public String linkName;
public ResourceItem(String projectName, String resourceName, String linkName) {
this.projectName = projectName;
this.resourceName = resourceName;
this.linkName = linkName;
}
}
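/**
 * Parses the job's resource strings into {@link ResourceItem}s. Accepted forms
 * are "name" and "project/resources/name", each optionally suffixed with "#alias".
 */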
private List<ResourceItem> buildResourceList() {
List<ResourceItem> r = new ArrayList<ResourceItem>();
if (job.getResources() == null) {
return r;
}
for (String res : job.getResources()) {
// FIXME parse resource
String resProject;
String resName;
String linkName = null;
String[] parts = res.split("/");
if (parts.length == 1) {
resProject = this.project;
resName = parts[0];
} else if (parts.length == 3 && parts[1].equals("resources")) {
resProject = parts[0];
resName = parts[2];
} else {
throw new IllegalArgumentException("Invalid resource name: '" + res + "'");
}
String[] nameParts = resName.split("#");
if (nameParts.length == 1) {
// plain resource name, no alias
} else if (nameParts.length == 2) {
// resName#alias
resName = nameParts[0];
linkName = nameParts[1];
} else {
throw new IllegalArgumentException("Invalid resource name: '" + resName + "'");
}
// TODO merge these resource aliases
r.add(new ResourceItem(resProject, resName, linkName));
}
return r;
}
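/** Prefixes key column names with the intermediate-key prefix ("k_"). */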
private String[] transformKeyColumnNames(String[] cols) {
String[] keyCols = new String[cols.length];
for (int i = 0; i < cols.length; i++) {
keyCols[i] = MAP_OUT_KEY_PREFIX + cols[i];
}
return keyCols;
}
private void fillStreamingMapProperties(StreamingTransform.Builder sb) {
// TODO set per-table streaming properties?
for (Map.Entry<String, String> e : job) {
addStreamingProperty(sb, e.getKey(), e.getValue());
}
addStreamingProperty(sb, "stream.stage", "map");
}
private void fillStreamingReduceProperties(StreamingTransform.Builder sb) {
for (Map.Entry<String, String> e : job) {
addStreamingProperty(sb, e.getKey(), e.getValue());
}
addStreamingProperty(sb, "stream.stage", "reduce");
}
private void addStreamingProperty(StreamingTransform.Builder sb, String name, String value) {
sb.addProperties(
StreamingTransform.Properties.newBuilder().setKey(name).setValue(value).build());
}
// renders a boolean column as the strings "true"/"false" (NULL when the value is NULL):
// WHEN(EQ(col, true), "true", WHEN(EQ(col, false), "false", NULL))
private ScalarFunction castBooleanAsStreamingString(Reference colRef) {
ScalarFunction.Builder caseBuilder = ScalarFunction.newBuilder();
caseBuilder.setProject(project);
caseBuilder.setName("WHEN");
ScalarFunction.Builder eqBuilder = ScalarFunction.newBuilder();
eqBuilder.setProject(project);
eqBuilder.setName("EQ");
eqBuilder.addParameters(ScalarExpression.newBuilder().setReference(colRef).build());
eqBuilder.addParameters(ScalarExpression.newBuilder().setConstant(Constant.newBuilder().setBool(true).build()).build());
caseBuilder.addParameters(ScalarExpression.newBuilder().setExpression(eqBuilder.build()).build());
caseBuilder.addParameters(ScalarExpression.newBuilder().setConstant(Constant.newBuilder().setString("true").build()).build());
ScalarFunction.Builder caseBuilder2 = ScalarFunction.newBuilder();
caseBuilder2.setProject(project);
caseBuilder2.setName("WHEN");
ScalarFunction.Builder eqBuilder2 = ScalarFunction.newBuilder();
eqBuilder2.setProject(project);
eqBuilder2.setName("EQ");
eqBuilder2.addParameters(ScalarExpression.newBuilder().setReference(colRef).build());
eqBuilder2.addParameters(ScalarExpression.newBuilder().setConstant(Constant.newBuilder().setBool(false).build()).build());
caseBuilder2.addParameters(ScalarExpression.newBuilder().setExpression(eqBuilder2.build()).build());
caseBuilder2.addParameters(ScalarExpression.newBuilder().setConstant(Constant.newBuilder().setString("false").build()).build());
caseBuilder2.addParameters(ScalarExpression.newBuilder().setNull(Null.newBuilder().build()).build());
caseBuilder.addParameters(ScalarExpression.newBuilder().setExpression(caseBuilder2.build()).build());
return caseBuilder.build();
}
}