/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.aliyun.odps.pipeline;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.aliyun.odps.utils.StringUtils;
import com.aliyun.odps.Column;
import com.aliyun.odps.data.RecordComparator;
import com.aliyun.odps.io.WritableComparator;
import com.aliyun.odps.mapred.Job;
import com.aliyun.odps.mapred.Mapper;
import com.aliyun.odps.mapred.Partitioner;
import com.aliyun.odps.mapred.Reducer;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.conf.JobConf.SortOrder;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * MapReduce Pipeline extension.<br/>
 * <p>
 * Extends the classic MapReduce computation model so that one round of
 * Map/Reduce may be followed by one or more additional Reduce stages, i.e.
 * Map-Reduce-Reduce-Reduce... A pipeline consists of a sequence of nodes,
 * each of which is a Mapper or a Reducer.<br/><br/>
 * As with plain MapReduce, a job is defined and submitted via {@link Job},
 * including the configuration of input and output tables. In addition, a
 * Pipeline object must be defined (via the {@code Pipeline.builder()} method)
 * with one Mapper followed by one or more Reducers.
 * <br/><br/>
 * Every Mapper, and every Reducer in an intermediate position, must
 * explicitly declare the key and value schemas of its output (similar to
 * {@link JobConf#setMapOutputKeySchema(Column[])}). Optionally,
 * OutputKeySortColumns, PartitionColumns, etc. may also be configured.
 * <br/><br/>
 * Example:
 * </p>
 *
 * <pre>
 * Job job = new Job();
 *
 * Pipeline pipeline = Pipeline.builder()
 *     .addMapper(TokenizerMapper.class)
 *     .setOutputKeySchema(
 *         new Column[] { new Column("word", OdpsType.STRING) })
 *     .setOutputValueSchema(
 *         new Column[] { new Column("count", OdpsType.BIGINT) })
 *     .addReducer(SumReducer.class)
 *     .setOutputKeySchema(
 *         new Column[] { new Column("count", OdpsType.BIGINT) })
 *     .setOutputValueSchema(
 *         new Column[] { new Column("word", OdpsType.STRING),
 *                        new Column("count", OdpsType.BIGINT) })
 *     .addReducer(IdentityReducer.class).createPipeline();
 *
 * job.setPipeline(pipeline);
 * job.addInput(...)
 * job.addOutput(...)
 * job.submit();
 * </pre>
 */
public class Pipeline {

  // Keys used to (de)serialize a pipeline into a JobConf. Per-node keys are
  // built as PIPELINE + nodeIndex + suffix, e.g. "odps.pipeline.0.output.key.schema".
  private final static String PIPELINE_LIST = "odps.pipeline.list";
  private final static String PIPELINE = "odps.pipeline.";
  private final static String OUTPUT_KEY_SCHEMA = ".output.key.schema";
  private final static String OUTPUT_VALUE_SCHEMA = ".output.value.schema";
  private final static String OUTPUT_KEY_SORT_COLUMNS = ".output.key.sort.columns";
  private final static String OUTPUT_KEY_SORT_ORDER = ".output.key.sort.order";
  private final static String OUTPUT_GROUP_COLUMNS = ".output.group.columns";
  private final static String PARTITION_COLUMNS = ".partition.columns";
  private final static String PARTITION_CLASS = ".partition.class";
  private final static String OUTPUT_KEY_COMPARATOR_CLASS = ".output.key.comparator.class";
  private final static String OUTPUT_KEY_GROUPING_COMPARATOR_CLASS =
      ".output.key.grouping.comparator.class";

  // Ordered list of pipeline stages (one Mapper followed by Reducers).
  private List<TransformNode> nodes = new ArrayList<TransformNode>();

  /**
   * A node of the pipeline: either a Mapper or a Reducer.
   *
   * Nodes form a doubly-linked chain (prevNode/nextNode); a node's *input*
   * schema/grouping/comparators are always delegated to the previous node's
   * *output* configuration.
   */
  public abstract static class TransformNode {

    Column[] keySchema;
    Column[] valueSchema;
    String[] sortCols;
    SortOrder[] order;
    String[] partCols;
    Class<? extends Partitioner> partitionerClass;
    @SuppressWarnings("rawtypes")
    Class<? extends RecordComparator> keyComparatorClass;
    @SuppressWarnings("rawtypes")
    Class<? extends RecordComparator> keyGroupingComparatorClass;
    String[] groupCols;
    String type;
    TransformNode prevNode;
    TransformNode nextNode;
    // -1 means "not configured"; see toJobConf, which only serializes values >= 0.
    int taskNum = -1;
    int taskMemoryMB = -1;
    int jvmMemoryMB = -1;

    /**
     * Gets the type of this pipeline node.
     *
     * @return the string "map" or "reduce"
     */
    public String getType() {
      return this.type;
    }

    /**
     * Sets the previous node in the pipeline.
     *
     * @param prev
     *     the previous node
     */
    public void setPreviousNode(TransformNode prev) {
      this.prevNode = prev;
    }

    /**
     * Gets the previous node in the pipeline.
     *
     * @return the previous node
     */
    public TransformNode getPreviousNode() {
      return this.prevNode;
    }

    /**
     * Sets the next node in the pipeline.
     *
     * @param next
     *     the next node
     */
    public void setNextNode(TransformNode next) {
      this.nextNode = next;
    }

    /**
     * Gets the next node in the pipeline.
     *
     * @return the next node
     */
    public TransformNode getNextNode() {
      return this.nextNode;
    }

    /**
     * Gets the key schema of this node's input, i.e. the output key schema
     * of the previous node.
     *
     * @return the input key schema, or null if this is the first node
     */
    public Column[] getInputKeySchema() {
      if (this.prevNode != null) {
        return this.prevNode.getOutputKeySchema();
      } else {
        return null;
      }
    }

    /**
     * Gets the value schema of this node's input, i.e. the output value
     * schema of the previous node.
     *
     * @return the input value schema, or null if this is the first node
     */
    public Column[] getInputValueSchema() {
      if (this.prevNode != null) {
        return this.prevNode.getOutputValueSchema();
      } else {
        return null;
      }
    }

    /**
     * Gets the grouping columns of this node's input, i.e. the output
     * grouping columns of the previous node.
     *
     * @return the grouping columns, or null if this is the first node
     */
    public String[] getInputGroupingColumns() {
      if (this.prevNode != null) {
        return this.prevNode.getOutputGroupingColumns();
      } else {
        return null;
      }
    }

    /**
     * Sets the output key schema of this node.
     *
     * @param keySchema
     *     the key schema
     */
    public void setOutputKeySchema(Column[] keySchema) {
      this.keySchema = keySchema;
    }

    /**
     * Gets the output key schema of this node.
     *
     * @return the key schema column definitions
     */
    public Column[] getOutputKeySchema() {
      return this.keySchema;
    }

    /**
     * Sets the output value schema of this node.
     *
     * @param valueSchema
     *     the value schema
     */
    public void setOutputValueSchema(Column[] valueSchema) {
      this.valueSchema = valueSchema;
    }

    /**
     * Gets the output value schema of this node.
     *
     * @return the value schema column definitions
     */
    public Column[] getOutputValueSchema() {
      return this.valueSchema;
    }

    /**
     * Sets the sort order (per key column) used for this node's output.
     *
     * @param order
     *     the sort orders, ascending or descending
     */
    public void setOutputKeySortOrder(SortOrder[] order) {
      this.order = order;
    }

    /**
     * Gets the sort order used for this node's output keys.
     *
     * If no order was set (or an empty one) and a key schema exists, this
     * defaults to ascending order for every key column; with no key schema
     * either, an empty array is returned.
     *
     * @return the key sort orders, ascending or descending
     */
    public SortOrder[] getOutputKeySortOrder() {
      SortOrder[] order = this.order;
      if ((order == null || order.length == 0) && this.getOutputKeySchema() != null) {
        order = new SortOrder[this.getOutputKeySchema().length];
        Arrays.fill(order, SortOrder.ASC);
      } else if (order == null) {
        order = new SortOrder[0];
      }
      return order;
    }

    /**
     * Sets the columns by which this node's output is sorted.
     *
     * @param sortCols
     *     the sort columns
     */
    public void setOutputKeySortColumns(String[] sortCols) {
      this.sortCols = sortCols;
    }

    /**
     * Gets the columns by which this node's output is sorted.
     *
     * @return the sort columns; if not explicitly set, all output key
     *     columns; null if neither was configured
     */
    public String[] getOutputKeySortColumns() {
      if (this.sortCols != null) {
        return this.sortCols;
      } else if (this.keySchema != null) {
        return SchemaUtils.getNames(getOutputKeySchema());
      } else {
        return null;
      }
    }

    /**
     * Sets the columns by which this node's output is partitioned.
     *
     * @param partCols
     *     the partition columns
     */
    public void setPartitionColumns(String[] partCols) {
      this.partCols = partCols;
    }

    /**
     * Gets the columns by which this node's output is partitioned.
     *
     * @return the partition columns; if not explicitly set, all output key
     *     columns; null if neither was configured
     */
    public String[] getPartitionColumns() {
      if (this.partCols != null) {
        return this.partCols;
      } else if (this.keySchema != null) {
        return SchemaUtils.getNames(getOutputKeySchema());
      } else {
        return null;
      }
    }

    /**
     * Sets a custom partitioner class for this node's output.
     *
     * @param theClass
     *     the partitioner class
     */
    public void setPartitionerClass(Class<? extends Partitioner> theClass) {
      this.partitionerClass = theClass;
    }

    /**
     * Gets the custom partitioner class.
     *
     * @return the partitioner class, or null if none was set
     */
    public Class<? extends Partitioner> getPartitionerClass() {
      return this.partitionerClass;
    }

    /**
     * Sets the columns by which this node's output is grouped. If not set,
     * grouping defaults to the output key columns.
     *
     * @param groupCols
     *     the grouping columns
     */
    public void setOutputGroupingColumns(String[] groupCols) {
      this.groupCols = groupCols;
    }

    /**
     * Gets the columns by which this node's output is grouped.
     *
     * @return the grouping columns; if not explicitly set, all output key
     *     columns; null if neither was configured
     */
    public String[] getOutputGroupingColumns() {
      if (this.groupCols != null) {
        return this.groupCols;
      } else if (this.keySchema != null) {
        return SchemaUtils.getNames(getOutputKeySchema());
      } else {
        return null;
      }
    }

    /**
     * Gets the sort comparator of this node's input keys. Defaults to the
     * previous node's #getOutputKeyComparatorClass; returns null if this is
     * the first node.
     *
     * @see RecordComparator
     * @return the input key sort comparator class, or null
     */
    @SuppressWarnings("rawtypes")
    public Class<? extends RecordComparator> getInputKeyComparatorClass() {
      if (this.getPreviousNode() != null) {
        return this.getPreviousNode().getOutputKeyComparatorClass();
      }
      return null;
    }

    /**
     * Gets the grouping comparator of this node's input keys. Defaults to the
     * previous node's #getOutputKeyGroupingComparatorClass; returns null if
     * this is the first node.
     *
     * @see RecordComparator
     * @return the input key grouping comparator class, or null
     */
    @SuppressWarnings("rawtypes")
    public Class<? extends RecordComparator> getInputKeyGroupingComparatorClass() {
      if (this.getPreviousNode() != null) {
        return this.getPreviousNode().getOutputKeyGroupingComparatorClass();
      }
      return null;
    }

    /**
     * Gets the configured output key sort comparator. If none is specified,
     * the framework falls back to the comparator obtained via
     * {@link WritableComparator#get(Class)}.
     *
     * @see RecordComparator
     * @return the output key sort comparator class, or null if none was set
     */
    @SuppressWarnings("rawtypes")
    public Class<? extends RecordComparator> getOutputKeyComparatorClass() {
      return keyComparatorClass;
    }

    /**
     * Sets the output key sort comparator.
     *
     * @param theClass
     *     the comparator used to sort output keys, a subclass of
     *     {@link RecordComparator}
     */
    @SuppressWarnings("rawtypes")
    public void setOutputKeyComparatorClass(Class<? extends RecordComparator> theClass) {
      keyComparatorClass = theClass;
    }

    /**
     * Gets the configured key grouping comparator; defaults to
     * {@link #getOutputKeyComparatorClass()} when unset.
     *
     * @see #setOutputKeyGroupingComparatorClass(Class)
     * @return the key grouping comparator class, or null if none was set
     */
    @SuppressWarnings("rawtypes")
    public Class<? extends RecordComparator> getOutputKeyGroupingComparatorClass() {
      return keyGroupingComparatorClass;
    }

    /**
     * Sets the key grouping comparator. If unset,
     * {@link #getOutputKeyComparatorClass()} is used for grouping.
     *
     * @param theClass
     *     the key grouping comparator, implementing the
     *     {@link RecordComparator} interface
     * @see #setOutputKeyComparatorClass(Class)
     */
    @SuppressWarnings("rawtypes")
    public void setOutputKeyGroupingComparatorClass(
        Class<? extends RecordComparator> theClass) {
      keyGroupingComparatorClass = theClass;
    }

    /**
     * Gets the Mapper or Reducer class this node wraps.
     *
     * @return the Mapper or Reducer class of this node
     */
    @SuppressWarnings("rawtypes")
    public abstract Class getTransformClass();

    /**
     * Sets the number of tasks for this node.
     *
     * @param n
     *     the number of tasks
     */
    public void setNumTasks(int n) {
      this.taskNum = n;
    }

    public int getNumTasks() {
      return this.taskNum;
    }

    /**
     * Sets the memory size of this node's tasks, in MB. Default: 2048.
     *
     * @param mem
     *     the memory size
     */
    public void setMemoryForTask(int mem) {
      this.taskMemoryMB = mem;
    }

    public int getMemoryForTask() {
      return this.taskMemoryMB;
    }

    /**
     * Sets the JVM memory size of this node's tasks, in MB. Default: 1024.
     *
     * @param mem
     *     the memory size
     */
    public void setMemoryForJVM(int mem) {
      this.jvmMemoryMB = mem;
    }

    public int getMemoryForJVM() {
      return this.jvmMemoryMB;
    }
  }

  /**
   * A Mapper node.
   */
  public static class MapNode extends TransformNode {

    private Class<? extends Mapper> mapper;

    /**
     * Defines a Mapper node.
     *
     * @param mapper
     *     the Mapper class
     */
    public MapNode(Class<? extends Mapper> mapper) {
      this.mapper = mapper;
      this.type = "map";
    }

    @Override
    public Class<? extends Mapper> getTransformClass() {
      return this.mapper;
    }
  }

  /**
   * A Reducer node.
   */
  public static class ReduceNode extends TransformNode {

    private Class<? extends Reducer> reducer;

    /**
     * Defines a Reducer node.
     *
     * @param reducer
     *     the Reducer class
     */
    public ReduceNode(Class<? extends Reducer> reducer) {
      this.reducer = reducer;
      this.type = "reduce";
    }

    @Override
    public Class<? extends Reducer> getTransformClass() {
      return this.reducer;
    }
  }

  /**
   * Defines a Pipeline object.
   *
   * @param nodes
   *     the list of pipeline nodes
   */
  public Pipeline(List<TransformNode> nodes) {
    this.nodes = nodes;
  }

  /**
   * Gets the node at the given position in the pipeline.
   *
   * @param index
   *     the node index (starting from 0)
   * @return the pipeline node, or null if the index is out of range
   */
  public TransformNode getNode(int index) {
    if (index >= 0 && index < nodes.size()) {
      return this.nodes.get(index);
    } else {
      return null;
    }
  }

  /**
   * Gets the first node of the pipeline.
   *
   * @return the first node
   */
  public TransformNode getFirstNode() {
    return getNode(0);
  }

  /**
   * Gets the last node of the pipeline.
   *
   * @return the last node
   */
  public TransformNode getLastNode() {
    return getNode(getNodeNum() - 1);
  }

  /**
   * Gets the number of nodes in the pipeline.
   *
   * @return the node count
   */
  public int getNodeNum() {
    return this.nodes.size();
  }

  /**
   * Gets the list of pipeline nodes.
   *
   * @return the node list
   */
  public List<TransformNode> getNodes() {
    return this.nodes;
  }

  /**
   * Pipeline builder; use it to create a Pipeline.
   *
   * All setXxx methods configure the most recently added node (lastNode)
   * and are silent no-ops when no node has been added yet.
   */
  public static class Builder {

    private List<TransformNode> nodes = new ArrayList<TransformNode>();
    private TransformNode lastNode;

    /**
     * Adds a Mapper node to the pipeline.
     *
     * @param mapper
     *     the Mapper class
     * @return this Builder
     */
    public Builder addMapper(Class<? extends Mapper> mapper) {
      // TODO check the previous node
      // if map after map, map after reduce can merge together
      // NOTE(review): unlike addReducer, this does not link prevNode/nextNode
      // — presumably because a Mapper is only ever the first node; confirm.
      MapNode map = new MapNode(mapper);
      this.nodes.add(map);
      this.lastNode = map;
      return this;
    }

    /**
     * Adds a Mapper node to the pipeline and configures its output in one
     * call.
     *
     * @param mapper
     *     the Mapper class
     * @param keySchema
     *     the output key schema
     * @param valueSchema
     *     the output value schema
     * @param sortCols
     *     the output sort columns
     * @param order
     *     the output sort orders
     * @param partCols
     *     the output partition columns
     * @param theClass
     *     the custom partitioner class
     * @param groupCols
     *     the output grouping columns
     * @return this Builder
     */
    public Builder addMapper(Class<? extends Mapper> mapper,
        Column[] keySchema, Column[] valueSchema, String[] sortCols,
        SortOrder[] order, String[] partCols,
        Class<? extends Partitioner> theClass, String[] groupCols) {
      addMapper(mapper).setOutputKeySchema(keySchema)
          .setOutputValueSchema(valueSchema)
          .setOutputKeySortColumns(sortCols)
          .setOutputKeySortOrder(order)
          .setPartitionColumns(partCols)
          .setPartitionerClass(theClass)
          .setOutputGroupingColumns(groupCols);
      return this;
    }

    /**
     * Adds a Reducer node to the pipeline.
     *
     * @param reducer
     *     the Reducer class
     * @return this Builder
     */
    public Builder addReducer(Class<? extends Reducer> reducer) {
      ReduceNode reduce = new ReduceNode(reducer);
      // Link the new node into the chain after the current last node.
      reduce.setPreviousNode(this.lastNode);
      if (lastNode != null) {
        lastNode.setNextNode(reduce);
      }
      this.nodes.add(reduce);
      this.lastNode = reduce;
      return this;
    }

    /**
     * Adds a Reducer node to the pipeline and configures its output in one
     * call.
     *
     * @param reducer
     *     the Reducer class
     * @param keySchema
     *     the output key schema
     * @param valueSchema
     *     the output value schema
     * @param sortCols
     *     the output sort columns
     * @param order
     *     the output sort orders
     * @param partCols
     *     the output partition columns
     * @param theClass
     *     the custom partitioner class
     * @param groupCols
     *     the output grouping columns
     * @return this Builder
     */
    public Builder addReducer(Class<? extends Reducer> reducer,
        Column[] keySchema, Column[] valueSchema, String[] sortCols,
        SortOrder[] order, String[] partCols,
        Class<? extends Partitioner> theClass, String[] groupCols) {
      addReducer(reducer).setOutputKeySchema(keySchema)
          .setOutputValueSchema(valueSchema)
          .setOutputKeySortColumns(sortCols)
          .setOutputKeySortOrder(order)
          .setPartitionColumns(partCols)
          .setPartitionerClass(theClass)
          .setOutputGroupingColumns(groupCols);
      return this;
    }

    /**
     * Sets the output key schema of the current node.
     *
     * @param keySchema
     *     the key schema
     * @return this Builder
     */
    public Builder setOutputKeySchema(Column[] keySchema) {
      if (lastNode != null) {
        lastNode.setOutputKeySchema(keySchema);
      }
      return this;
    }

    /**
     * Sets the output value schema of the current node.
     *
     * @param valueSchema
     *     the value schema
     * @return this Builder
     */
    public Builder setOutputValueSchema(Column[] valueSchema) {
      if (lastNode != null) {
        lastNode.setOutputValueSchema(valueSchema);
      }
      return this;
    }

    /**
     * Sets the output sort columns of the current node.
     *
     * @param sortCols
     *     the sort columns
     * @return this Builder
     */
    public Builder setOutputKeySortColumns(String[] sortCols) {
      if (lastNode != null) {
        lastNode.setOutputKeySortColumns(sortCols);
      }
      return this;
    }

    /**
     * Sets the output sort order (ascending or descending) of the current
     * node.
     *
     * @param order
     *     the sort orders
     * @return this Builder
     */
    public Builder setOutputKeySortOrder(SortOrder[] order) {
      if (lastNode != null) {
        lastNode.setOutputKeySortOrder(order);
      }
      return this;
    }

    /**
     * Sets the partition columns of the current node.
     *
     * @param partCols
     *     the partition columns
     * @return this Builder
     */
    public Builder setPartitionColumns(String[] partCols) {
      if (lastNode != null) {
        lastNode.setPartitionColumns(partCols);
      }
      return this;
    }

    /**
     * Sets the partitioner class of the current node.
     *
     * @param theClass
     *     the partitioner class
     * @return this Builder
     */
    public Builder setPartitionerClass(Class<? extends Partitioner> theClass) {
      if (lastNode != null) {
        lastNode.setPartitionerClass(theClass);
      }
      return this;
    }

    /**
     * Sets the number of tasks for the current node.
     *
     * @param n
     *     the number of tasks
     * @return this Builder
     */
    public Builder setNumTasks(int n) {
      if (lastNode != null) {
        lastNode.setNumTasks(n);
      }
      return this;
    }

    /**
     * Sets the memory size of the current node's tasks, in MB. Default: 2048.
     *
     * @param mem
     *     the memory size
     * @return this Builder
     */
    public Builder setMemoryForTask(int mem) {
      if (lastNode != null) {
        lastNode.setMemoryForTask(mem);
      }
      return this;
    }

    /**
     * Sets the JVM memory size of the current node's tasks, in MB.
     * Default: 1024.
     *
     * @param mem
     *     the memory size
     * @return this Builder
     */
    public Builder setMemoryForJVM(int mem) {
      if (lastNode != null) {
        lastNode.setMemoryForJVM(mem);
      }
      return this;
    }

    /**
     * Sets the output grouping columns of the current node.
     *
     * @param cols
     *     the grouping columns
     * @return this Builder
     */
    public Builder setOutputGroupingColumns(String[] cols) {
      if (lastNode != null) {
        lastNode.setOutputGroupingColumns(cols);
      }
      return this;
    }

    /**
     * Sets the output key comparator class of the current node.
     *
     * @param theClass
     *     the comparator class
     * @return this Builder
     */
    @SuppressWarnings("rawtypes")
    public Builder setOutputKeyComparatorClass(
        Class<? extends RecordComparator> theClass) {
      if (lastNode != null) {
        lastNode.setOutputKeyComparatorClass(theClass);
      }
      return this;
    }

    /**
     * Sets the output key grouping comparator class of the current node.
     *
     * @param theClass
     *     the grouping comparator class
     * @return this Builder
     */
    @SuppressWarnings("rawtypes")
    public Builder setOutputKeyGroupingComparatorClass(
        Class<? extends RecordComparator> theClass) {
      if (lastNode != null) {
        lastNode.setOutputKeyGroupingComparatorClass(theClass);
      }
      return this;
    }

    /**
     * Produces the Pipeline object from this builder.
     *
     * @return a new Pipeline object
     */
    public Pipeline createPipeline() {
      return new Pipeline(nodes);
    }
  }

  /**
   * Creates a Pipeline builder.
   *
   * @return the builder object
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Serializes a Pipeline object into a JobConf:
   *
   * <pre>
   * odps.pipeline.list=map:com.example.Map1,reduce:com.example.Reduce1
   * odps.pipeline.0.output.key.schema=count:int
   * odps.pipeline.0.output.value.schema=word:string
   * ...
   * odps.pipeline.n.partitioner.class=com.example.Partitioner
   * </pre>
   *
   * @param conf
   *     the JobConf object to write into
   * @param pipeline
   *     the pipeline to serialize
   */
  public static void toJobConf(JobConf conf, Pipeline pipeline) {
    StringBuilder sb = new StringBuilder();
    List<TransformNode> nodes = pipeline.getNodes();
    for (int i = 0; i < nodes.size(); i++) {
      TransformNode node = nodes.get(i);
      // Append "type:className" to the comma-separated pipeline list.
      sb.append(node.type);
      sb.append(":");
      sb.append(node.getTransformClass().getName());
      if (i != nodes.size() - 1) {
        sb.append(",");
      }
      if (node.getOutputKeySchema() != null) {
        conf.set(PIPELINE + i + OUTPUT_KEY_SCHEMA,
            SchemaUtils.toString(node.getOutputKeySchema()));
      }
      if (node.getOutputValueSchema() != null) {
        conf.set(PIPELINE + i + OUTPUT_VALUE_SCHEMA,
            SchemaUtils.toString(node.getOutputValueSchema()));
      }
      if (node.getOutputKeySortColumns() != null) {
        conf.set(PIPELINE + i + OUTPUT_KEY_SORT_COLUMNS,
            StringUtils.join(node.getOutputKeySortColumns(), ","));
      }
      // Sort order is always written (getOutputKeySortOrder never returns null).
      conf.set(PIPELINE + i + OUTPUT_KEY_SORT_ORDER,
          StringUtils.join(node.getOutputKeySortOrder(), ","));
      if (node.getPartitionColumns() != null) {
        conf.set(PIPELINE + i + PARTITION_COLUMNS,
            StringUtils.join(node.getPartitionColumns(), ","));
      }
      if (node.getPartitionerClass() != null) {
        conf.set(PIPELINE + i + PARTITION_CLASS,
            node.getPartitionerClass().getName());
      }
      if (node.getOutputGroupingColumns() != null) {
        conf.set(PIPELINE + i + OUTPUT_GROUP_COLUMNS,
            StringUtils.join(node.getOutputGroupingColumns(), ","));
      }
      if (node.getOutputKeyComparatorClass() != null) {
        conf.set(PIPELINE + i + OUTPUT_KEY_COMPARATOR_CLASS,
            node.getOutputKeyComparatorClass().getName());
      }
      if (node.getOutputKeyGroupingComparatorClass() != null) {
        conf.set(PIPELINE + i + OUTPUT_KEY_GROUPING_COMPARATOR_CLASS,
            node.getOutputKeyGroupingComparatorClass().getName());
      }
      // Task count / memory are only serialized when explicitly configured
      // (node defaults are -1). Node 0 maps to the "mapper" stage keys; later
      // nodes map to per-index "reducer" stage keys.
      if (node.getNumTasks() >= 0) {
        if (i == 0) {
          conf.setInt("odps.stage.mapper.num", node.getNumTasks());
        } else {
          conf.setInt("odps.stage.reducer." + i + ".num", node.getNumTasks());
        }
      }
      if (node.getMemoryForTask() >= 0) {
        if (i == 0) {
          conf.setInt("odps.stage.mapper.mem", node.getMemoryForTask());
        } else {
          conf.setInt("odps.stage.reducer." + i + ".mem", node.getMemoryForTask());
        }
      }
      if (node.getMemoryForJVM() >= 0) {
        if (i == 0) {
          conf.setInt("odps.stage.mapper.jvm.mem", node.getMemoryForJVM());
        } else {
          conf.setInt("odps.stage.reducer." + i + ".jvm.mem", node.getMemoryForJVM());
        }
      }
    }
    conf.set(PIPELINE_LIST, sb.toString());
  }

  /**
   * Reconstructs a Pipeline object from a JobConf. Returns null when the
   * JobConf carries no pipeline configuration.
   *
   * NOTE(review): this is not a perfect inverse of toJobConf — memory
   * settings (mem / jvm.mem) are never read back, and a task count is always
   * applied (default 1) even when toJobConf wrote none; confirm whether this
   * asymmetry is intended.
   *
   * @param conf
   *     the JobConf to read from
   * @return the pipeline object, or null
   */
  public static Pipeline fromJobConf(JobConf conf) {
    String pipes = conf.get(PIPELINE_LIST);
    if (pipes == null) {
      return null;
    }
    Builder builder = builder();
    String[] pipelist = pipes.split(",");
    for (int i = 0; i < pipelist.length; i++) {
      String pipe = pipelist[i];
      // Each entry has the form "type:className" (see toJobConf).
      String[] parts = pipe.split(":");
      // set class name
      try {
        Class<?> cls = conf.getClassByName(parts[1]);
        if (cls != null) {
          if (parts[0].equals("map")) {
            if (!Mapper.class.isAssignableFrom(cls)) {
              throw new RuntimeException(cls + " not Mapper");
            } else {
              builder.addMapper(cls.asSubclass(Mapper.class));
            }
          } else if (parts[0].equals("reduce")) {
            if (!Reducer.class.isAssignableFrom(cls)) {
              throw new RuntimeException(cls + " not Reducer");
            } else {
              builder.addReducer(cls.asSubclass(Reducer.class));
            }
          }
          // NOTE(review): an unknown node type falls through silently here,
          // leaving this entry's per-node settings applied to the previous
          // node — confirm whether that should be an error instead.
        } else {
          throw new RuntimeException("Class " + parts[1] + " not found");
        }
      } catch (ClassNotFoundException e) {
        throw new RuntimeException("Class " + parts[1] + " not found");
      }
      // other properties
      String keySchema = conf.get(PIPELINE + i + OUTPUT_KEY_SCHEMA);
      if (keySchema != null) {
        builder.setOutputKeySchema(SchemaUtils.fromString(keySchema));
      }
      String valueSchema = conf.get(PIPELINE + i + OUTPUT_VALUE_SCHEMA);
      if (valueSchema != null) {
        builder.setOutputValueSchema(SchemaUtils.fromString(valueSchema));
      }
      String sortCols = conf.get(PIPELINE + i + OUTPUT_KEY_SORT_COLUMNS);
      if (sortCols != null) {
        builder.setOutputKeySortColumns(sortCols.split(","));
      }
      String joined = conf.get(PIPELINE + i + OUTPUT_KEY_SORT_ORDER);
      SortOrder[] order;
      if (joined != null && !joined.isEmpty()) {
        String[] orders = joined.split(",");
        order = new SortOrder[orders.length];
        for (int j = 0; j < order.length; j++) {
          order[j] = SortOrder.valueOf(orders[j]);
        }
        builder.setOutputKeySortOrder(order);
      }
      String partCols = conf.get(PIPELINE + i + PARTITION_COLUMNS);
      if (partCols != null && !partCols.isEmpty()) {
        builder.setPartitionColumns(partCols.split(","));
      }
      String partClass = conf.get(PIPELINE + i + PARTITION_CLASS);
      if (partClass != null && !partClass.isEmpty()) {
        try {
          Class<?> cls = conf.getClassByName(partClass);
          if (cls != null) {
            if (!Partitioner.class.isAssignableFrom(cls)) {
              throw new RuntimeException(cls + " not Partitioner");
            } else {
              builder.setPartitionerClass(cls.asSubclass(Partitioner.class));
            }
          } else {
            throw new RuntimeException("Class " + partClass + " not found");
          }
        } catch (ClassNotFoundException e) {
          throw new RuntimeException("Class " + partClass + " not found");
        }
      }
      String groupCols = conf.get(PIPELINE + i + OUTPUT_GROUP_COLUMNS);
      if (groupCols != null && !groupCols.isEmpty()) {
        builder.setOutputGroupingColumns(groupCols.split(","));
      }
      String keyCmpClass = conf.get(PIPELINE + i + OUTPUT_KEY_COMPARATOR_CLASS);
      if (keyCmpClass != null && !keyCmpClass.isEmpty()) {
        try {
          Class<?> cls = conf.getClassByName(keyCmpClass);
          if (cls != null) {
            if (!RecordComparator.class.isAssignableFrom(cls)) {
              throw new RuntimeException(cls + " not RecordComparator");
            } else {
              builder.setOutputKeyComparatorClass(cls.asSubclass(RecordComparator.class));
            }
          } else {
            throw new RuntimeException("Class " + keyCmpClass + " not found");
          }
        } catch (ClassNotFoundException e) {
          throw new RuntimeException("Class " + keyCmpClass + " not found");
        }
      }
      String keyGrpCmpClass = conf.get(PIPELINE + i + OUTPUT_KEY_GROUPING_COMPARATOR_CLASS);
      if (keyGrpCmpClass != null && !keyGrpCmpClass.isEmpty()) {
        try {
          Class<?> cls = conf.getClassByName(keyGrpCmpClass);
          if (cls != null) {
            if (!RecordComparator.class.isAssignableFrom(cls)) {
              throw new RuntimeException(cls + " not RecordComparator");
            } else {
              builder.setOutputKeyGroupingComparatorClass(cls.asSubclass(RecordComparator.class));
            }
          } else {
            throw new RuntimeException("Class " + keyGrpCmpClass + " not found");
          }
        } catch (ClassNotFoundException e) {
          throw new RuntimeException("Class " + keyGrpCmpClass + " not found");
        }
      }
      int numTasks = 1;
      if (i == 0) {
        numTasks = conf.getInt("odps.stage.mapper.num", 1);
      } else {
        numTasks = conf.getInt("odps.stage.reducer." + i + ".num",
            conf.getInt("odps.stage.reducer.num", 1));
      }
      builder.setNumTasks(numTasks);
    }
    return builder.createPipeline();
  }
}