/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.aliyun.odps.graph; import java.io.IOException; import java.util.Properties; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.aliyun.odps.Odps; import com.aliyun.odps.OdpsException; import com.aliyun.odps.account.Account; import com.aliyun.odps.account.AliyunAccount; import com.aliyun.odps.conf.Configuration; import com.aliyun.odps.counter.Counters; import com.aliyun.odps.graph.job.JobRunner; import com.aliyun.odps.mapred.RunningJob; import com.aliyun.odps.mapred.conf.SessionState; import com.aliyun.odps.utils.ReflectionUtils; import com.aliyun.odps.utils.StringUtils; /** * GraphJob 继承自 {@link JobConf},用于定义、提交和管理一个 ODPS Graph 作业. * * <p> * ODPS Graph 作业属于一类 <a * href="http://en.wikipedia.org/wiki/Bulk_synchronous_parallel" >BSP (Bulk * Synchronous Parallel)</a> 程序,通过构造一个有向图,然后迭代对图进行编辑处理完成计算任务,迭代终止条件允许自定义。 * </p> * * <p> * 一个 ODPS Graph 程序逻辑如下: * <ol> * <li><b>加载图:</b> * <ul> * <li>图是一个由点 和边 ({@link Vertex}/{@link Edge})组成的有向图,点和边包含值; * <li>使用 {@link GraphLoader} 将表的记录解析为点及其出边; * <li>分布式化:按点 ID 哈希,分配到相应的 Worker; * </ul> * <li><b>超步(super step):</b> * <ul> * <li>一次迭代为一个超步,遍历所有非结束的点,并调用其 * {@linkplain Vertex#compute(ComputeContext, Iterable) compute} 方法; * <li>在用户实现的 {@linkplain Vertex#compute(ComputeContext, Iterable) compute} 方法中: * <ul> * <li>处理上一次迭代发给当前点的消息; * <li>根据需要对图进行编辑:1)修改点/边的取值;2)发送消息给某些点;3)增加/删除点或边; * <li>通过 {@link Aggregator} 汇总信息到全局信息; * <li>设置当前点状态,结束或非结束状态; * <li>迭代进行过程中,框架会将消息以异步的方式发送到对应 Worker 并在下一轮迭代时进行处理,用户无需关心; * </ul> * </ul> * <li><b>迭代终止(满足以下任意一条):</b> * <ul> * <li>所有点处于结束状态且没有新消息产生 * <li>到达最大迭代次数; * <li>某个 {@link Aggregator} 的 * {@linkplain Aggregator#terminate(WorkerContext, com.aliyun.odps.io.Writable) * terminate} 返回true; * </ul> * </ol> * </p> * * <p> * GraphJob 提供了两类接口: * </p> * * <p> * <b>第一类:用于定义一个 ODPS Graph 作业,这类接口继承自 {@link JobConf},主要包括:</b> * </p> * * <p> * 指定 ODPS Graph 的具体实现类: * <ul> * <li>{@link #setWorkerComputerClass(Class)} * <li>{@link #setGraphLoaderClass(Class)} * <li>{@link #setVertexClass(Class)} * <li>{@link #setAggregatorClass(Class)} * <li>{@link #setAggregatorClass(Class...)} * <li>{@link #setPartitionerClass(Class)} * <li>{@link #setCombinerClass(Class)} * </ul> * 其中,{@link GraphLoader} 和 {@link Vertex} 必须提供实现,其他根据需要可选。 * </p> * * <p> * 指定作业的输入输出: * <ul> * <li>{@link #addInput(TableInfo)} * <li>{@link #addInput(TableInfo, String[])} * <li>{@link #addOutput(TableInfo)} * <li>{@link #addOutput(TableInfo, boolean)} * <li>{@link #addOutput(TableInfo, String)} * <li>{@link #addOutput(TableInfo, String, boolean)} * </ul> * </p> * * <p> * 声明本作业用到的 ODPS 资源: * <ul> * <li>{@link #addCacheResources(String)},此接口作用与 jar -resources 声明资源的效果一样 * <li>{@link #addCacheResourcesToClassPath(String)} 此接口作用与 jar -libjars * 声明资源的效果一样 * </ul> * </p> * * <p> * 指定一些高级选项,指示 ODPS Graph 执行框架该如何执行这个作业,例如: : * <ul> * <li>{@link #setSplitSize(long)} 设置输入的切分大小(单位 MB,默认值 256),会影响 Worker 数目; * <li>{@link #setRuntimePartitioning(boolean)} 指示 Worker 在加载点后是否进行重新分发,默认值是 * true; * <li>{@link #setMaxIteration(int)} 设置最大迭代次数,最大迭代次数是迭代终止的条件之一,默认值是 * -1,若取值<=0,则表示最大迭代次数不作为迭代终止条件; * </ul> * </p> * * <p> * <b>第二类:用于提交和管理一个 ODPS Graph 作业,主要包括:</b> * <ul> * <li>{@link #run()} 提交作业并等待作业结束,作业失败则会抛异常,阻塞(同步)方式; * <li>{@link #submit()} 提交作业立即返回,非阻塞(异步)方式; * <li>{@link #isComplete()} 查询作业是否结束(成功、失败或被杀),通常在非阻塞方式提交作业时使用; * <li>{@link #isSuccessful()} 查询作业是否成功,通常在非阻塞方式提交作业时使用; * <li>{@link #getCounters()} 获取作业计数信息; * </ul> * </p> * * <p> * 代码示例,摘自PageRank: * * <pre> * {@code * public static void main(String[] args) throws IOException { * * GraphJob job = new GraphJob(); * * job.setGraphLoaderClass(PageRankGraphLoader.class); * job.setVertexClass(PageRankVertex.class); * job.addInput(new TableInfo(args[0])); * job.addOutput(new TableInfo(args[1])); * * job.setMaxIteration(30); * * job.run(); * } } * </pre> * * </blockquote> * </p> * * @see Vertex * @see GraphLoader * @see Aggregator * @see WorkerComputer */ public class GraphJob extends JobConf { private static final Log LOG = LogFactory.getLog(GraphJob.class); private RunningJob rJob = null; /** * 构造一个 ODPS Graph 作业. */ public GraphJob() { super(); } /** * 构造一个 ODPS Graph 作业,可以指示是否加载 CLASSPATH 路径上的 odps-graph.xml 配置文件. * * @param loadDefaults * 指示是否加载 CLASSPATH 路径上的 odps-graph.xml 配置文件 */ @Deprecated public GraphJob(boolean loadDefaults) { super(loadDefaults); } /** * 构造一个 ODPS Graph 作业. * * @param conf * 配置管理器 * @param js * 作业初始状态,定义或运行状态 */ @Deprecated public GraphJob(Configuration conf, JobState js) { super(conf, js); } /** * 构造一个 ODPS Graph 作业. * * @param conf * 配置管理器 */ public GraphJob(Configuration conf) { super(conf); } /** * 构造一个 ODPS Graph 作业. * * 传入一个Configuration-format XML 配置文件,格式示例: * * <pre> * <configuration> * <property> * <name>com.mycomp.xxx</name> * <value>xxx</value> * </property> * ... ... * </configuration> * </pre> * * @param config * Configuration-format XML 配置文件 */ @Deprecated public GraphJob(String config) { super(config); } /** * 查询作业是否结束. * * @return 作业结束返回true,否则返回false * @throws IOException */ public boolean isComplete() throws IOException { ensureState(JobState.RUNNING); return rJob.isComplete(); } /** * 查询作业实例是否运行成功. * * @return 作业成功返回true,否则返回false * @throws IOException */ public boolean isSuccessful() throws IOException { ensureState(JobState.RUNNING); return rJob.isSuccessful(); } /** * Kill 此作业运行实例 * * @throws IOException */ public void killJob() throws IOException { ensureState(JobState.RUNNING); rJob.killJob(); } /** * 非阻塞(异步)方式提交 ODPS Graph 作业后立即返回. * * <p> * 只有当提交作业发生异常抛{@link IOException}(注意:这与 {@link #run()} 异常行为不同, {@link #run()} * 在作业失败时会抛异常)。 * </p> * * <p> * 使用本方法提交作业,可以轮询作业状态,示例代码: * * <pre> * GraphJob job = new GraphJob(); * ... //config job * job.submit(); * while (!job.isComplete()) { * Thread.sleep(4000); // do your work or sleep * } * if (job.isSuccessful()) { * System.out.println("Job Success!"); * } else { * System.err.println("Job Failed!"); * } * </pre> * * </p> * * @throws IOException * 作业提交失败时抛异常 */ public void submit() throws IOException { ensureState(JobState.DEFINE); try { parseArgs(); String runner = "com.aliyun.odps.graph.job.NetworkJobRunner"; if (SessionState.get().isLocalRun()) { runner = "com.aliyun.odps.graph.local.LocalGraphJobRunner"; } JobRunner jobrunner = null; try { Class<? extends JobRunner> clz = (Class<? extends JobRunner>) Class.forName(runner); jobrunner = ReflectionUtils.newInstance(clz, this); } catch (ClassNotFoundException e) { LOG.fatal("Internal error: currupted installation.", e); throw new RuntimeException(e); } rJob = jobrunner.submit(); } catch (OdpsException oe) { LOG.error(StringUtils.stringifyException(oe)); throw new IOException(oe.getMessage()); } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); throw new IOException(e.getMessage()); } state = JobState.RUNNING; } /** * 阻塞(同步)方式提交 ODPS Graph 作业并等待作业结束. * * <p> * 以下情况发生时抛{@link IOException}: * <ul> * <li>提交作业时异常 * <li>轮询作业状态异常 * <li>作业失败,注意:这与{@link #submit()}异常行为不同 * </ul> * </p> * * <p> * 作业主程序(main函数)需要谨慎处理该异常,因为会影响到console的返回值: <br/> * 如果不catch异常,作业失败时会抛出异常,console返回值为非0;如果catch异常且不再向外抛出,即使作业失败,console返回值也为0。<br/> * </p> * * 示例代码: * * <pre> * GraphJob job = new GraphJob(); * ... //config job * job.run(); * </pre> * * @throws IOException * 如果发生提交作业异常、轮询作业状态异常或者作业失败,则抛 IOException 异常 * @see #submit() */ public void run() throws IOException { if (state == JobState.DEFINE) { submit(); } rJob.waitForCompletion(); if (!rJob.isSuccessful()) { throw new IOException("Job failed!"); } } /** * 获取作业运行实例的 Counters 信息,ODPS Graph 运行框架会汇总所有 Worker 设置的 Counters. * * @return 作业运行实例的 Counters 信息 * @throws IOException */ public Counters getCounters() throws IOException { return rJob.getCounters(); } private void parseArgs() { Properties prop = System.getProperties(); String runmode = prop.getProperty("odps.runner.mode"); if (runmode != null && runmode.length() != 0) { SessionState.get().setLocalRun(runmode.equalsIgnoreCase("local")); } String resources = prop.getProperty("odps.cache.resources"); if (resources != null && !resources.trim().isEmpty()) { this.addCacheResourcesToClassPath(resources); } String project = prop.getProperty("odps.project.name"); if (prop.getProperty("odps.access.id") != null) { String endpoint = prop.getProperty("odps.end.point"); String accessId = prop.getProperty("odps.access.id"); String accessKey = prop.getProperty("odps.access.key"); Account account = new AliyunAccount(accessId, accessKey); Odps odps = new Odps(account); odps.setDefaultProject(project); if (endpoint != null && endpoint.length() != 0) { odps.setEndpoint(endpoint); } String logViewHost = prop.getProperty("odps.logview.host"); if (logViewHost != null && logViewHost.length() != 0) { odps.setLogViewHost(logViewHost); } // set running cluster to new odps String runningCluster = SessionState.get().getOdps().instances().getDefaultRunningCluster(); odps.instances().setDefaultRunningCluster(runningCluster); SessionState.get().setOdps(odps); } // if in local mode and no odps in sessionState, fill it. else if (SessionState.get().isLocalRun() && SessionState.get().getOdps() == null) { Account account = new AliyunAccount("defaultId", "defaultKey"); Odps odps = new Odps(account); odps.setDefaultProject(project); SessionState.get().setOdps(odps); } } }