/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.vertica;

import java.io.IOException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Output formatter for loading reducer output to Vertica.
 */
public class VerticaOutputFormat extends OutputFormat<Text, VerticaRecord> {
  String delimiter = VerticaConfiguration.DELIMITER;
  String terminator = VerticaConfiguration.RECORD_TERMINATER;

  /**
   * Set the output table.
   *
   * @param job the job being configured
   * @param tableName name of the table to load into
   */
  public static void setOutput(Job job, String tableName) {
    setOutput(job, tableName, false);
  }

  /**
   * Set the output table and whether to drop it before loading.
   *
   * @param job the job being configured
   * @param tableName name of the table to load into
   * @param dropTable true to drop the table before loading
   */
  public static void setOutput(Job job, String tableName, boolean dropTable) {
    setOutput(job, tableName, dropTable, (String[]) null);
  }

  /**
   * Set the output table, whether to drop it before loading, and the create
   * table specification used if it doesn't exist.
   *
   * @param job the job being configured
   * @param tableName name of the table to load into
   * @param dropTable true to drop the table before loading
   * @param tableDef
   *          list of column definitions such as "foo int", "bar varchar(10)"
   */
  public static void setOutput(Job job, String tableName, boolean dropTable,
      String... tableDef) {
    VerticaConfiguration vtconfig =
        new VerticaConfiguration(job.getConfiguration());
    vtconfig.setOutputTableName(tableName);
    vtconfig.setOutputTableDef(tableDef);
    vtconfig.setDropTable(dropTable);
  }
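  // Usage sketch (illustrative; the job name, table name and column
  // definitions below are hypothetical):
  //
  //   Job job = new Job(conf, "vertica-load");
  //   job.setOutputKeyClass(Text.class);
  //   job.setOutputValueClass(VerticaRecord.class);
  //   job.setOutputFormatClass(VerticaOutputFormat.class);
  //   VerticaOutputFormat.setOutput(job, "mytable", true,
  //       "key int", "value varchar(64)");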
  // TODO: handle collection of output tables
  private class VerticaTable {
    // placeholder - currently only a single output table is supported
  }

  /** {@inheritDoc} */
  public void checkOutputSpecs(JobContext context) throws IOException {
    VerticaUtil.checkOutputSpecs(context.getConfiguration());
    VerticaConfiguration vtconfig =
        new VerticaConfiguration(context.getConfiguration());
    delimiter = vtconfig.getOutputDelimiter();
    terminator = vtconfig.getOutputRecordTerminator();
  }

  /**
   * Check output specs for testing (does not connect to the database).
   *
   * @param context the job context
   * @param test true if testing (currently unused)
   * @throws IOException
   */
  public void checkOutputSpecs(JobContext context, boolean test)
      throws IOException {
    VerticaUtil.checkOutputSpecs(context.getConfiguration());
    VerticaConfiguration vtconfig =
        new VerticaConfiguration(context.getConfiguration());
    delimiter = vtconfig.getOutputDelimiter();
    terminator = vtconfig.getOutputRecordTerminator();
  }

  /** {@inheritDoc} */
  public RecordWriter<Text, VerticaRecord> getRecordWriter(
      TaskAttemptContext context) throws IOException {
    VerticaConfiguration config =
        new VerticaConfiguration(context.getConfiguration());

    // re-read the configured delimiter and terminator: checkOutputSpecs runs
    // client-side, so task-side instances would otherwise keep the defaults
    delimiter = config.getOutputDelimiter();
    terminator = config.getOutputRecordTerminator();

    String name = context.getJobName();
    // TODO: use explicit date formats
    String table = config.getOutputTableName();
    String copyStmt = "COPY " + table + " FROM STDIN" + " DELIMITER '"
        + delimiter + "' RECORD TERMINATOR '" + terminator + "' STREAM NAME '"
        + name + "' DIRECT";

    try {
      Connection conn = config.getConnection(true);
      return new VerticaRecordWriter(conn, copyStmt, table, delimiter,
          terminator);
    } catch (Exception e) {
      throw new IOException(e);
    }
  }
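  // Shape of the COPY statement built above (the table name, delimiter,
  // terminator and stream name shown here are illustrative; the real values
  // come from VerticaConfiguration and the job context):
  //
  //   COPY mytable FROM STDIN DELIMITER ',' RECORD TERMINATOR '\n'
  //     STREAM NAME 'vertica-load' DIRECT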
  /** Build an empty {@link VerticaRecord} for the configured output table. */
  public static VerticaRecord getValue(Configuration conf) throws Exception {
    VerticaConfiguration config = new VerticaConfiguration(conf);
    String table = config.getOutputTableName();
    Connection conn = config.getConnection(true);
    return (new VerticaRecordWriter(conn, "", table,
        config.getOutputDelimiter(), config.getOutputRecordTerminator()))
        .getValue();
  }

  /**
   * Optionally called at the end of a job to optimize any newly created and
   * loaded tables. Useful for new tables with more than 100k records.
   *
   * @param conf the job configuration
   * @throws Exception
   */
  public static void optimize(Configuration conf) throws Exception {
    VerticaConfiguration vtconfig = new VerticaConfiguration(conf);
    Connection conn = vtconfig.getConnection(true);

    // TODO: consider more tables and skip tables with non-temp projections
    String tableName = vtconfig.getOutputTableName();
    Statement stmt = conn.createStatement();
    ResultSet rs = null;
    StringBuffer designTables = new StringBuffer(tableName);
    HashSet<String> tablesWithTemp = new HashSet<String>();

    // fully qualify the table name - defaults to public.<table>
    if (tableName.indexOf(".") == -1) {
      tableName = "public." + tableName;
    }
    // for now just add the single output table
    tablesWithTemp.add(tableName);

    // map from table name to set of projection names
    HashMap<String, Collection<String>> tableProj =
        new HashMap<String, Collection<String>>();
    rs = stmt.executeQuery(
        "select schemaname, anchortablename, projname from vt_projection;");
    while (rs.next()) {
      String ptable = rs.getString(1) + "." + rs.getString(2);
      if (!tableProj.containsKey(ptable)) {
        tableProj.put(ptable, new HashSet<String>());
      }
      tableProj.get(ptable).add(rs.getString(3));
    }

    for (String table : tablesWithTemp) {
      if (!tableProj.containsKey(table)) {
        throw new RuntimeException("Cannot optimize table with no data: "
            + table);
      }
    }

    String designName = Integer.toString(conn.hashCode());
    stmt.execute("select create_projection_design('" + designName + "', '', '"
        + designTables.toString() + "')");

    if (VerticaUtil.verticaVersion(conf, true) >= VerticaConfiguration.VERSION_3_5) {
      stmt.execute("select deploy_design('" + designName + "', '" + designName
          + "')");
    } else {
      rs = stmt.executeQuery("select get_design_script('" + designName + "', '"
          + designName + "')");
      rs.next();
      String[] projSet = rs.getString(1).split(";");
      for (String proj : projSet) {
        stmt.execute(proj);
      }
      stmt.execute("select start_refresh()");

      // poll for refresh complete
      boolean refreshing = true;
      Long timeout = vtconfig.getOptimizePollTimeout();
      while (refreshing) {
        refreshing = false;
        rs = stmt.executeQuery(
            "select table_name, status from vt_projection_refresh");
        while (rs.next()) {
          String table = rs.getString(1);
          String stat = rs.getString(2);
          if (stat.equals("refreshing") && tablesWithTemp.contains(table)) {
            refreshing = true;
          }
        }
        rs.close();
        Thread.sleep(timeout);
      }

      // refresh done, move the ancient history mark (AHM) and drop the temp
      // projections
      stmt.execute("select make_ahm_now()");
      for (String table : tablesWithTemp) {
        for (String proj : tableProj.get(table)) {
          stmt.execute("DROP PROJECTION " + proj);
        }
      }
    }
    // close the statement on both version paths, not just the pre-3.5 one
    stmt.close();
  }

  /** {@inheritDoc} */
  public OutputCommitter getOutputCommitter(TaskAttemptContext context)
      throws IOException, InterruptedException {
    return new FileOutputCommitter(FileOutputFormat.getOutputPath(context),
        context);
  }
}
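// Driver-side sketch (illustrative, assumes a configured Job named "job"):
// after a successful load, optionally optimize the newly created table.
//
//   if (job.waitForCompletion(true)) {
//     VerticaOutputFormat.optimize(job.getConfiguration());
//   }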