package org.bigtop.bigpetstore.etl;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.bigtop.bigpetstore.util.BigPetStoreConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * HiveViewCreator is designed to read from Pig's cleaned output. The basic
 * strategy is:
 *
 * 1) register the Pig output as an external Hive table
 * 2) use "CREATE TABLE ... AS SELECT" to select the subset Mahout needs
 *
 * Note on running locally:
 *
 * 1) Local mode requires a Hive and Hadoop tarball, with HIVE_HOME and
 *    HADOOP_HOME pointing to them.
 * 2) In HADOOP_HOME, you will need to cp the HIVE_HOME/lib/hive-serde*.jar
 *    file into HADOOP_HOME/lib.
 *
 * Then, the queries below will run. The reason for this is that the Hive
 * SerDe classes are used in the MapReduce phase, so they need to be available
 * to Hadoop itself: the regex input/output is processed via the mappers.
 */
public class HiveViewCreator implements Tool {

    public static final String HIVE_JDBC_DRIVER =
            "org.apache.hive.jdbc.HiveDriver";
    public static final String HIVE_JDBC_EMBEDDED_CONNECTION = "jdbc:hive2://";

    static final Logger log = LoggerFactory.getLogger(HiveViewCreator.class);

    static {
        // Fail fast if the Hive MapReduce execution engine is missing from
        // the classpath, rather than failing later inside a query.
        try {
            Class.forName("org.apache.hadoop.hive.ql.exec.mr.ExecDriver");
            log.info("Found Hive ExecDriver on the classpath.");
        } catch (Throwable t) {
            throw new RuntimeException(t);
        }
    }

    private Configuration conf;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Input args:
     *   args[0] - cleaned data files from Pig (TSV)
     *   args[1] - output table (desired path to the Mahout input data set)
     */
    @Override
    public int run(String[] args) throws Exception {
        Statement stmt = getStatement();
        stmt.execute("DROP TABLE IF EXISTS "
                + BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name());

        log.info("input data " + args[0]);
        log.info("output table " + args[1]);

        Path inTablePath = new Path(args[0]);
        // Unique table name so reruns don't collide with a stale definition.
        String inTableName = "cleaned" + System.currentTimeMillis();
        String outTableName = BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name();
        Path outTablePath = new Path(inTablePath.getParent(), outTableName);

        // Register the Pig output as an external table so Hive can query it
        // in place without copying the data.
        final String create = "CREATE EXTERNAL TABLE " + inTableName + " ("
                + " dump STRING,"
                + " state STRING,"
                + " trans_id STRING,"
                + " lname STRING,"
                + " fname STRING,"
                + " date STRING,"
                + " price STRING,"
                + " product STRING"
                + ") ROW FORMAT "
                + "DELIMITED FIELDS TERMINATED BY '\t' "
                + "LINES TERMINATED BY '\n' "
                + "STORED AS TEXTFILE "
                + "LOCATION '" + inTablePath + "'";
        boolean res = stmt.execute(create);
        log.info("Execute return code: " + res);

        // Will change once we add hashes into the Pig ETL clean step.
        // The literal ',' columns produce the user,item,rating layout that
        // the Mahout collaborative-filtering input expects.
        String create2 = "CREATE TABLE " + outTableName + " AS "
                + "SELECT hash(concat(state,fname,lname)),',',hash(product),',',1 "
                + "FROM " + inTableName;
        log.info("CREATE = " + create2);
        log.info("OUT PATH = " + outTablePath);
        boolean res2 = stmt.execute(create2);
        log.info("Execute return code: " + res2);

        // Export the new table's contents to a plain directory for Mahout.
        String finalOutput = String.format(
                "INSERT OVERWRITE DIRECTORY '%s' SELECT * FROM %s",
                outTablePath, outTableName);
        stmt.execute(finalOutput);
        log.info("FINAL OUTPUT STORED: " + outTablePath);
        return 0;
    }

    /**
     * Opens an embedded HiveServer2 connection and returns a statement on it.
     */
    private Statement getStatement() throws ClassNotFoundException, SQLException {
        Class.forName(HIVE_JDBC_DRIVER);
        Connection con = DriverManager.getConnection(
                HIVE_JDBC_EMBEDDED_CONNECTION, "", "");
        log.info("hive con = " + con.getClass().getName());
        return con.createStatement();
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic Hadoop options and calls setConf()
        // before run(), so the Configuration is populated.
        System.exit(ToolRunner.run(new Configuration(),
                new HiveViewCreator(), args));
    }
}
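
/*
 * Example invocation, as a sketch: the jar name and HDFS paths here are
 * hypothetical, and the local Hive/Hadoop setup described in the class
 * Javadoc is assumed to be in place.
 *
 *   hadoop jar bigpetstore.jar org.bigtop.bigpetstore.etl.HiveViewCreator \
 *       /bigpetstore/pig_out/cleaned /bigpetstore/MAHOUT_CF_IN
 *
 * args[0] is the directory of tab-separated records produced by the Pig
 * cleaning step; the Mahout-ready output lands in a sibling directory of
 * args[0] named after the MAHOUT_CF_IN table.
 */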