package org.bigtop.bigpetstore.etl;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.bigtop.bigpetstore.util.BigPetStoreConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * HiveViewCreator is designed to read from Pig's cleaned output. The basic
 * strategy is:
 *
 * 1) register the Pig output as an external Hive table
 * 2) use "CREATE TABLE ... AS SELECT" to select the subset Mahout needs
 *
 * Note on running locally:
 *
 * 1) Local mode requires a Hive and Hadoop tarball, with HIVE_HOME and
 *    HADOOP_HOME pointing to them.
 * 2) In HADOOP_HOME, you will need to cp the HIVE_HOME/lib/hive-serde*.jar
 *    file into HADOOP_HOME/lib.
 *
 * Then, the queries below will run. The reason for this is that the Hive
 * SerDe classes are used in the MapReduce phase, so they need to be available
 * to Hadoop itself: the regex input/output is processed via the mappers.
 */
public class HiveViewCreator implements Tool {

    public static final String HIVE_JDBC_DRIVER =
            "org.apache.hive.jdbc.HiveDriver";
    public static final String HIVE_JDBC_EMBEDDED_CONNECTION = "jdbc:hive2://";

    static final Logger log = LoggerFactory.getLogger(HiveViewCreator.class);

    static {
        // Fail fast if the Hive MapReduce execution engine is missing from
        // the classpath, rather than failing later inside a query.
        try {
            Class.forName("org.apache.hadoop.hive.ql.exec.mr.ExecDriver");
            log.info("Found Hive ExecDriver on the classpath.");
        } catch (Throwable t) {
            throw new RuntimeException(t);
        }
    }

    private Configuration conf;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Input args:
     *   args[0] - cleaned data files from Pig (TSV)
     *   args[1] - output table (desired path to the Mahout input data set)
     */
    @Override
    public int run(String[] args) throws Exception {
        Statement stmt = getStatement();
        stmt.execute("DROP TABLE IF EXISTS "
                + BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name());

        log.info("input data " + args[0]);
        log.info("output table " + args[1]);

        Path inTablePath = new Path(args[0]);
        // Unique table name so reruns don't collide with a stale definition.
        String inTableName = "cleaned" + System.currentTimeMillis();
        String outTableName = BigPetStoreConstants.OUTPUTS.MAHOUT_CF_IN.name();
        Path outTablePath = new Path(inTablePath.getParent(), outTableName);

        // Register the Pig output as an external table so Hive can query it
        // in place without copying the data.
        final String create = "CREATE EXTERNAL TABLE " + inTableName + " ("
                + " dump STRING,"
                + " state STRING,"
                + " trans_id STRING,"
                + " lname STRING,"
                + " fname STRING,"
                + " date STRING,"
                + " price STRING,"
                + " product STRING"
                + ") ROW FORMAT "
                + "DELIMITED FIELDS TERMINATED BY '\t' "
                + "LINES TERMINATED BY '\n' "
                + "STORED AS TEXTFILE "
                + "LOCATION '" + inTablePath + "'";
        boolean res = stmt.execute(create);
        log.info("Execute return code: " + res);

        // Will change once we add hashes into the Pig ETL clean step.
        // The literal ',' columns produce the user,item,rating layout that
        // the Mahout collaborative-filtering input expects.
        String create2 = "CREATE TABLE " + outTableName + " AS "
                + "SELECT hash(concat(state,fname,lname)),',',hash(product),',',1 "
                + "FROM " + inTableName;
        log.info("CREATE = " + create2);
        log.info("OUT PATH = " + outTablePath);
        boolean res2 = stmt.execute(create2);
        log.info("Execute return code: " + res2);

        // Export the new table's contents to a plain directory for Mahout.
        String finalOutput = String.format(
                "INSERT OVERWRITE DIRECTORY '%s' SELECT * FROM %s",
                outTablePath, outTableName);
        stmt.execute(finalOutput);
        log.info("FINAL OUTPUT STORED: " + outTablePath);
        return 0;
    }

    /**
     * Opens an embedded HiveServer2 connection and returns a statement on it.
     */
    private Statement getStatement() throws ClassNotFoundException, SQLException {
        Class.forName(HIVE_JDBC_DRIVER);
        Connection con = DriverManager.getConnection(
                HIVE_JDBC_EMBEDDED_CONNECTION, "", "");
        log.info("hive con = " + con.getClass().getName());
        return con.createStatement();
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic Hadoop options and calls setConf()
        // before run(), so the Configuration is populated.
        System.exit(ToolRunner.run(new Configuration(),
                new HiveViewCreator(), args));
    }
}
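
/*
 * Example invocation, as a sketch: the jar name and HDFS paths here are
 * hypothetical, and the local Hive/Hadoop setup described in the class
 * Javadoc is assumed to be in place.
 *
 *   hadoop jar bigpetstore.jar org.bigtop.bigpetstore.etl.HiveViewCreator \
 *       /bigpetstore/pig_out/cleaned /bigpetstore/MAHOUT_CF_IN
 *
 * args[0] is the directory of tab-separated records produced by the Pig
 * cleaning step; the Mahout-ready output lands in a sibling directory of
 * args[0] named after the MAHOUT_CF_IN table.
 */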