/* Copyright (c) 2005 - 2012 Vertica, an HP company -*- Java -*- */ package com.vertica.hadoop; import java.io.IOException; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.StringTokenizer; import java.math.BigDecimal; import java.sql.Types; import java.sql.Date; import java.sql.Time; import java.sql.Timestamp; import java.text.ParseException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import com.vertica.hadoop.VerticaInputFormat; import com.vertica.hadoop.VerticaOutputFormat; import com.vertica.hadoop.VerticaRecord; public class hdfs2vertica extends Configured implements Tool { public static class Map extends Mapper<LongWritable, Text, Text, VerticaRecord> { VerticaRecord record = null; Integer numColumns = null; String delimiter = null; String table = null; ArrayList<String> columns = null; public void setup(Context context) throws IOException, InterruptedException { super.setup(context); try { record = new VerticaRecord(context.getConfiguration()); numColumns = new Integer(record.size()); Configuration conf = context.getConfiguration(); String colNames = new String(conf.get("hdfs2vertica.columns")); StringTokenizer tok = new StringTokenizer(colNames, ","); if (tok.countTokens() != numColumns) { throw new IOException("The number of columns in the file (" + tok.countTokens() + ") does not match the number of columns (" + numColumns + ") in the table"); } columns = new ArrayList<String>(); for (int count = 0; count < numColumns; count++) { columns.add(tok.nextToken()); } delimiter = new String(conf.get("hdfs2vertica.delimiter")); table = new String(conf.get("hdfs2vertica.table")); } catch (Exception e) { throw new IOException(e); } } public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); if (record == null) { throw new IOException("No output record found"); } StringTokenizer tok = new StringTokenizer(line, delimiter); if (tok.countTokens() != numColumns) { throw new IOException("The number of columns in the file (" + tok.countTokens() + ") does not match the number of columns (" + numColumns + ") in row " + value.toString()); } try { for (int count = 0; count < numColumns; count++) { record.setFromString(columns.get(count), tok.nextToken()); } } catch (ParseException p) { throw new IOException(p.getMessage()); } context.write(new Text(table), record); } } public Job getJob(String[] args) throws IOException { Configuration conf = getConf(); Job job = new Job(conf); conf = job.getConfiguration(); job.setJobName("HDFS To Vertica Transfer"); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VerticaRecord.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VerticaRecord.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(VerticaOutputFormat.class); job.setJarByClass(hdfs2vertica.class); job.setMapperClass(Map.class); return job; } @SuppressWarnings("serial") @Override public int run(String[] args) throws Exception { if (args.length != 4) { throw new IOException("Expect <file path> <table name> <delimiter> <file schema>"); } Job job = getJob(args); Configuration conf = job.getConfiguration(); conf.set("hdfs2vertica.table",args[1]); conf.set("hdfs2vertica.delimiter",args[2]); conf.set("hdfs2vertica.columns",args[3]); VerticaOutputFormat.setOutput(job, args[1]); FileInputFormat.setInputPaths(job, args[0]); job.waitForCompletion(true); return 0; } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new hdfs2vertica(), args); System.exit(res); } }