package com.cloudera.sa.hcu.env2.arvo.job;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.regex.Pattern;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.TextNode;

/**
 * Summary of the map/reduce job: <br>
 * <br>
 * This job takes a file where each line is of the following format: <br>
 * <br>
 * {unique id}|{field name}|{field value} <br>
 * <br>
 * An example file would be the following: <br>
 * <br>
 * Josh|Title|Data Scientist <br>
 * Josh|Age|33 <br>
 * Josh|Favorite Restaurant|Heirloom Cafe <br>
 * Ted|Office Location|Palo Alto <br>
 * Ted|Title|Systems Engineer <br>
 * Ted|Favorite Sport|Golf <br>
 * <br>
 * From this data the job creates an Avro schema like the following: <br>
 * <br>
 * { "name": "givenSchemaName", "type": "record", "fields": [ <br>
 * {"name": "Title", "type": "string"}, <br>
 * {"name": "Age", "type": "string"}, <br>
 * {"name": "Favorite_Restaurant", "type": "string"}, <br>
 * {"name": "Office_Location", "type": "string"}, <br>
 * {"name": "Favorite_Sport", "type": "string"} ] } <br>
 * <br>
 * It then writes the output file.<br>
 * <br>
 * This is all done in one map/reduce job. To do this I use the following
 * tricks: <br>
 * 1. The mapper sends its set of field names to every reducer. This is done
 * with a custom partitioner. <br>
 * 2. The Avro file is created in the reducer, not when the job starts, because
 * the schema is not known until the sort and shuffle are complete.<br>
 * <br>
 *
 * @author ted.malaska
 *
 */
public class ConvertEnvSingleTable2Avro {

  //This is the suffix of the normal reducer output directory
  //TODO need to figure out how to turn off normal reducer output
  public static final String STAT_DIR_POST_NAME = "_stats";

  //These are the names of the configuration properties that are set by the main function
  public static final String CONF_OUTPUT_DIR = "convertEnt2Avro.outputDir";
  public static final String CONF_NUM_OF_REDUCERS = "convertEnt2Avro.numOfReducers";
  public static final String CONF_SCHEMA_NAME = "convertEnt2Avro.schemaName";

  //Precompiled pattern for splitting pipe-delimited strings
  public static Pattern pipeSplit = Pattern.compile("\\|");

  // TODO in future versions I may try to figure out the schema (field types) at run time
  public static Pattern isIntPattern = Pattern.compile("^(\\+|-)?\\d+$");
  public static Pattern isDoublePattern = Pattern.compile("^(\\+|-)?[0-9]+(\\.[0-9][0-9]?)?");

  public static class CustomMapper extends Mapper<LongWritable, Text, Text, Text> {

    int numOfReducers = 0;
    HashSet<String> fieldSet = new HashSet<String>();
    Text newKey = new Text();
    Text newValue = new Text();

    static final int KEY_IDX = 0;
    static final int FIELD_IDX = 1;
    static final int VALUE_IDX = 2;

    public static final String VALUES_SORT_FLAG = "V";

    @Override
    public void setup(Context context) {
      numOfReducers = Integer.parseInt(context.getConfiguration().get(CONF_NUM_OF_REDUCERS));
    }

    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String values[] = pipeSplit.split(value.toString());

      // The VALUES_SORT_FLAG prefix makes data keys sort after the numeric
      // field-broadcast keys emitted in cleanup(), so every reducer sees all
      // field names before its first data record.
      newKey.set(VALUES_SORT_FLAG + values[KEY_IDX]);

      // Replace spaces and hyphens so the field name is a legal Avro name
      String field = values[FIELD_IDX].replace(' ', '_');
      field = field.replace('-', '_');

      newValue.set(field + "|" + values[VALUE_IDX]);

      context.write(newKey, newValue);

      fieldSet.add(field);
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      // Broadcast every field name seen by this mapper to every reducer
      for (String field : fieldSet) {
        for (int i = 0; i < numOfReducers; i++) {
          newKey.set(Integer.toString(i) + "|" + field);
          newValue.set("");
          context.write(newKey, newValue);
        }
      }
    }
  }

  public static class CustomReducer extends Reducer<Text, Text, LongWritable, Text> {

    LongWritable newKey = new LongWritable();
    Text newValue = new Text();

    boolean isSchemaBuildingMode = true;

    FSDataOutputStream dataOutputStream;
    DataFileWriter<GenericRecord> dataFileWriter;
    Path outputFilePath;
    FileSystem hdfs;
    Schema schema;
    ArrayList<Schema.Field> fieldList = new ArrayList<Schema.Field>();
    boolean closedFile = false;
    String schemaName;

    @Override
    public void setup(Context context) throws IOException {
      context.getCounter("debug", "Setup").increment(1);

      String outputDir = context.getConfiguration().get(CONF_OUTPUT_DIR);

      Configuration config = new Configuration();
      hdfs = FileSystem.get(config);

      outputFilePath = new Path(outputDir + "/part-r-"
          + context.getTaskAttemptID().getTaskID().getId() + "-"
          + context.getTaskAttemptID().getId());

      dataOutputStream = hdfs.create(outputFilePath);

      DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>();
      // writer.setSchema(s); // I guess I don't need this

      dataFileWriter = new DataFileWriter<GenericRecord>(writer);

      //add the key field to the field list: schemaName_ID
      Schema s = Schema.create(Schema.Type.STRING);
      JsonNode defaultValue = new TextNode("-");

      schemaName = context.getConfiguration().get(CONF_SCHEMA_NAME);

      Schema.Field schemaField = new Schema.Field(schemaName + "_ID", s,
          "Field: " + schemaName + "_ID", defaultValue);
      fieldList.add(schemaField);
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      try {
        dataFileWriter.close();
        dataOutputStream.close();
        hdfs.close();
      } catch (Exception e) {
        // do nothing right now
      }
      closedFile = true;
    }

    @Override
    public void run(Context context) throws IOException, InterruptedException {
      try {
        setup(context);
        while (context.nextKey()) {
          reduce(context.getCurrentKey(), context.getValues(), context);
        }
        cleanup(context);
      } catch (Exception e) {
        exceptionThrown();
        throw new RuntimeException(e);
      }
    }

    public void exceptionThrown() throws IOException, InterruptedException {
      // if the file hasn't been closed yet, close it and delete it
      if (closedFile == false) {
        try {
          dataFileWriter.close();
          dataOutputStream.close();
        } catch (Exception e) {
          // already closed
        }
        try {
          hdfs.deleteOnExit(outputFilePath);
          hdfs.close();
        } catch (Exception e) {
          // couldn't delete
        }
      }
    }

    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {

      if (key.toString().startsWith(CustomMapper.VALUES_SORT_FLAG)) {
        // If isSchemaBuildingMode is still true, this is the first data record,
        // so the schema is now complete and the Avro writer can be created.
        if (isSchemaBuildingMode == true) {
          isSchemaBuildingMode = false;
          setUpAvroWriter(context);
        }

        context.getCounter("debug", "Record-Add").increment(1);

        //TODO I think I can avoid constructing this every time
        GenericRecord datum = new GenericData.Record(schema);

        applyDefaultValuesToRecord(datum);

        //add the key as schemaName_ID
        // - remember to remove the first character of the key because it was added for sorting reasons
        datum.put(schemaName + "_ID", new Utf8(key.toString().substring(1)));

        //add all the other fields
        for (Text value : values) {
          String valueParts[] = pipeSplit.split(value.toString());
          try {
            datum.put(valueParts[0], new Utf8(valueParts[1]));
          } catch (Exception e) {
            throw new RuntimeException(valueParts[0] + " " + valueParts[1], e);
          }
        }

        dataFileWriter.append(datum);
      } else {
        // We are still building the schema
        addFieldToFieldList(key);
      }
    }

    private void applyDefaultValuesToRecord(GenericRecord datum) {
      for (Schema.Field field : fieldList) {
        datum.put(field.name(), new Utf8(""));
      }
    }

    private void setUpAvroWriter(Context context) throws IOException {
      context.getCounter("debug", "Record-Defined").increment(1);

      schema = Schema.createRecord(schemaName, "Generated Schema", null, false);
      schema.setFields(fieldList);

      dataFileWriter.create(schema, dataOutputStream);
    }

    private void addFieldToFieldList(Text key) {
      String field = pipeSplit.split(key.toString())[1];

      Schema s = Schema.create(Schema.Type.STRING);
      JsonNode defaultValue = new TextNode("-");

      Schema.Field schemaField = new Schema.Field(field, s, "Field: " + field, defaultValue);
      fieldList.add(schemaField);
    }
  }

  public static class CustomPartitioner extends Partitioner<Text, Text> {

    public static final int GROUP_KEY_IDX = 0;

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
      if (numPartitions == 1) {
        return 0;
      } else {
        String keyString = key.toString();
        if (keyString.startsWith(CustomMapper.VALUES_SORT_FLAG)) {
          // Data records are spread across reducers by hashing the key
          int result = keyString.hashCode() % numPartitions;
          return Math.abs(result);
        } else {
          // Field-broadcast keys carry their target reducer number as a prefix
          return Integer.parseInt(pipeSplit.split(keyString)[GROUP_KEY_IDX]);
        }
      }
    }
  }

  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length != 4 || args[0].contains("-h")) {
      System.out.println("ConvertEnt2Avro <inputPath> <outputPath> <schemaName> <# reducers>");
      System.out.println();
      System.out.println("ConvertEnt2Avro ./input ./old testSchemaName 10");
      return;
    }

    // Get values from args
    String inputPath = args[0];
    String outputPath = args[1];
    String schemaName = args[2];
    String numberOfReducers = args[3];

    // Create job
    Job job = new Job();
    job.setJarByClass(ConvertEnvSingleTable2Avro.class);

    // Set some configurations
    job.getConfiguration().set(CONF_OUTPUT_DIR, outputPath);
    job.getConfiguration().set(CONF_NUM_OF_REDUCERS, numberOfReducers);
    job.getConfiguration().set(CONF_SCHEMA_NAME, schemaName);

    // Define input format and path
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // Define output format and path
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath + STAT_DIR_POST_NAME));

    // Define the mapper and reducer
    job.setMapperClass(CustomMapper.class);
    job.setReducerClass(CustomReducer.class);
    job.setPartitionerClass(CustomPartitioner.class);

    // Define the key and value formats
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(Integer.parseInt(numberOfReducers));

    // Create the output directory
    createHdfsDirectoryForFinalOutput(outputPath);

    // Run the job and wait for it to finish
    job.waitForCompletion(true);

    //delete the stat directory because it is empty
    //TODO if we could find a way to not make the stat directory that would be great
    deleteStatDirectory(outputPath);
  }

  private static void deleteStatDirectory(String outputPath) throws IOException {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    hdfs.deleteOnExit(new Path(outputPath + STAT_DIR_POST_NAME));
  }

  private static void createHdfsDirectoryForFinalOutput(String outputPath) throws IOException {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    hdfs.mkdirs(new Path(outputPath));
    hdfs.close();
  }
}