// TableMRSortedTableZebraKeyGenerator.java example
//
// Explorer
// flare-spork-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.zebra.mapreduce;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.zebra.mapreduce.BasicTableOutputFormat;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DefaultTuple;
import org.apache.pig.data.Tuple;

/**
 * This is a complete MR sample for writing a sorted Zebra table. It doesn't
 * contain the 'read' part, but that should be similar and easier to write.
 * Refer to the test cases in the same directory.
 * 
 * Assume the input files contain rows of word and count, separated by a space:
 * 
 * <pre>
 * this 2
 * is 1
 * a 4 
 * test 2 
 * hello 1 
 * world 3
 * </pre>
 * 
 */
public class TableMRSortedTableZebraKeyGenerator {
	/**
	 * Mapper that turns each "word count" text line into a two-column row
	 * tuple and emits it under a Zebra sort key built from the same two values.
	 *
	 * Lines that do not split into exactly two space-separated fields, or
	 * whose second field is not an integer, are skipped.
	 */
	static class MapClass extends
	Mapper<LongWritable, Text, BytesWritable, Tuple> {
		/** Sort key produced for the most recently mapped row. */
		private BytesWritable bytesKey;
		/** Reusable output row, created once in setup() from the output schema. */
		private Tuple tupleRow;
		/** Opaque sort key generator returned by BasicTableOutputFormat. */
		private Object javaObj;

		/**
		 * Parses one input line and writes (sort key, row) to the context.
		 *
		 * @param key     input key supplied by the input format (not used)
		 * @param value   one input line, expected to be "word count"
		 * @param context task context the output pair is written to
		 * @throws IOException if the row cannot be populated, the sort key
		 *         cannot be generated, or the output cannot be written
		 * @throws InterruptedException if writing the output is interrupted
		 */
		@Override
		public void map(LongWritable key, Text value, Context context)
		throws IOException, InterruptedException {
			// value should contain "word count"
			String[] wdct = value.toString().split(" ");
			if (wdct.length != 2) {
				// LOG the error
				return;
			}

			// Use the field directly as a String; encoding it to bytes and
			// decoding it again with the platform charset could corrupt it.
			String word = wdct[0];
			int count;
			try {
				count = Integer.parseInt(wdct[1]);
			} catch (NumberFormatException e) {
				// A non-numeric count is a malformed row; skip it the same way
				// as a row with the wrong number of fields instead of failing
				// the whole task.
				return;
			}

			tupleRow.set(0, word);
			tupleRow.set(1, count);

			// This key has to be created by user
			Tuple userKey = new DefaultTuple();
			userKey.append(word);
			userKey.append(count);
			try {

				/* New M/R Interface */
				/* Converts user key to zebra BytesWritable key */
				/* using sort key expr tree  */
				/* Returns a java base object */
				/* Done for each user key */

				bytesKey = BasicTableOutputFormat.getSortKey(javaObj, userKey);
			} catch (Exception e) {
				// Never swallow this: continuing would write the row under the
				// key left over from a previous record.
				if (e instanceof IOException) {
					throw (IOException) e;
				}
				throw new IOException(
						"Failed to generate sort key for row: " + value, e);
			}

			context.write(bytesKey, tupleRow);
		}

		/**
		 * Creates the reusable row tuple from the output schema and obtains
		 * the sort key generator; runs before any call to map().
		 *
		 * @param context task context carrying the output table settings
		 * @throws IOException if the tuple or the sort key generator cannot
		 *         be created, including when the schema fails to parse
		 */
		@Override
		public void setup(Context context) throws IOException {
			bytesKey = new BytesWritable();
			try {
				Schema outSchema = BasicTableOutputFormat.getSchema(context);
				tupleRow = TypesUtils.createTuple(outSchema);

				/* New M/R Interface */
				/* returns an expression tree for sort keys */
				/* Returns a java base object */
				/* Done once per table */
				javaObj = BasicTableOutputFormat.getSortKeyGenerator(context);

			} catch (ParseException e) {
				// Report as an I/O failure of the task, keeping the cause.
				throw new IOException("Failed to parse output table schema", e);
			}
		}

	}

	static class ReduceClass extends
	Reducer<BytesWritable, Tuple, BytesWritable, Tuple> {
		Tuple outRow;

		public void reduce(BytesWritable key, Iterator<Tuple> values, Context context)
		throws IOException, InterruptedException {
			try {
				for(; values.hasNext();)  {
					context.write(key, values.next());
				}  
			} catch (ExecException e) {
				e.printStackTrace();
			}
		}

	}  

	/**
	 * Configures and runs the sample job: reads "word count" text lines and
	 * writes them to a Zebra table sorted on (word, count).
	 *
	 * @param args optional; {@code args[0]} overrides the input path and
	 *        {@code args[1]} overrides the output table path. When absent,
	 *        the original hard-coded sample paths are used.
	 * @throws ParseException if the schema, storage hint or sort info is
	 *         rejected by BasicTableOutputFormat
	 * @throws IOException if the job cannot be set up or submitted, or if it
	 *         finishes unsuccessfully
	 * @throws InterruptedException if waiting for the job is interrupted
	 * @throws ClassNotFoundException if a job class cannot be resolved
	 */
	public static void main(String[] args) throws ParseException, IOException, 
	InterruptedException, ClassNotFoundException {
		// Defaults preserve the original sample locations.
		String inputLocation = "/home/gauravj/work/grid/myTesting/input.txt";
		String outputLocation = "/home/gauravj/work/grid/myTesting/tableOuts";
		if (args != null && args.length > 0) {
			inputLocation = args[0];
		}
		if (args != null && args.length > 1) {
			outputLocation = args[1];
		}

		Job job = new Job();
		job.setJobName("tableMRSample");
		Configuration conf = job.getConfiguration();
		conf.set("table.output.tfile.compression", "gz");

		// input settings
		job.setInputFormatClass(TextInputFormat.class);
		job.setMapperClass(TableMRSortedTableZebraKeyGenerator.MapClass.class);
		job.setReducerClass(TableMRSortedTableZebraKeyGenerator.ReduceClass.class);
		job.setMapOutputKeyClass(BytesWritable.class);
		// NOTE(review): the mapper emits the tuple created by
		// TypesUtils.createTuple; confirm its runtime class is DefaultTuple.
		job.setMapOutputValueClass(DefaultTuple.class);
		FileInputFormat.setInputPaths(job, new Path(inputLocation));

		// TODO: need to find a replacement.
		//job.setNumMapTasks(1);

		// output settings
		Path outPath = new Path(outputLocation);
		job.setOutputFormatClass(BasicTableOutputFormat.class);
		BasicTableOutputFormat.setOutputPath(job, outPath);
		// set the logical schema with 2 columns
		BasicTableOutputFormat.setSchema(job, "word:string, count:int");
		// for demo purposes, create 2 physical column groups
		BasicTableOutputFormat.setStorageHint(job, "[word];[count]");

		/* New M/R Interface */
		/* Set sort columns in a comma separated string */
		/* Each sort column should belong to schema columns */
		BasicTableOutputFormat.setSortInfo(job, "word, count");

		// run with a single reduce task (this is not a map-only job)
		job.setNumReduceTasks(1);

		// Block until the job finishes so that a failure is reported to the
		// caller; submit() alone returns before the outcome is known.
		if (!job.waitForCompletion(true)) {
			throw new IOException("Job '" + job.getJobName() + "' failed");
		}
	}
}