/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.apache.hadoop.zebra.mapred; import java.io.IOException; import junit.framework.Assert; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputFormat; import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.zebra.io.BasicTable; import org.apache.hadoop.zebra.io.TableInserter; import org.apache.hadoop.zebra.parser.ParseException; import org.apache.hadoop.zebra.types.Partition; import org.apache.hadoop.zebra.types.SortInfo; import org.apache.hadoop.zebra.schema.Schema; import org.apache.hadoop.zebra.tfile.TFile; import org.apache.pig.data.Tuple; import org.apache.hadoop.zebra.pig.comparator.*; import org.apache.hadoop.util.ReflectionUtils; /** * {@link org.apache.hadoop.mapred.OutputFormat} class for creating a * BasicTable. * * Usage Example: * <p> * In the main program, add the following code. 
*
 * <pre>
 * jobConf.setOutputFormat(BasicTableOutputFormat.class);
 * Path outPath = new Path("path/to/the/BasicTable");
 * BasicTableOutputFormat.setOutputPath(jobConf, outPath);
 * BasicTableOutputFormat.setSchema(jobConf, "Name, Age, Salary, BonusPct");
 * </pre>
 *
 * The above code does the following things:
 * <UL>
 * <LI>Set the output format class to BasicTableOutputFormat.
 * <LI>Set the single path to the BasicTable to be created.
 * <LI>Set the schema of the BasicTable to be created. In this case, the
 * to-be-created BasicTable contains four columns with names "Name", "Age",
 * "Salary", "BonusPct".
 * </UL>
 *
 * To create multiple output paths, the ZebraOutputPartition interface needs
 * to be implemented and registered together with the paths:
 *
 * <pre>
 * String multiLocs = "commaSeparatedPaths";
 * jobConf.setOutputFormat(BasicTableOutputFormat.class);
 * BasicTableOutputFormat.setMultipleOutputs(jobConf, multiLocs,
 *     MultipleOutputsTest.OutputPartitionerClass.class);
 * BasicTableOutputFormat.setSchema(jobConf, "Name, Age, Salary, BonusPct");
 * </pre>
 *
 * The user ZebraOutputPartition class should look like this:
 *
 * <pre>
 * static class OutputPartitionerClass implements ZebraOutputPartition {
 *   {@literal @}Override
 *   public int getOutputPartition(BytesWritable key, Tuple value) {
 *     // 0-based index into the list of output paths
 *     return someIndexInOutputPartitionList;
 *   }
 * }
 * </pre>
 *
 * The user Reducer code (or similarly Mapper code if it is a Map-only job)
 * should look like the following:
 *
 * <pre>
 * static class MyReduceClass implements Reducer&lt;K, V, BytesWritable, Tuple&gt; {
 *   // keep the tuple object for reuse.
 *   Tuple outRow;
 *   // indices of various fields in the output Tuple.
 *   int idxName, idxAge, idxSalary, idxBonusPct;
 *
 *   {@literal @}Override
 *   public void configure(JobConf job) {
 *     Schema outSchema = BasicTableOutputFormat.getSchema(job);
 *     // create a tuple that conforms to the output schema.
*     outRow = TypesUtils.createTuple(outSchema);
 *     // determine the field indices.
 *     idxName = outSchema.getColumnIndex("Name");
 *     idxAge = outSchema.getColumnIndex("Age");
 *     idxSalary = outSchema.getColumnIndex("Salary");
 *     idxBonusPct = outSchema.getColumnIndex("BonusPct");
 *   }
 *
 *   {@literal @}Override
 *   public void reduce(K key, Iterator&lt;V&gt; values,
 *       OutputCollector&lt;BytesWritable, Tuple&gt; output, Reporter reporter)
 *       throws IOException {
 *     String name;
 *     int age;
 *     int salary;
 *     double bonusPct;
 *     // ... Determine the value of the individual fields of the row to be inserted.
 *     try {
 *       outRow.set(idxName, name);
 *       outRow.set(idxAge, new Integer(age));
 *       outRow.set(idxSalary, new Integer(salary));
 *       outRow.set(idxBonusPct, new Double(bonusPct));
 *       output.collect(new BytesWritable(name.getBytes()), outRow);
 *     }
 *     catch (ExecException e) {
 *       // should never happen
 *     }
 *   }
 *
 *   {@literal @}Override
 *   public void close() throws IOException {
 *     // no-op
 *   }
 *
 * }
 * </pre>
 *
 * @deprecated Use {@link org.apache.hadoop.zebra.mapreduce.BasicTableOutputFormat} instead.
 */
@Deprecated
public class BasicTableOutputFormat implements OutputFormat<BytesWritable, Tuple> {
  /** Conf key holding the single output path (single-output jobs). */
  private static final String OUTPUT_PATH = "mapred.lib.table.output.dir";
  /** Conf key holding the comma separated, escaped output paths (multi-output jobs). */
  private static final String MULTI_OUTPUT_PATH = "mapred.lib.table.multi.output.dirs";
  /** Conf key holding the normalized schema string of the output table. */
  private static final String OUTPUT_SCHEMA = "mapred.lib.table.output.schema";
  /** Conf key holding the storage hint string of the output table. */
  private static final String OUTPUT_STORAGEHINT = "mapred.lib.table.output.storagehint";
  /** Conf key holding the comma separated sort column names. */
  private static final String OUTPUT_SORTCOLUMNS = "mapred.lib.table.output.sortcolumns";
  /** Conf key holding the comparator string used for sorted tables. */
  private static final String OUTPUT_COMPARATOR = "mapred.lib.table.output.comparator";
  /**
   * Conf flag: true when the job writes to multiple output paths, false for a
   * single output path. Package-visible because TableRecordWriter reads it.
   */
  static final String IS_MULTI = "multi";
  /** Conf key holding the class name of the user's ZebraOutputPartition. */
  private static final String ZEBRA_OUTPUT_PARTITIONER_CLASS = "zebra.output.partitioner.class";

  /**
   * Set the multiple output paths of the BasicTable in JobConf
   *
   * @param conf
   *          The JobConf object.
   * @param commaSeparatedLocations
   *          The comma separated output paths to the tables.
* The path must either not existent, or must be an empty directory. * @param theClass * Zebra output partitoner class */ public static void setMultipleOutputs(JobConf conf, String commaSeparatedLocations, Class<? extends ZebraOutputPartition> theClass) throws IOException { conf.set(MULTI_OUTPUT_PATH, commaSeparatedLocations); if(conf.getBoolean(IS_MULTI, true) == false) { throw new IllegalArgumentException("Job has been setup as single output path"); } conf.setBoolean(IS_MULTI, true); setZebraOutputPartitionClass(conf, theClass); } /** * Set the multiple output paths of the BasicTable in JobConf * * @param conf * The JobConf object. * @param paths * The list of paths * The path must either not existent, or must be an empty directory. * @param theClass * Zebra output partitioner class */ public static void setMultipleOutputs(JobConf conf, Class<? extends ZebraOutputPartition> theClass, Path... paths) throws IOException { FileSystem fs = FileSystem.get(conf); Path path = paths[0].makeQualified(fs); StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString())); for(int i = 1; i < paths.length;i++) { str.append(StringUtils.COMMA_STR); path = paths[i].makeQualified(fs); str.append(StringUtils.escapeString(path.toString())); } conf.set(MULTI_OUTPUT_PATH, str.toString()); if(conf.getBoolean(IS_MULTI, true) == false) { throw new IllegalArgumentException("Job has been setup as single output path"); } conf.setBoolean(IS_MULTI, true); setZebraOutputPartitionClass(conf, theClass); } /** * Set the multiple output paths of the BasicTable in JobConf * * @param conf * The JobConf object. * @return path * The comma separated output paths to the tables. * The path must either not existent, or must be an empty directory. 
*/ public static Path[] getOutputPaths(JobConf conf) throws IOException { Path[] result; String paths = conf.get(MULTI_OUTPUT_PATH); String path = conf.get(OUTPUT_PATH); if(paths != null && path != null) { throw new IllegalArgumentException("Illegal output paths specs. Both multi and single output locs are set"); } if(conf.getBoolean(IS_MULTI, false) == true) { if (paths == null || paths.equals("")) { throw new IllegalArgumentException("Illegal multi output paths"); } String [] list = StringUtils.split(paths); result = new Path[list.length]; for (int i = 0; i < list.length; i++) { result[i] = new Path(StringUtils.unEscapeString(list[i])); } } else { if (path == null || path.equals("")) { throw new IllegalArgumentException("Cannot find output path"); } result = new Path[1]; result[0] = new Path(path); } return result; } private static void setZebraOutputPartitionClass( JobConf conf, Class<? extends ZebraOutputPartition> theClass) throws IOException { if (!ZebraOutputPartition.class.isAssignableFrom(theClass)) throw new IOException(theClass+" not "+ZebraOutputPartition.class.getName()); conf.set(ZEBRA_OUTPUT_PARTITIONER_CLASS, theClass.getName()); } public static Class<? extends ZebraOutputPartition> getZebraOutputPartitionClass(JobConf conf) throws IOException { Class<?> theClass; String valueString = conf.get(ZEBRA_OUTPUT_PARTITIONER_CLASS); if (valueString == null) throw new IOException("zebra output partitioner class not found"); try { theClass = conf.getClassByName(valueString); } catch (ClassNotFoundException e) { throw new IOException(e); } if (theClass != null && !ZebraOutputPartition.class.isAssignableFrom(theClass)) throw new IOException(theClass+" not "+ZebraOutputPartition.class.getName()); else if (theClass != null) return theClass.asSubclass(ZebraOutputPartition.class); else return null; } /** * Set the output path of the BasicTable in JobConf * * @param conf * The JobConf object. * @param path * The output path to the table. 
The path must either not existent, * or must be an empty directory. */ public static void setOutputPath(JobConf conf, Path path) { conf.set(OUTPUT_PATH, path.toString()); if(conf.getBoolean(IS_MULTI, false) == true) { throw new IllegalArgumentException("Job has been setup as multi output paths"); } conf.setBoolean(IS_MULTI, false); } /** * Get the output path of the BasicTable from JobConf * * @param conf * job conf * @return The output path. */ public static Path getOutputPath(JobConf conf) { String path = conf.get(OUTPUT_PATH); return (path == null) ? null : new Path(path); } /** * Set the table schema in JobConf * * @param conf * The JobConf object. * @param schema * The schema of the BasicTable to be created. For the initial * implementation, the schema string is simply a comma separated list * of column names, such as "Col1, Col2, Col3". * * @deprecated Use {@link #setStorageInfo(JobConf, ZebraSchema, ZebraStorageHint, ZebraSortInfo)} instead. */ public static void setSchema(JobConf conf, String schema) { conf.set(OUTPUT_SCHEMA, Schema.normalize(schema)); } /** * Get the table schema in JobConf. * * @param conf * The JobConf object. * @return The output schema of the BasicTable. If the schema is not defined * in the conf object at the time of the call, null will be returned. */ public static Schema getSchema(JobConf conf) throws ParseException { String schema = conf.get(OUTPUT_SCHEMA); if (schema == null) { return null; } //schema = schema.replaceAll(";", ","); return new Schema(schema); } private static KeyGenerator makeKeyBuilder(byte[] elems) { ComparatorExpr[] exprs = new ComparatorExpr[elems.length]; for (int i = 0; i < elems.length; ++i) { exprs[i] = ExprUtils.primitiveComparator(i, elems[i]); } return new KeyGenerator(ExprUtils.tupleComparator(exprs)); } /** * Generates a zebra specific sort key generator which is used to generate BytesWritable key * Sort Key(s) are used to generate this object * * @param conf * The JobConf object. 
* @return Object of type zebra.pig.comaprator.KeyGenerator. * */ public static Object getSortKeyGenerator(JobConf conf) throws IOException, ParseException { SortInfo sortInfo = getSortInfo(conf); Schema schema = getSchema(conf); String[] sortColNames = sortInfo.getSortColumnNames(); byte[] types = new byte[sortColNames.length]; for(int i =0 ; i < sortColNames.length; ++i){ types[i] = schema.getColumn(sortColNames[i]).getType().pigDataType(); } KeyGenerator builder = makeKeyBuilder(types); return builder; } /** * Generates a BytesWritable key for the input key * using keygenerate provided. Sort Key(s) are used to generate this object * * @param builder * Opaque key generator created by getSortKeyGenerator() method * @param t * Tuple to create sort key from * @return ByteWritable Key * */ public static BytesWritable getSortKey(Object builder, Tuple t) throws Exception { KeyGenerator kg = (KeyGenerator) builder; return kg.generateKey(t); } /** * Set the table storage hint in JobConf, should be called after setSchema is * called. * <br> <br> * * Note that the "secure by" feature is experimental now and subject to * changes in the future. * * @param conf * The JobConf object. * @param storehint * The storage hint of the BasicTable to be created. The format would * be like "[f1, f2.subfld]; [f3, f4]". * * @deprecated Use {@link #setStorageInfo(JobConf, ZebraSchema, ZebraStorageHint, ZebraSortInfo)} instead. */ public static void setStorageHint(JobConf conf, String storehint) throws ParseException, IOException { String schema = conf.get(OUTPUT_SCHEMA); if (schema == null) throw new ParseException("Schema has not been set"); // for sanity check purpose only new Partition(schema, storehint, null); conf.set(OUTPUT_STORAGEHINT, storehint); } /** * Get the table storage hint in JobConf. * * @param conf * The JobConf object. * @return The storage hint of the BasicTable. 
If the storage hint is not defined in the conf object at the time of the
   *         call, an empty string will be returned.
   */
  public static String getStorageHint(JobConf conf) {
    String configuredHint = conf.get(OUTPUT_STORAGEHINT);
    if (configuredHint == null) {
      return "";
    }
    return configuredHint;
  }

  /**
   * Set the sort info: the sort columns and, optionally, a comparator class.
   *
   * @param conf
   *          The JobConf object.
   *
   * @param sortColumns
   *          Comma-separated sort column names
   *
   * @param comparatorClass
   *          comparator class name; null for default
   *
   * @deprecated Use {@link #setStorageInfo(JobConf, ZebraSchema, ZebraStorageHint, ZebraSortInfo)} instead.
   */
  public static void setSortInfo(JobConf conf, String sortColumns, Class<? extends RawComparator<Object>> comparatorClass) {
    // The sort columns are stored exactly as in the two-argument overload.
    setSortInfo(conf, sortColumns);
    if (comparatorClass == null) {
      return;
    }
    String comparatorSpec = TFile.COMPARATOR_JCLASS+comparatorClass.getName();
    conf.set(OUTPUT_COMPARATOR, comparatorSpec);
  }

  /**
   * Set the sort info: the sort columns only.
   *
   * @param conf
   *          The JobConf object.
   *
   * @param sortColumns
   *          Comma-separated sort column names
   *
   * @deprecated Use {@link #setStorageInfo(JobConf, ZebraSchema, ZebraStorageHint, ZebraSortInfo)} instead.
   */
  public static void setSortInfo(JobConf conf, String sortColumns) {
    conf.set(OUTPUT_SORTCOLUMNS, sortColumns);
  }

  /**
   * Set the table storage info: schema, storage hint and sort info.
   *
   * @param conf
   *          The JobConf object.
   *
   * @param zSchema The ZebraSchema object containing schema information.
   *
   * @param zStorageHint The ZebraStorageHint object containing storage hint information.
   *
   * @param zSortInfo The ZebraSortInfo object containing sorting information.
* */ public static void setStorageInfo(JobConf conf, ZebraSchema zSchema, ZebraStorageHint zStorageHint, ZebraSortInfo zSortInfo) throws ParseException, IOException { String schemaStr = null; String storageHintStr = null; /* validity check on schema*/ if (zSchema == null) { throw new IllegalArgumentException("ZebraSchema object cannot be null."); } else { schemaStr = zSchema.toString(); } Schema schema = null; try { schema = new Schema(schemaStr); } catch (ParseException e) { throw new ParseException("[" + zSchema + "] " + " is not a valid schema string: " + e.getMessage()); } /* validity check on storage hint*/ if (zStorageHint == null) { storageHintStr = ""; } else { storageHintStr = zStorageHint.toString(); } try { new Partition(schemaStr, storageHintStr, null); } catch (ParseException e) { throw new ParseException("[" + zStorageHint + "] " + " is not a valid storage hint string: " + e.getMessage() ); } catch (IOException e) { throw new ParseException("[" + zStorageHint + "] " + " is not a valid storage hint string: " + e.getMessage() ); } conf.set(OUTPUT_SCHEMA, schemaStr); conf.set(OUTPUT_STORAGEHINT, storageHintStr); /* validity check on sort info if user specifies it */ if (zSortInfo != null) { String sortColumnsStr = zSortInfo.getSortColumns(); String comparatorStr = zSortInfo.getComparator(); /* Check existence of comparable class if user specifies it */ if (comparatorStr != null && comparatorStr != "") { try { conf.getClassByName(comparatorStr.substring(TFile.COMPARATOR_JCLASS.length()).trim()); } catch (ClassNotFoundException e) { throw new IOException("comparator Class cannot be found : " + e.getMessage()); } } try { SortInfo.parse(sortColumnsStr, schema, comparatorStr); } catch (IOException e) { throw new IOException("[" + sortColumnsStr + " + " + comparatorStr + "] " + "is not a valid sort configuration: " + e.getMessage()); } if (sortColumnsStr != null) conf.set(OUTPUT_SORTCOLUMNS, sortColumnsStr); if (comparatorStr != null) 
conf.set(OUTPUT_COMPARATOR, comparatorStr); } } /** * Get the SortInfo object * * @param conf * The JobConf object. * @return SortInfo object; null if the Zebra table is unsorted * */ public static SortInfo getSortInfo(JobConf conf)throws IOException { String sortColumns = conf.get(OUTPUT_SORTCOLUMNS); if (sortColumns == null) return null; Schema schema = null; try { schema = getSchema(conf); } catch (ParseException e) { throw new IOException("Schema parsing failure : "+e.getMessage()); } if (schema == null) throw new IOException("Schema not defined"); String comparator = getComparator(conf); return SortInfo.parse(sortColumns, schema, comparator); } /** * Get the comparator for sort columns * * @param conf * The JobConf object. * @return comparator String * */ private static String getComparator(JobConf conf) { return conf.get(OUTPUT_COMPARATOR); } /** * Get the output table as specified in JobConf. It is useful for applications * to add more meta data after all rows have been added to the table. * * @param conf * The JobConf object. * @return The output BasicTable.Writer object. * @throws IOException */ private static BasicTable.Writer[] getOutput(JobConf conf) throws IOException { Path[] paths = getOutputPaths(conf); BasicTable.Writer[] writers = new BasicTable.Writer[paths.length]; for(int i = 0; i < paths.length; i++) { writers[i] = new BasicTable.Writer(paths[i], conf); } return writers; } /** * Note: we perform the Initialization of the table here. 
So we expect this to * be called before * {@link BasicTableOutputFormat#getRecordWriter(FileSystem, JobConf, String, Progressable)} * * @see OutputFormat#checkOutputSpecs(FileSystem, JobConf) */ @Override public void checkOutputSpecs(FileSystem ignored, JobConf conf) throws IOException { String schema = conf.get(OUTPUT_SCHEMA); if (schema == null) { throw new IllegalArgumentException("Cannot find output schema"); } String storehint, sortColumns, comparator; storehint = getStorageHint(conf); sortColumns = (getSortInfo(conf) == null ? null : SortInfo.toSortString(getSortInfo(conf).getSortColumnNames())); comparator = getComparator(conf); Path [] paths = getOutputPaths(conf); for (Path path : paths) { BasicTable.Writer writer = new BasicTable.Writer(path, schema, storehint, sortColumns, comparator, conf); writer.finish(); } } /** * @see OutputFormat#getRecordWriter(FileSystem, JobConf, String, * Progressable) */ @Override public RecordWriter<BytesWritable, Tuple> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException { String path = conf.get(OUTPUT_PATH); return new TableRecordWriter(path, name, conf, progress); } /** * Close the output BasicTable, No more rows can be added into the table. A * BasicTable is not visible for reading until it is "closed". * * @param conf * The JobConf object. * @throws IOException */ public static void close(JobConf conf) throws IOException { BasicTable.Writer tables[] = getOutput(conf); for(int i =0; i < tables.length; ++i) { tables[i].close(); } } } /** * Adaptor class for BasicTable RecordWriter. 
*/
class TableRecordWriter implements RecordWriter<BytesWritable, Tuple> {
  /** One inserter per output path, in the order of the configured paths. */
  private final TableInserter[] inserter;
  private final Progressable progress;
  /** User partitioner choosing the output table per row; null for single-output jobs. */
  private final ZebraOutputPartition op;

  /**
   * Open one inserter on every configured output table.
   *
   * @param path
   *          Not used; the output paths are read from conf.
   * @param name
   *          Name passed to each table writer when obtaining its inserter.
   * @param conf
   *          The JobConf object.
   * @param progress
   *          Progress callback invoked after every written row.
   * @throws IOException
   *           if the partitioner class cannot be loaded or a table cannot be
   *           opened.
   */
  public TableRecordWriter(String path, String name, JobConf conf, Progressable progress) throws IOException {
    if (conf.getBoolean(BasicTableOutputFormat.IS_MULTI, false)) {
      op = (ZebraOutputPartition)
          ReflectionUtils.newInstance(BasicTableOutputFormat.getZebraOutputPartitionClass(conf), conf);
    } else {
      op = null;
    }

    Path[] paths = BasicTableOutputFormat.getOutputPaths(conf);
    inserter = new TableInserter[paths.length];
    for (int i = 0; i < paths.length; ++i) {
      BasicTable.Writer writer = new BasicTable.Writer(paths[i], conf);
      inserter[i] = writer.getInserter(name, true);
    }
    this.progress = progress;
  }

  /**
   * Close every inserter. A failure on one inserter does not stop the
   * remaining ones from being closed; the first failure is rethrown once all
   * of them have been attempted.
   */
  @Override
  public void close(Reporter reporter) throws IOException {
    IOException firstFailure = null;
    for (TableInserter each : inserter) {
      try {
        each.close();
      } catch (IOException e) {
        if (firstFailure == null) {
          firstFailure = e;
        }
      }
    }
    if (firstFailure != null) {
      throw firstFailure;
    }
    reporter.progress();
  }

  /**
   * Insert the row into the table chosen by the partitioner, or into the
   * only table for a single-output job.
   *
   * @throws IllegalArgumentException
   *           if the partitioner returns an index outside the output tables.
   */
  @Override
  public void write(BytesWritable key, Tuple value) throws IOException {
    int idx = 0;
    if (op != null) {
      idx = op.getOutputPartition(key, value);
      if (idx < 0 || idx >= inserter.length) {
        throw new IllegalArgumentException("index returned by getOutputPartition is out of range");
      }
    }
    inserter[idx].insert(key, value);
    progress.progress();
  }
}