package org.apache.hadoop.hive.mastiff;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.mastiff.MastiffHandlerUtil.MTableDesc;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Progressable;

import cn.ac.ncic.mastiff.etl.ETLUtils;
import cn.ac.ncic.mastiff.hive.serde.lazy.ClusterAccessor.DataType;
import cn.ac.ncic.mastiff.mapred.MastiffMapReduce.TableDesc;

/**
 * A {@link HiveOutputFormat} that writes Hive rows into Mastiff segment files
 * through {@link SegmentFile.Writer}.
 */
public class SegmentFileOutputFormat
    extends FileOutputFormat<WritableComparable, BytesRefArrayWritable>
    implements HiveOutputFormat<WritableComparable, BytesRefArrayWritable> {

  @Override
  public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
    // No output-spec validation is performed for this format.
  }

  /**
   * Not used. Hive obtains its writer through {@link #getHiveRecordWriter}, so the
   * plain mapred {@code RecordWriter} path is never taken.
   */
  @Override
  public org.apache.hadoop.mapred.RecordWriter getRecordWriter(FileSystem fs, JobConf job,
      String name, Progressable progress) throws IOException {
    throw new RuntimeException("Error: Hive should not invoke this method.");
  }

  @Override
  public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getHiveRecordWriter(
      JobConf jc, Path finalOutPath, Class valueClass, boolean isCompressed,
      Properties tableProperties, Progressable progress) throws IOException {
    // Publish the column-family metadata of the target table into the job configuration.
    String tblName = (String) tableProperties.get(MastiffHandlerUtil.CF_TABLE_NAME);
    try {
      MastiffHandlerUtil.setCFMeta(jc, tblName);
    } catch (Exception e) {
      throw new IOException("Failed to set column-family metadata for table " + tblName, e);
    }

    MTableDesc tblDesc = MastiffHandlerUtil.getMTableDesc(tableProperties);
    MastiffHandlerUtil.getCFTypes(tblDesc);
    List<List<DataType>> clusterTypes = new ArrayList<List<DataType>>();

    // Hand the job-level context to the serializer side through SerializeUtil's static fields.
    FileSystem fs = finalOutPath.getFileSystem(jc);
    SerializeUtil.jc = jc;
    SerializeUtil.fs = fs;
    SerializeUtil.tableProperties = tableProperties;
    // SerializeUtil.outputPath = finalOutPath;
    SerializeUtil.desc = new TableDesc();
    SerializeUtil.desc.columnsMapping = tblDesc.columnsMapping;
    SerializeUtil.desc.clusterCodingTypes = tblDesc.clusterCodingTypes;
    SerializeUtil.desc.clusterAlgos = tblDesc.clusterAlgos;

    // Translate the Hive primitive type of every column in every cluster into a Mastiff
    // DataType, and place it at the column's flat position given by columnsMapping.
    DataType[][] tableSchema = new DataType[1][tblDesc.columnTypes.length];
    for (int i = 0; i < tblDesc.clusterTypes.length; i++) {
      clusterTypes.add(new ArrayList<DataType>());
      for (int j = 0; j < tblDesc.clusterTypes[i].length; j++) {
        switch (((PrimitiveTypeInfo) tblDesc.clusterTypes[i][j]).getPrimitiveCategory()) {
        case BOOLEAN:
          clusterTypes.get(i).add(DataType.BOOLEAN);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.BOOLEAN;
          break;
        case BYTE:
          clusterTypes.get(i).add(DataType.BYTE);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.BYTE;
          break;
        case SHORT:
          clusterTypes.get(i).add(DataType.SHORT);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.SHORT;
          break;
        case INT:
          clusterTypes.get(i).add(DataType.INT);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.INT;
          break;
        case LONG:
          clusterTypes.get(i).add(DataType.LONG);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.LONG;
          break;
        case FLOAT:
          clusterTypes.get(i).add(DataType.FLOAT);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.FLOAT;
          break;
        case DOUBLE:
          clusterTypes.get(i).add(DataType.DOUBLE);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.DOUBLE;
          break;
        case STRING:
          clusterTypes.get(i).add(DataType.STRING);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.STRING;
          break;
        case DATE:
        case TIMESTAMP:
          // Both Hive DATE and TIMESTAMP map to Mastiff's DATE type.
          clusterTypes.get(i).add(DataType.DATE);
          tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.DATE;
          break;
        default:
          throw new IOException("Unsupported column type: "
              + ((PrimitiveTypeInfo) tblDesc.clusterTypes[i][j]).getTypeName());
        }
      }
    }

    // DATE entries are replaced with LONG before the cluster types reach the segment writer.
    for (int i = 0; i < clusterTypes.size(); i++) {
      for (int j = 0; j < clusterTypes.get(i).size(); j++) {
        if (clusterTypes.get(i).get(j) == DataType.DATE) {
          clusterTypes.get(i).set(j, DataType.LONG);
        }
      }
    }
    SerializeUtil.desc.clusterTypes = clusterTypes;
    SerializeUtil.desc.tableSchema = tableSchema;

    // Resolve the per-partition file name and the temporary path the writer works against,
    // and clear any leftover temporary file from an earlier attempt.
    String filename = ETLUtils.getOutputName(ETLUtils.getPartion(jc));
    Path outputPath = new Path(finalOutPath, filename);
    Path tmpoutputPath = new Path("/tmp/hive-mastiff/");
    Path tmpPath = new Path(tmpoutputPath, filename);
    // FileSystem fs = finalOutPath.getFileSystem(jc);
    // int buffsize = fs.getConf().getInt("io.file.buffer.size", 4096);
    // short block = fs.getDefaultReplication();
    // long blocksize = fs.getDefaultBlockSize();
    // fs.create(outputPath, false, buffsize, block, blocksize);
    if (fs.exists(tmpPath)) {
      fs.delete(tmpPath, true);
    }

    final SegmentFile.Writer outWriter =
        new SegmentFile.Writer(fs, tmpPath, outputPath, tmpoutputPath, clusterTypes);

    // Bridge Hive's FileSinkOperator.RecordWriter onto the SegmentFile writer.
    return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {
      @Override
      public void write(Writable r) throws IOException {
        outWriter.append(r);
      }

      @Override
      public void close(boolean abort) throws IOException {
        outWriter.close();
      }
    };
  }
}
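
/*
 * A minimal usage sketch, not part of the original class: it illustrates how a Hive
 * file-sink code path would obtain the record writer returned by getHiveRecordWriter
 * above and push serialized rows through it. The helper name, the row iterable, and
 * the assumption that tableProperties already carries MastiffHandlerUtil.CF_TABLE_NAME
 * plus the Mastiff cluster metadata are illustrative, not taken from this codebase.
 */
class SegmentFileOutputFormatUsageSketch {

  /** Writes the given pre-serialized rows into one segment file and closes the writer. */
  static void writeRows(JobConf jc, Path finalOutPath, Properties tableProperties,
      Iterable<BytesRefArrayWritable> rows) throws IOException {
    SegmentFileOutputFormat format = new SegmentFileOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
        format.getHiveRecordWriter(jc, finalOutPath, BytesRefArrayWritable.class,
            false /* isCompressed */, tableProperties, null /* progress */);
    for (BytesRefArrayWritable row : rows) {
      writer.write(row);
    }
    writer.close(false /* abort */);
  }
}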