package org.apache.hadoop.hive.mastiff;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.mastiff.MastiffHandlerUtil.MTableDesc;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Progressable;
import cn.ac.ncic.mastiff.etl.ETLUtils;
import cn.ac.ncic.mastiff.hive.serde.lazy.ClusterAccessor.DataType;
import cn.ac.ncic.mastiff.mapred.MastiffMapReduce.TableDesc;
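/**
* HiveOutputFormat that writes Hive rows into Mastiff SegmentFiles.
* Hive interacts with this class only through {@link #getHiveRecordWriter};
* the plain MapReduce record-writer entry point is deliberately unsupported.
*/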
public class SegmentFileOutputFormat extends
FileOutputFormat<WritableComparable, BytesRefArrayWritable> implements
HiveOutputFormat<WritableComparable, BytesRefArrayWritable> {
@Override
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
// SegmentFile output needs no additional output-specification checks.
}
/**
* Not used by Hive. Hive obtains writers through {@link #getHiveRecordWriter},
* so this MapReduce-style entry point always fails fast.
*/
@Override
public org.apache.hadoop.mapred.RecordWriter<WritableComparable, BytesRefArrayWritable> getRecordWriter(
FileSystem fs, JobConf job, String name, Progressable progress)
throws IOException {
throw new RuntimeException("Error: Hive should not invoke this method; use getHiveRecordWriter instead.");
}
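/**
* Builds the Hive-facing record writer. The table properties are turned into
* a Mastiff table descriptor, each Hive column type is mapped onto a Mastiff
* {@code DataType}, and a {@link SegmentFile.Writer} is opened with both a
* staging path under /tmp/hive-mastiff and the final output path.
*/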
@Override
public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getHiveRecordWriter(
JobConf jc, Path finalOutPath, Class valueClass, boolean isCompressed,
Properties tableProperties, Progressable progress) throws IOException {
String tblName = (String) tableProperties.get(MastiffHandlerUtil.CF_TABLE_NAME);
try {
MastiffHandlerUtil.setCFMeta(jc, tblName);
} catch (Exception e) {
throw new IOException("Failed to set column-family metadata for table " + tblName, e);
}
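// Build the Mastiff table descriptor (column mapping, cluster types,
// coding types and algorithms) from the Hive table properties.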
MTableDesc tblDesc = MastiffHandlerUtil.getMTableDesc(tableProperties);
MastiffHandlerUtil.getCFTypes(tblDesc);
List<List<DataType>> clusterTypes = new ArrayList<List<DataType>>();
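// SerializeUtil acts as a static holder so the serialization side can reach
// the job configuration, file system, table properties and table layout.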
SerializeUtil.jc = jc;
SerializeUtil.fs = finalOutPath.getFileSystem(jc);
SerializeUtil.tableProperties = tableProperties;
// SerializeUtil.outputPath = finalOutPath;
SerializeUtil.desc = new TableDesc();
SerializeUtil.desc.columnsMapping = tblDesc.columnsMapping;
SerializeUtil.desc.clusterCodingTypes = tblDesc.clusterCodingTypes;
SerializeUtil.desc.clusterAlgos = tblDesc.clusterAlgos;
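// Single-row schema array: the Mastiff data type of every table column,
// indexed by its position in columnsMapping.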
DataType[][] tableSchema = new DataType[1][tblDesc.columnTypes.length];
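// Map every Hive primitive type in each column cluster to the
// corresponding Mastiff DataType.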
for (int i = 0; i < tblDesc.clusterTypes.length; i++) {
clusterTypes.add(new ArrayList<DataType>());
for (int j = 0; j < tblDesc.clusterTypes[i].length; j++) {
switch (((PrimitiveTypeInfo) tblDesc.clusterTypes[i][j]).getPrimitiveCategory()) {
case BOOLEAN:
clusterTypes.get(i).add(DataType.BOOLEAN);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.BOOLEAN;
break;
case BYTE:
clusterTypes.get(i).add(DataType.BYTE);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.BYTE;
break;
case SHORT:
clusterTypes.get(i).add(DataType.SHORT);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.SHORT;
break;
case INT:
clusterTypes.get(i).add(DataType.INT);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.INT;
break;
case LONG:
clusterTypes.get(i).add(DataType.LONG);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.LONG;
break;
case FLOAT:
clusterTypes.get(i).add(DataType.FLOAT);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.FLOAT;
break;
case DOUBLE:
clusterTypes.get(i).add(DataType.DOUBLE);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.DOUBLE;
break;
case STRING:
clusterTypes.get(i).add(DataType.STRING);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.STRING;
break;
case DATE:
clusterTypes.get(i).add(DataType.DATE);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.DATE;
break;
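// TIMESTAMP columns are mapped to DATE, which is rewritten to LONG below.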
case TIMESTAMP:
clusterTypes.get(i).add(DataType.DATE);
tableSchema[0][SerializeUtil.desc.columnsMapping[i][j]] = DataType.DATE;
break;
default:
throw new IOException("Unsupported Hive primitive type: "
+ ((PrimitiveTypeInfo) tblDesc.clusterTypes[i][j]).getPrimitiveCategory());
}
}
}
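// Rewrite DATE entries (including mapped TIMESTAMPs) to LONG before handing
// the cluster types to the writer.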
for (int i = 0; i < clusterTypes.size(); i++) {
for (int j = 0; j < clusterTypes.get(i).size(); j++) {
if (clusterTypes.get(i).get(j) == DataType.DATE) {
clusterTypes.get(i).set(j, DataType.LONG);
}
}
}
SerializeUtil.desc.clusterTypes = clusterTypes;
SerializeUtil.desc.tableSchema = tableSchema;
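// Derive the output file name from the task's partition; tmpPath is a
// staging location under the fixed /tmp/hive-mastiff directory.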
String filename = ETLUtils.getOutputName(ETLUtils.getPartion(jc));
Path outputPath = new Path(finalOutPath, filename);
Path tmpoutputPath = new Path("/tmp/hive-mastiff/");
Path tmpPath = new Path(tmpoutputPath, filename);
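// Clear any stale staging file left over from a previous attempt before
// opening the SegmentFile writer.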
FileSystem outFs = finalOutPath.getFileSystem(jc);
if (outFs.exists(tmpPath)) {
outFs.delete(tmpPath, true);
}
final SegmentFile.Writer outWriter = new SegmentFile.Writer(outFs, tmpPath, outputPath,
tmpoutputPath, clusterTypes);
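// Adapt the SegmentFile writer to Hive's FileSinkOperator.RecordWriter:
// each row Hive emits is appended directly to the segment file.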
return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {
@Override
public void write(Writable r) throws IOException {
outWriter.append(r);
}
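// Note that the abort flag is ignored: the underlying writer is closed
// the same way whether the task succeeded or was aborted.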
@Override
public void close(boolean abort) throws IOException {
outWriter.close();
}
};
}
}