/*
* Copyright 2014, Stratio.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.deep.commons.extractor.impl;
import static com.stratio.deep.commons.utils.Utils.initConfig;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.log4j.Logger;
import org.apache.spark.Partition;
import org.apache.spark.rdd.NewHadoopPartition;
import com.stratio.deep.commons.config.BaseConfig;
import com.stratio.deep.commons.config.DeepJobConfig;
import com.stratio.deep.commons.config.ExtractorConfig;
import com.stratio.deep.commons.config.HadoopConfig;
import com.stratio.deep.commons.exception.DeepGenericException;
import com.stratio.deep.commons.querybuilder.UpdateQueryBuilder;
import com.stratio.deep.commons.rdd.IExtractor;
import com.stratio.deep.commons.utils.DeepSparkHadoopMapReduceUtil;
import scala.Tuple2;
/**
* Created by rcrespo on 26/08/14.
*/
public abstract class GenericHadoopExtractor<T, S extends BaseConfig, K, V, kOut, vOut> implements IExtractor<T, S> {
protected HadoopConfig deepJobConfig;
protected transient RecordReader<K, V> reader;
protected transient RecordWriter<kOut, vOut> writer;
protected transient InputFormat<K, V> inputFormat;
protected transient OutputFormat<kOut, vOut> outputFormat;
protected transient String jobTrackerId;
protected transient TaskAttemptContext hadoopAttemptContext;
protected boolean havePair = false;
protected boolean finished = false;
protected transient JobID jobId = null;
private static final Logger LOG = Logger.getLogger(GenericHadoopExtractor.class);
{
SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmm");
jobTrackerId = formatter.format(new Date());
}
@Override
public List<String> getPreferredLocations(Partition split) {
return null;
}
@Override
public Partition[] getPartitions(S config) {
int id = config.getRddId();
jobId = new JobID(jobTrackerId, id);
Configuration conf = getHadoopConfig(config);
JobContext jobContext = DeepSparkHadoopMapReduceUtil.newJobContext(conf, jobId);
try {
List<InputSplit> splits = inputFormat.getSplits(jobContext);
Partition[] partitions = new Partition[(splits.size())];
for (int i = 0; i < splits.size(); i++) {
partitions[i] = new NewHadoopPartition(id, i, splits.get(i));
}
return partitions;
} catch (IOException | InterruptedException | RuntimeException e) {
LOG.error("Impossible to calculate partitions " + e.getMessage());
throw new DeepGenericException("Impossible to calculate partitions ", e);
}
}
@Override
public boolean hasNext() {
if (!finished && !havePair) {
try {
finished = !reader.nextKeyValue();
} catch (IOException | InterruptedException e) {
LOG.error("Impossible to get hasNext " + e.getMessage());
throw new DeepGenericException("Impossible to get hasNext ", e);
}
havePair = !finished;
}
return !finished;
}
@Override
public T next() {
if (!hasNext()) {
throw new java.util.NoSuchElementException("End of stream");
}
havePair = false;
Tuple2<K, V> tuple = null;
try {
return (T) transformElement(new Tuple2<>(reader.getCurrentKey(), reader.getCurrentValue()),
deepJobConfig);
} catch (IOException | InterruptedException e) {
LOG.error("Impossible to get next value " + e.getMessage());
throw new DeepGenericException("Impossible to get next value ", e);
}
}
@Override
public void close() {
try {
if (reader != null) {
reader.close();
}
if (writer != null) {
writer.close(hadoopAttemptContext);
}
} catch (IOException | InterruptedException e) {
LOG.error("Impossible to close RecordReader " + e.getMessage());
throw new DeepGenericException("Impossible to close RecordReader ", e);
}
}
private Configuration getHadoopConfig(S config) {
deepJobConfig = initConfig(config, deepJobConfig);
return deepJobConfig.getHadoopConfiguration();
}
public abstract T transformElement(Tuple2<K, V> tuple, DeepJobConfig<T, ? extends DeepJobConfig> config);
@Override
public void saveRDD(T t) {
Tuple2<kOut, vOut> tuple = transformElement(t);
try {
writer.write(tuple._1(), tuple._2());
} catch (IOException | InterruptedException e) {
LOG.error("Impossible to saveRDD " + e.getMessage());
throw new DeepGenericException("Impossible to saveRDD ", e);
}
return;
}
@Override
public void initSave(S config, T first, UpdateQueryBuilder queryBuilder) {
int id = config.getRddId();
int partitionIndex = config.getPartitionId();
TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
.newTaskAttemptID(jobTrackerId, id, true, partitionIndex, 0);
Configuration configuration = getHadoopConfig(config);
hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
.newTaskAttemptContext(configuration,
attemptId);
try {
writer = outputFormat.getRecordWriter(hadoopAttemptContext);
} catch (IOException | InterruptedException e) {
throw new DeepGenericException(e);
}
}
@Override
public void initIterator(Partition dp, S config) {
int id = config.getRddId();
NewHadoopPartition split = (NewHadoopPartition) dp;
TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
.newTaskAttemptID(jobTrackerId, id, true, split.index(), 0);
Configuration configuration = getHadoopConfig(config);
TaskAttemptContext hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
.newTaskAttemptContext(configuration, attemptId);
try {
reader = inputFormat.createRecordReader(split.serializableHadoopSplit().value(), hadoopAttemptContext);
reader.initialize(split.serializableHadoopSplit().value(), hadoopAttemptContext);
} catch (IOException | InterruptedException e) {
throw new DeepGenericException(e);
}
}
public abstract Tuple2<kOut, vOut> transformElement(T record);
}