/*
 * Copyright 2014, Stratio.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.stratio.deep.commons.extractor.impl;

import static com.stratio.deep.commons.utils.Utils.initConfig;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.log4j.Logger;
import org.apache.spark.Partition;
import org.apache.spark.rdd.NewHadoopPartition;

import com.stratio.deep.commons.config.BaseConfig;
import com.stratio.deep.commons.config.DeepJobConfig;
import com.stratio.deep.commons.config.ExtractorConfig;
import com.stratio.deep.commons.config.HadoopConfig;
import com.stratio.deep.commons.exception.DeepGenericException;
import com.stratio.deep.commons.querybuilder.UpdateQueryBuilder;
import com.stratio.deep.commons.rdd.IExtractor;
import com.stratio.deep.commons.utils.DeepSparkHadoopMapReduceUtil;

import scala.Tuple2;

/**
 * Created by rcrespo on 26/08/14.
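 * <p>
 * Adapts a Hadoop {@link InputFormat} / {@link OutputFormat} pair to the Deep {@link IExtractor} contract:
 * partitions are derived from the Hadoop input splits, reads are served through a {@link RecordReader} and
 * writes go through a {@link RecordWriter}. Concrete subclasses wire the concrete formats and implement the
 * two {@code transformElement} conversions between Hadoop key/value pairs and Deep domain objects.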
 */
public abstract class GenericHadoopExtractor<T, S extends BaseConfig, K, V, kOut, vOut>
        implements IExtractor<T, S> {

    private static final Logger LOG = Logger.getLogger(GenericHadoopExtractor.class);

    protected HadoopConfig deepJobConfig;

    protected transient RecordReader<K, V> reader;

    protected transient RecordWriter<kOut, vOut> writer;

    protected transient InputFormat<K, V> inputFormat;

    protected transient OutputFormat<kOut, vOut> outputFormat;

    protected transient String jobTrackerId;

    protected transient TaskAttemptContext hadoopAttemptContext;

    protected boolean havePair = false;

    protected boolean finished = false;

    protected transient JobID jobId = null;

    {
        SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmm");
        jobTrackerId = formatter.format(new Date());
    }

    @Override
    public List<String> getPreferredLocations(Partition split) {
        return null;
    }

    @Override
    public Partition[] getPartitions(S config) {
        int id = config.getRddId();

        jobId = new JobID(jobTrackerId, id);

        Configuration conf = getHadoopConfig(config);

        JobContext jobContext = DeepSparkHadoopMapReduceUtil.newJobContext(conf, jobId);

        try {
            List<InputSplit> splits = inputFormat.getSplits(jobContext);

            Partition[] partitions = new Partition[splits.size()];
            for (int i = 0; i < splits.size(); i++) {
                partitions[i] = new NewHadoopPartition(id, i, splits.get(i));
            }

            return partitions;
        } catch (IOException | InterruptedException | RuntimeException e) {
            LOG.error("Impossible to calculate partitions " + e.getMessage());
            throw new DeepGenericException("Impossible to calculate partitions ", e);
        }
    }

    @Override
    public boolean hasNext() {
        if (!finished && !havePair) {
            try {
                finished = !reader.nextKeyValue();
            } catch (IOException | InterruptedException e) {
                LOG.error("Impossible to get hasNext " + e.getMessage());
                throw new DeepGenericException("Impossible to get hasNext ", e);
            }
            havePair = !finished;
        }
        return !finished;
    }

    @Override
    public T next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("End of stream");
        }
        havePair = false;

        try {
            return transformElement(new Tuple2<>(reader.getCurrentKey(), reader.getCurrentValue()), deepJobConfig);
        } catch (IOException | InterruptedException e) {
            LOG.error("Impossible to get next value " + e.getMessage());
            throw new DeepGenericException("Impossible to get next value ", e);
        }
    }

    @Override
    public void close() {
        try {
            if (reader != null) {
                reader.close();
            }
            if (writer != null) {
                writer.close(hadoopAttemptContext);
            }
        } catch (IOException | InterruptedException e) {
            LOG.error("Impossible to close RecordReader " + e.getMessage());
            throw new DeepGenericException("Impossible to close RecordReader ", e);
        }
    }

    private Configuration getHadoopConfig(S config) {
        deepJobConfig = initConfig(config, deepJobConfig);
        return deepJobConfig.getHadoopConfiguration();
    }

    /**
     * Transforms the current Hadoop key/value pair into a Deep domain object (read path).
     */
    public abstract T transformElement(Tuple2<K, V> tuple, DeepJobConfig<T, ? extends DeepJobConfig> config);
    @Override
    public void saveRDD(T t) {
        Tuple2<kOut, vOut> tuple = transformElement(t);
        try {
            writer.write(tuple._1(), tuple._2());
        } catch (IOException | InterruptedException e) {
            LOG.error("Impossible to saveRDD " + e.getMessage());
            throw new DeepGenericException("Impossible to saveRDD ", e);
        }
    }

    @Override
    public void initSave(S config, T first, UpdateQueryBuilder queryBuilder) {
        int id = config.getRddId();
        int partitionIndex = config.getPartitionId();

        TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
                .newTaskAttemptID(jobTrackerId, id, true, partitionIndex, 0);

        Configuration configuration = getHadoopConfig(config);

        hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
                .newTaskAttemptContext(configuration, attemptId);
        try {
            writer = outputFormat.getRecordWriter(hadoopAttemptContext);
        } catch (IOException | InterruptedException e) {
            throw new DeepGenericException(e);
        }
    }

    @Override
    public void initIterator(Partition dp, S config) {
        int id = config.getRddId();

        NewHadoopPartition split = (NewHadoopPartition) dp;

        TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
                .newTaskAttemptID(jobTrackerId, id, true, split.index(), 0);

        Configuration configuration = getHadoopConfig(config);

        TaskAttemptContext hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
                .newTaskAttemptContext(configuration, attemptId);

        try {
            reader = inputFormat.createRecordReader(split.serializableHadoopSplit().value(), hadoopAttemptContext);
            reader.initialize(split.serializableHadoopSplit().value(), hadoopAttemptContext);
        } catch (IOException | InterruptedException e) {
            throw new DeepGenericException(e);
        }
    }

    /**
     * Transforms a Deep domain object into the key/value pair expected by the configured output format (write path).
     */
    public abstract Tuple2<kOut, vOut> transformElement(T record);
}
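
/*
 * Illustrative sketch only (not part of this class, kept as a comment so the source file is unchanged):
 * a minimal subclass showing how a concrete extractor might plug into this base class. All names below
 * (TextFileExtractor, the plain-text formats and the String domain type) are hypothetical assumptions,
 * not part of the Deep API; a real extractor would use its datastore's formats and config type.
 *
 *     // Hypothetical subclass; the config type parameter only has to satisfy S extends BaseConfig.
 *     public class TextFileExtractor<S extends BaseConfig> extends GenericHadoopExtractor<String, S,
 *             LongWritable, Text, Text, NullWritable> {
 *
 *         public TextFileExtractor() {
 *             // Wire the Hadoop formats that the base class drives via getPartitions/initIterator/initSave.
 *             this.inputFormat = new TextInputFormat();
 *             this.outputFormat = new TextOutputFormat<>();
 *         }
 *
 *         @Override
 *         public String transformElement(Tuple2<LongWritable, Text> tuple,
 *                 DeepJobConfig<String, ? extends DeepJobConfig> config) {
 *             // Read path: convert the Hadoop value into the domain object.
 *             return tuple._2().toString();
 *         }
 *
 *         @Override
 *         public Tuple2<Text, NullWritable> transformElement(String record) {
 *             // Write path: convert the domain object back into an output key/value pair.
 *             return new Tuple2<>(new Text(record), NullWritable.get());
 *         }
 *     }
 */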