/*
* Copyright 2014, Stratio.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.deep.examples.java.extractor;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD;
import com.google.common.collect.Lists;
import com.stratio.deep.cassandra.extractor.CassandraCellExtractor;
import com.stratio.deep.commons.config.ExtractorConfig;
import com.stratio.deep.commons.entity.Cell;
import com.stratio.deep.commons.entity.Cells;
import com.stratio.deep.commons.extractor.utils.ExtractorConstants;
import com.stratio.deep.core.context.DeepSparkContext;
import com.stratio.deep.examples.utils.ContextProperties;
import scala.Tuple2;
/**
 * Example job that reads rows from one Cassandra table as {@link Cells}, counts how many rows
 * share each value of the {@code domain} cell, and writes one row per domain (with its count in
 * {@code num_pages}) to a second Cassandra table.
 *
 * Author: Emmanuelle Raffenne
 * Date..: 3-mar-2014
 */
public final class WritingCellToCassandra {
    private static final Logger LOG = Logger.getLogger(WritingCellToCassandra.class);

    /** Job name handed to the {@link DeepSparkContext}. */
    private static final String JOB_NAME = "java:writingCellToCassandra";
    /** Keyspace used for both the input and the output table. */
    private static final String KEYSPACE_NAME = "crawler";
    /** Table the cells are read from. */
    private static final String INPUT_TABLE_NAME = "listdomains";
    /** Table the per-domain counts are written to. */
    private static final String OUTPUT_TABLE_NAME = "newlistdomains";
    private static final String HOST = "127.0.0.1";
    private static final int CQL_PORT = 9042;
    private static final int RPC_PORT = 9160;

    private WritingCellToCassandra() {
    }

    /**
     * Application entry point.
     *
     * @param args the arguments passed to the application.
     */
    public static void main(String[] args) {
        doMain(args);
    }

    /**
     * This is the method called by both main and tests.
     *
     * <p>Creates the Deep context, runs the read / group / count / write pipeline and always stops
     * the context afterwards, including when one of the Spark actions fails.
     *
     * @param args command-line arguments, forwarded unchanged to {@link ContextProperties}, which
     *             supplies the cluster, Spark home and jars used to build the context.
     */
    public static void doMain(String[] args) {
        // Creating the Deep Context where args are Spark Master and Job Name
        ContextProperties p = new ContextProperties(args);
        DeepSparkContext deepContext =
                new DeepSparkContext(p.getCluster(), JOB_NAME, p.getSparkHome(), p.getJars());

        try {
            // --- INPUT RDD
            ExtractorConfig<Cells> inputConfig = cassandraCellConfig(INPUT_TABLE_NAME, false);
            RDD<Cells> inputRDD = deepContext.createRDD(inputConfig);

            LOG.info("Count :" + inputRDD.count());
            LOG.info("First :" + inputRDD.first());

            // Key every row by the value of its "domain" cell.
            JavaPairRDD<String, Cells> pairRDD = inputRDD.toJavaRDD()
                    .mapToPair(new PairFunction<Cells, String, Cells>() {
                        @Override
                        public Tuple2<String, Cells> call(Cells c) {
                            return new Tuple2<>((String) c.getCellByName("domain")
                                    .getCellValue(), c);
                        }
                    });

            // Count the rows of each domain by iterating the group; no intermediate list is built.
            JavaPairRDD<String, Integer> numPerKey = pairRDD.groupByKey()
                    .mapToPair(new PairFunction<Tuple2<String, Iterable<Cells>>, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(Tuple2<String, Iterable<Cells>> t) {
                            int count = 0;
                            for (Cells ignored : t._2()) {
                                count++;
                            }
                            return new Tuple2<String, Integer>(t._1(), count);
                        }
                    });

            // Turn each (domain, count) pair into an output row.
            JavaRDD<Cells> outputRDD = numPerKey.map(new Function<Tuple2<String, Integer>, Cells>() {
                @Override
                public Cells call(Tuple2<String, Integer> t) {
                    Cell c1 = Cell.create("domain", t._1(), true, false);
                    Cell c2 = Cell.create("num_pages", t._2());
                    return new Cells(OUTPUT_TABLE_NAME, c1, c2);
                }
            });

            LOG.info("Count insert:" + outputRDD.count());
            LOG.info("First insert:" + outputRDD.first());

            // --- OUTPUT RDD
            ExtractorConfig<Cells> outputConfig = cassandraCellConfig(OUTPUT_TABLE_NAME, true);
            deepContext.saveRDD(outputRDD.rdd(), outputConfig);
        } finally {
            // Stop the context even when a Spark action above throws.
            deepContext.stop();
        }
    }

    /**
     * Builds a {@link CassandraCellExtractor} configuration for a table of the example keyspace.
     *
     * @param tableName     table the configuration points at.
     * @param createOnWrite when {@code true}, {@link ExtractorConstants#CREATE_ON_WRITE} is set to
     *                      {@code true}; when {@code false} the key is not added at all.
     * @return the populated configuration.
     */
    private static ExtractorConfig<Cells> cassandraCellConfig(String tableName, boolean createOnWrite) {
        ExtractorConfig<Cells> config = new ExtractorConfig<>();
        config.setExtractorImplClass(CassandraCellExtractor.class);

        Map<String, Serializable> values = new HashMap<>();
        values.put(ExtractorConstants.KEYSPACE, KEYSPACE_NAME);
        values.put(ExtractorConstants.TABLE, tableName);
        values.put(ExtractorConstants.CQLPORT, CQL_PORT);
        values.put(ExtractorConstants.RPCPORT, RPC_PORT);
        values.put(ExtractorConstants.HOST, HOST);
        if (createOnWrite) {
            values.put(ExtractorConstants.CREATE_ON_WRITE, true);
        }
        config.setValues(values);
        return config;
    }
}