package me.test.spark;
import org.apache.spark.Accumulable;
import org.apache.spark.AccumulableParam;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.security.SecureRandom;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.IntStream;
public class SparkTest {
private static final Logger log = LoggerFactory.getLogger(SparkTest.class);
public static void main(String[] args) {
//hello();
withIterator();
//stopAll();
// cartesian();
}
/**
* First learning example: test how many jobs each thread actually runs.
*/
public static void hello() {
SparkConf conf = new SparkConf()
.setAppName("btpka3")
.setMaster("local[2]");
JavaSparkContext jsc = new JavaSparkContext(conf);
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 1, 1, 2);
JavaRDD<Integer> distData = jsc.parallelize(data);
JavaPairRDD<Integer, Integer> counts = distData.mapToPair(
new PairFunction<Integer, Integer, Integer>() {
@Override
public Tuple2<Integer, Integer> call(Integer s) {
//return new Tuple2<Integer, Integer>(s, 1);
return new Tuple2<Integer, Integer>((int) Thread.currentThread().getId(), 1);
}
})
.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer i1, Integer i2) {
return i1 + i2;
}
});
System.out.println("=======================================");
System.out.println(counts.collectAsMap());
System.out.println("---------------------------------------");
jsc.stop();
}
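/**
 * A minimal follow-up sketch to hello() above (not part of the original experiment;
 * the method name is made up): key the counts by partition id via
 * TaskContext.get().partitionId() instead of by thread id, which shows directly how
 * the records were split across tasks. Assumes Spark 1.2+ where TaskContext.get() exists.
 */
public static void helloByPartition() {
    SparkConf conf = new SparkConf()
            .setAppName("btpka3")
            .setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    JavaRDD<Integer> distData = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 1, 1, 2));
    Map<Integer, Integer> countsPerPartition = distData
            .mapToPair(new PairFunction<Integer, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(Integer s) {
                    // key = id of the partition whose task processes this record
                    return new Tuple2<Integer, Integer>(
                            org.apache.spark.TaskContext.get().partitionId(), 1);
                }
            })
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            })
            .collectAsMap();
    System.out.println("records per partition = " + countsPerPartition);
    jsc.stop();
}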
/**
* 1. Try using an Iterator as the data source.
* 2. Generate multiple records from one record.
* 3. Redistribute the generated records to other workers for execution.
*/
public static void withIterator() {
SparkConf conf = new SparkConf()
.setAppName("btpka3")
.setMaster("local[4]");
final JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<Integer> rdd = jsc.parallelize(Arrays.asList(1, 2));
rdd = rdd.flatMap(
new FlatMapFunction<Integer, Integer>() {
@Override
public Iterable<Integer> call(Integer t) throws Exception {
return new Iterable<Integer>() {
@Override
public Iterator<Integer> iterator() {
return IntStream.range(t * 100 + 0, t * 100 + 20).iterator();
}
};
}
}
);
System.out.println("======: " + rdd.collect());
// NOTICE: repartition the data, otherwise it only runs on the current worker and is never handed to other workers.
rdd = rdd.repartition(4);
JavaPairRDD<Integer, List<Integer>> counts = rdd.mapToPair(
new PairFunction<Integer, Integer, List<Integer>>() {
// key = thread id , value = number
@Override
public Tuple2<Integer, List<Integer>> call(Integer s) {
return new Tuple2<Integer, List<Integer>>(
Integer.valueOf((int) Thread.currentThread().getId()),
Arrays.asList(s));
}
})
.reduceByKey(new Function2<List<Integer>, List<Integer>, List<Integer>>() {
@Override
public List<Integer> call(List<Integer> i1, List<Integer> i2) {
List<Integer> l = new ArrayList<Integer>();
l.addAll(i1);
l.addAll(i2);
return l;
}
});
System.out.println("=======================================");
System.out.println(counts.collectAsMap());
System.out.println("---------------------------------------");
jsc.stop();
}
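/**
 * A minimal follow-up sketch to withIterator() above (not part of the original experiment;
 * the method name is made up): instead of keying by thread id, tag every generated record
 * with the index of the partition it landed in after repartition(4), using mapPartitionsWithIndex.
 */
public static void withIteratorPartitions() {
    SparkConf conf = new SparkConf()
            .setAppName("btpka3")
            .setMaster("local[4]");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    JavaRDD<Integer> rdd = jsc.parallelize(Arrays.asList(1, 2))
            .flatMap(new FlatMapFunction<Integer, Integer>() {
                @Override
                public Iterable<Integer> call(Integer t) throws Exception {
                    // expand one record into 20 records, as in withIterator()
                    List<Integer> out = new ArrayList<Integer>();
                    for (int i = t * 100; i < t * 100 + 20; i++) {
                        out.add(i);
                    }
                    return out;
                }
            })
            .repartition(4);
    // tag each value with the index of the partition that holds it
    List<String> tagged = rdd.mapPartitionsWithIndex(
            new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
                @Override
                public Iterator<String> call(Integer partitionIndex, Iterator<Integer> it) {
                    List<String> out = new ArrayList<String>();
                    while (it.hasNext()) {
                        out.add(partitionIndex + ":" + it.next());
                    }
                    return out.iterator();
                }
            }, false).collect();
    System.out.println("partitionIndex:value pairs = " + tagged);
    jsc.stop();
}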
/**
* How to quickly terminate the other workers?
*/
public static void stopAll() {
//String master = "local[4]";
String master = "spark://127.0.0.1:7077";
SparkConf conf = new SparkConf()
.setAppName("btpka3")
.set("spark.driver.cores", "1")
.set("spark.driver.memory", "5120m")
.set("spark.executor.memory", "512m")
.setMaster(master);
//SparkConf conf = new SparkConf();
final JavaSparkContext jsc = new JavaSparkContext(conf);
final Accumulator<Integer> acc1 = jsc.accumulator(0);
final Accumulator<Integer> acc2 = jsc.accumulator(0);
final AtomicInteger actualSeconds = new AtomicInteger();
// Does not work: JavaRDD#foreachPartition is executed distributed, so a driver-side AtomicInteger never sees the updates.
//final AtomicInteger expectedSeconds = new AtomicInteger();
final Accumulator<Integer> expectedSeconds1 = jsc.accumulator(0);
//final Accumulator<Integer> accMap = jsc.accumulator(new HashMap<String, Integer>(), new MapAccumulator());
final Accumulable<Map<String, Integer>, Map<String, Integer>> accMap1 = jsc.accumulable(new HashMap<String, Integer>(), new MapAccumulable());
final Accumulable<Map<String, List>, Map<String, List>> accMap2 = jsc.accumulable(new HashMap<String, List>(), new MapListAccumulable());
final List<Integer> a = Arrays.asList(0);
// Prepare a batch of data and insert the sentinel (-1) at a random position; the total execution time is expected to be at most 25 seconds.
List<Integer> data = new LinkedList<>();
for (int i = 0; i < 99; i++) {
data.add(i);
}
SecureRandom r = new SecureRandom();
int pos = r.nextInt(data.size());
data.add(pos, -1);
JavaRDD<Integer> distData = jsc.parallelize(data);
// The default partition count is 4; for this test, repartition into 4*5 = 20 partitions.
// That is: with 100 records and 4 workers, each partition should hold 5 records, so 5 rounds of tasks finish the job and the longest execution time is 5*5 = 25 seconds.
// The estimated task execution time (excluding the time spent on task scheduling) is:
// i + 5*n seconds, where i is the position (index) of the sentinel within its partition and n is 0~4. Since a task that is already running cannot be interrupted, the ideal execution time is 5*(n+1) seconds.
// FIXME: how to control which records end up in each partition, i.e. how to determine the value of i?
System.out.println("11111111111111111111111111111111111111");
log.info("partition's default count = " + distData.getNumPartitions());
distData = distData.repartition(4 * 5);
System.out.println("partition's new count = " + distData.getNumPartitions());
distData.foreachPartition(new VoidFunction<Iterator<Integer>>() {
@Override
public void call(Iterator<Integer> intIterator) throws Exception {
// if (expectedSeconds.get() > 0) {
// return;
// }
int i = 0;
while (intIterator.hasNext()) {
int v = intIterator.next();
if (v < 0) {
expectedSeconds1.add(i + 1);
//expectedSeconds.set(i + 1);
}
i++;
}
}
});
new Thread() {
public void run() {
Date start = new Date();
System.out.println("----- started at : " + start);
int i = 0;
try {
while (acc2.value() == 0 && i < 60) {
i++;
Thread.sleep(500);
}
jsc.cancelAllJobs();
System.out.println("watching thread exited on success");
} catch (InterruptedException e) {
e.printStackTrace();
System.out.println("watching thread exited on error");
}
Date end = new Date();
actualSeconds.addAndGet((int) ((end.getTime() - start.getTime()) / 1000));
System.out.println("----- finished at : " + end + ", cost " + actualSeconds + " seconds");
// cancel: can it only cancel tasks that have not been scheduled yet, while tasks already running cannot be interrupted?
System.out.println("expected seconds is " + expectedSeconds1 + ", actual is " + actualSeconds);
}
}.start();
JavaRDD<Integer> counts = distData.map(new Function<Integer, Integer>() {
@Override
public Integer call(Integer v1) throws Exception {
Thread.sleep(1000);
acc1.add(1);
Map<String, Integer> m = new HashMap<String, Integer>();
m.put(String.valueOf(Thread.currentThread().getId()), 1);
accMap1.add(m);
Map<String, List> m2 = new HashMap<String, List>();
m2.put(String.valueOf(Thread.currentThread().getId()), Arrays.asList(v1));
accMap2.add(m2);
if (v1 < 0) {
acc2.add(1);
}
a.set(0, a.get(0) + 1);
return v1 + 1;
}
});
System.out.println("=======================================" + Thread.currentThread().getId());
try {
System.out.println(counts.collect());
} catch (Exception e) {
log.error("toArray err", e);
}
System.out.println("local list a = " + a);
System.out.println("acc1 = " + acc1.value());
System.out.println("accMap1 = " + accMap1);
System.out.println("accMap2 = " + accMap2);
System.out.println("---------------------------------------");
//jsc.stop();
}
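/**
 * A minimal sketch of the cancellation alternative hinted at in stopAll() above
 * (not part of the original experiment; the group name and method name are made up):
 * submit the job inside a named job group and cancel only that group.
 * interruptOnCancel = true asks Spark to interrupt the threads of running tasks,
 * which is the closest the API gets to stopping work that is already executing.
 */
public static void stopByJobGroup() throws InterruptedException {
    SparkConf conf = new SparkConf()
            .setAppName("btpka3")
            .setMaster("local[4]");
    final JavaSparkContext jsc = new JavaSparkContext(conf);
    final JavaRDD<Integer> slow = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8), 8);
    // Declared in the static method (not inside the Thread) so the closure has no
    // non-serializable enclosing instance.
    final Function<Integer, Integer> slowIdentity = new Function<Integer, Integer>() {
        @Override
        public Integer call(Integer v1) throws Exception {
            Thread.sleep(5000); // simulate slow work
            return v1;
        }
    };
    Thread job = new Thread() {
        public void run() {
            // setJobGroup is per-thread, so it must be called on the thread that submits the job
            jsc.setJobGroup("stop-all-demo", "cancellable long-running job", true);
            try {
                slow.map(slowIdentity).collect();
            } catch (Exception e) {
                log.info("job cancelled: " + e.getMessage());
            }
        }
    };
    job.start();
    Thread.sleep(2000);                   // let a few tasks start running
    jsc.cancelJobGroup("stop-all-demo");  // cancels only this group; other jobs keep running
    job.join();
    jsc.stop();
}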
/**
* Cartesian product.
*/
public static void cartesian() {
SparkConf conf = new SparkConf()
.setAppName("btpka3")
.setMaster("local[2]");
JavaSparkContext jsc = new JavaSparkContext(conf);
JavaRDD<String> rdd1 = jsc.parallelize(Arrays.asList("a", "b", "c"));
JavaRDD<String> rdd2 = jsc.parallelize(Arrays.asList("1", "2", "3"));
JavaRDD<String> rdd3 = jsc.parallelize(Arrays.asList("x", "y", "z"));
JavaPairRDD<String, String> s1 = rdd1.cartesian(rdd2);
JavaPairRDD<Tuple2<String, String>, String> s2 = s1.cartesian(rdd3);
System.out.println("=======================================");
System.out.println(s1.collect());
System.out.println(s2.collect());
System.out.println("---------------------------------------");
jsc.stop();
}
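/**
 * A minimal follow-up sketch to cartesian() above (not part of the original experiment;
 * the method name is made up): flatten the nested ((a, 1), x) tuples produced by the
 * double cartesian into readable "a-1-x" strings.
 */
public static void cartesianFlattened() {
    SparkConf conf = new SparkConf()
            .setAppName("btpka3")
            .setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    JavaRDD<String> rdd1 = jsc.parallelize(Arrays.asList("a", "b", "c"));
    JavaRDD<String> rdd2 = jsc.parallelize(Arrays.asList("1", "2", "3"));
    JavaRDD<String> rdd3 = jsc.parallelize(Arrays.asList("x", "y", "z"));
    List<String> triples = rdd1.cartesian(rdd2).cartesian(rdd3)
            .map(new Function<Tuple2<Tuple2<String, String>, String>, String>() {
                @Override
                public String call(Tuple2<Tuple2<String, String>, String> t) {
                    return t._1()._1() + "-" + t._1()._2() + "-" + t._2();
                }
            })
            .collect();
    System.out.println(triples); // 3 * 3 * 3 = 27 combinations
    jsc.stop();
}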
// NOTICE: AccumulatorParam is a Scala trait with some of its methods already implemented; is that why the MapAccumulator below cannot implement it from Java?
// public static class MapAccumulator implements AccumulatorParam<Map<String, Integer>> {
// private Map<String, Integer> value = new HashMap<String, Integer>();
//
// @Override
// public Map addAccumulator(Map<String, Integer> t1, Map<String, Integer> t2) {
// return addInPlace(t1, t2);
// }
//
// @Override
// public Map addInPlace(Map<String, Integer> t1, Map<String, Integer> t2) {
// for (Map.Entry<String, Integer> entry : t2.entrySet()) {
// String key = entry.getKey();
// Integer value = entry.getValue();
// Integer i = t1.get(key);
// if (i == null) {
// i = 0;
// }
// if (value != null) {
// i = i + value;
// }
// t1.put(key, i);
// }
// return t1;
// }
//
// @Override
// public Map zero(Map initialValue) {
// return initialValue;
// }
// }
public static class MapAccumulable implements AccumulableParam<Map<String, Integer>, Map<String, Integer>> {
    @Override
    public Map<String, Integer> addAccumulator(Map<String, Integer> t1, Map<String, Integer> t2) {
        return addInPlace(t1, t2);
    }

    @Override
    public Map<String, Integer> addInPlace(Map<String, Integer> t1, Map<String, Integer> t2) {
        // Merge t2 into t1, summing the counts of duplicate keys.
        for (Map.Entry<String, Integer> entry : t2.entrySet()) {
            String key = entry.getKey();
            Integer value = entry.getValue();
            Integer i = t1.get(key);
            if (i == null) {
                i = 0;
            }
            if (value != null) {
                i = i + value;
            }
            t1.put(key, i);
        }
        return t1;
    }

    @Override
    public Map<String, Integer> zero(Map<String, Integer> initialValue) {
        return initialValue;
    }
}
public static class MapListAccumulable implements AccumulableParam<Map<String, List>, Map<String, List>> {
    @Override
    public Map<String, List> addAccumulator(Map<String, List> t1, Map<String, List> t2) {
        return addInPlace(t1, t2);
    }

    @Override
    public Map<String, List> addInPlace(Map<String, List> t1, Map<String, List> t2) {
        // Merge t2 into t1, concatenating the lists of duplicate keys.
        for (Map.Entry<String, List> entry : t2.entrySet()) {
            String key = entry.getKey();
            List value = entry.getValue();
            List i = t1.get(key);
            if (i == null) {
                i = new ArrayList();
            }
            if (value != null) {
                i.addAll(value);
            }
            t1.put(key, i);
        }
        return t1;
    }

    @Override
    public Map<String, List> zero(Map<String, List> initialValue) {
        return initialValue;
    }
}
}