/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.runners.spark;

import static org.junit.Assert.assertEquals;

import org.apache.beam.runners.spark.translation.EvaluationContext;
import org.apache.beam.runners.spark.translation.SparkContextFactory;
import org.apache.beam.runners.spark.translation.TransformTranslator;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.Test;

/**
 * This test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
public class CacheTest {

  @Test
  public void cacheCandidatesUpdaterTest() throws Exception {
    SparkPipelineOptions options =
        PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
    options.setRunner(TestSparkRunner.class);
    Pipeline pipeline = Pipeline.create(options);
    PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

    // First access of pCollection.
    pCollection.apply(Count.<String>globally());
    // Second access of pCollection: because the same PCollection is consumed twice, the Spark
    // runner caches the RDD that represents it so it is only computed once.
    pCollection.apply(Count.<String>globally());

    // Traverse the pipeline with the CacheVisitor and verify that pCollection was registered
    // as a cache candidate with two reads.
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
    SparkRunner.CacheVisitor cacheVisitor =
        new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
    pipeline.traverseTopologically(cacheVisitor);
    assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  }
}