/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io.gcp.datastore; import static org.apache.beam.sdk.io.gcp.datastore.DatastoreV1.Read.NUM_QUERY_SPLITS_MIN; import static org.junit.Assert.assertEquals; import com.google.datastore.v1.Query; import java.util.List; import javax.annotation.Nullable; import org.apache.beam.sdk.io.gcp.datastore.DatastoreV1.Read.SplitQueryFn; import org.apache.beam.sdk.io.gcp.datastore.DatastoreV1.Read.V1Options; import org.apache.beam.sdk.transforms.DoFnTester; import org.apache.beam.sdk.values.KV; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; /** * Integration tests for {@link DatastoreV1.Read.SplitQueryFn}. * * <p>It is hard to mock the exact behavior of Cloud Datastore, especially for the statistics * queries. Also the fact that DatastoreIO falls back gracefully when querying statistics fails, * makes it hard to catch these issues in production. This test here ensures we interact with * the Cloud Datastore directly, query the actual stats and verify that the SplitQueryFn generates * the expected number of query splits. * * <p>These tests are brittle as they rely on statistics data in Cloud Datastore. If the data * gets lost or changes then they will begin failing and this test should be disabled. * At the time of writing, the Cloud Datastore has the following statistics, * <ul> * <li>kind = sort_1G, entity_bytes = 2130000000, count = 10000000 * <li>kind = shakespeare, entity_bytes = 26383451, count = 172948 * </ul> */ @RunWith(JUnit4.class) public class SplitQueryFnIT { /** * Tests {@link SplitQueryFn} to generate expected number of splits for a large dataset. */ @Test public void testSplitQueryFnWithLargeDataset() throws Exception { String projectId = "apache-beam-testing"; String kind = "sort_1G"; String namespace = null; // Num splits is computed based on the entity_bytes size of the input_sort_1G kind reported by // Datastore stats. int expectedNumSplits = 32; testSplitQueryFn(projectId, kind, namespace, expectedNumSplits); } /** * Tests {@link SplitQueryFn} to fallback to NUM_QUERY_SPLITS_MIN for a small dataset. */ @Test public void testSplitQueryFnWithSmallDataset() throws Exception { String projectId = "apache-beam-testing"; String kind = "shakespeare"; String namespace = null; int expectedNumSplits = NUM_QUERY_SPLITS_MIN; testSplitQueryFn(projectId, kind, namespace, expectedNumSplits); } /** * A helper method to test {@link SplitQueryFn} to generate the expected number of splits. */ private void testSplitQueryFn(String projectId, String kind, @Nullable String namespace, int expectedNumSplits) throws Exception { Query.Builder query = Query.newBuilder(); query.addKindBuilder().setName(kind); SplitQueryFn splitQueryFn = new SplitQueryFn( V1Options.from(projectId, namespace, null), 0); DoFnTester<Query, KV<Integer, Query>> doFnTester = DoFnTester.of(splitQueryFn); List<KV<Integer, Query>> queries = doFnTester.processBundle(query.build()); assertEquals(queries.size(), expectedNumSplits); } // TODO (vikasrk): Create datasets under a different namespace and add tests. }