/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io.gcp.datastore; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Verify.verify; import static com.google.datastore.v1.PropertyFilter.Operator.EQUAL; import static com.google.datastore.v1.PropertyOrder.Direction.DESCENDING; import static com.google.datastore.v1.QueryResultBatch.MoreResultsType.NOT_FINISHED; import static com.google.datastore.v1.client.DatastoreHelper.makeAndFilter; import static com.google.datastore.v1.client.DatastoreHelper.makeDelete; import static com.google.datastore.v1.client.DatastoreHelper.makeFilter; import static com.google.datastore.v1.client.DatastoreHelper.makeOrder; import static com.google.datastore.v1.client.DatastoreHelper.makeUpsert; import static com.google.datastore.v1.client.DatastoreHelper.makeValue; import com.google.api.client.http.HttpRequestInitializer; import com.google.auth.Credentials; import com.google.auth.http.HttpCredentialsAdapter; import com.google.auto.value.AutoValue; import com.google.cloud.hadoop.util.ChainingHttpRequestInitializer; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.MoreObjects; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.datastore.v1.CommitRequest; import com.google.datastore.v1.Entity; import com.google.datastore.v1.EntityResult; import com.google.datastore.v1.GqlQuery; import com.google.datastore.v1.Key; import com.google.datastore.v1.Key.PathElement; import com.google.datastore.v1.Mutation; import com.google.datastore.v1.PartitionId; import com.google.datastore.v1.Query; import com.google.datastore.v1.QueryResultBatch; import com.google.datastore.v1.RunQueryRequest; import com.google.datastore.v1.RunQueryResponse; import com.google.datastore.v1.client.Datastore; import com.google.datastore.v1.client.DatastoreException; import com.google.datastore.v1.client.DatastoreFactory; import com.google.datastore.v1.client.DatastoreHelper; import com.google.datastore.v1.client.DatastoreOptions; import com.google.datastore.v1.client.QuerySplitter; import com.google.protobuf.Int32Value; import com.google.rpc.Code; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; import javax.annotation.Nullable; import org.apache.beam.sdk.PipelineRunner; import org.apache.beam.sdk.annotations.Experimental; import org.apache.beam.sdk.annotations.Experimental.Kind; import org.apache.beam.sdk.coders.SerializableCoder; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.options.PipelineOptions; import 
org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.Flatten; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SimpleFunction; import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.transforms.display.DisplayData.Builder; import org.apache.beam.sdk.transforms.display.HasDisplayData; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.sdk.util.BackOffUtils; import org.apache.beam.sdk.util.FluentBackoff; import org.apache.beam.sdk.util.RetryHttpRequestInitializer; import org.apache.beam.sdk.util.Sleeper; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PDone; import org.apache.beam.sdk.values.TypeDescriptor; import org.joda.time.Duration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * {@link DatastoreV1} provides an API to Read, Write and Delete {@link PCollection PCollections} * of <a href="https://developers.google.com/datastore/">Google Cloud Datastore</a> version v1 * {@link Entity} objects. Read is only supported for Bounded PCollections while Write and Delete * are supported for both Bounded and Unbounded PCollections. * * <p>This API currently requires an authentication workaround. To use {@link DatastoreV1}, users * must use the {@code gcloud} command line tool to get credentials for Cloud Datastore: * <pre> * $ gcloud auth login * </pre> * * <p>To read a {@link PCollection} from a query to Cloud Datastore, use {@link DatastoreV1#read} * and its methods {@link DatastoreV1.Read#withProjectId} and {@link DatastoreV1.Read#withQuery} to * specify the project to query and the query to read from. You can optionally provide a namespace * to query within using {@link DatastoreV1.Read#withNamespace}. You could also optionally specify * how many splits you want for the query using {@link DatastoreV1.Read#withNumQuerySplits}. * * <p>For example: * * <pre> {@code * // Read a query from Datastore * PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create(); * Query query = ...; * String projectId = "..."; * * Pipeline p = Pipeline.create(options); * PCollection<Entity> entities = p.apply( * DatastoreIO.v1().read() * .withProjectId(projectId) * .withQuery(query)); * } </pre> * * <p><b>Note:</b> A runner may read from Cloud Datastore in parallel across * many workers. However, when the {@link Query} is configured with a limit using * {@link com.google.datastore.v1.Query.Builder#setLimit(Int32Value)} or if the Query contains * inequality filters like {@code GREATER_THAN, LESS_THAN} etc., then all returned results * will be read by a single worker in order to ensure correct data. Since data is read from * a single worker, this could have a significant impact on the performance of the job. 
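 *
 * <p>A query can also be provided as a GQL string via {@link DatastoreV1.Read#withLiteralGqlQuery}.
 * The following is a minimal sketch; the kind name {@code "MyKind"} is a placeholder:
 *
 * <pre> {@code
 * PCollection<Entity> entities = p.apply(
 *     DatastoreIO.v1().read()
 *         .withProjectId(projectId)
 *         .withLiteralGqlQuery("SELECT * FROM MyKind"));
 * } </pre>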
* * <p>To write a {@link PCollection} to a Cloud Datastore, use {@link DatastoreV1#write}, * specifying the Cloud Datastore project to write to: * * <pre> {@code * PCollection<Entity> entities = ...; * entities.apply(DatastoreIO.v1().write().withProjectId(projectId)); * p.run(); * } </pre> * * <p>To delete a {@link PCollection} of {@link Entity Entities} from Cloud Datastore, use * {@link DatastoreV1#deleteEntity()}, specifying the Cloud Datastore project to delete from: * * <pre> {@code * PCollection<Entity> entities = ...; * entities.apply(DatastoreIO.v1().deleteEntity().withProjectId(projectId)); * p.run(); * } </pre> * * <p>To delete entities associated with a {@link PCollection} of {@link Key Keys} from Cloud * Datastore, use {@link DatastoreV1#deleteKey}, specifying the Cloud Datastore project to delete from: * * <pre> {@code * PCollection<Key> keys = ...; * keys.apply(DatastoreIO.v1().deleteKey().withProjectId(projectId)); * p.run(); * } </pre> * * <p>{@link Entity Entities} in the {@code PCollection} to be written or deleted must have complete * {@link Key Keys}. A {@code Key} is complete when its last path element has either a {@code name} or an * {@code id}; incomplete {@code Keys} have neither. A {@code namespace} other than the project * default may be used by specifying it in the {@code Entity} {@code Keys}. * * <pre>{@code * Key.Builder keyBuilder = DatastoreHelper.makeKey(...); * keyBuilder.getPartitionIdBuilder().setNamespace(namespace); * }</pre> * * <p>{@code Entities} will be committed as upsert (update or insert) or delete mutations. Please * read <a href="https://cloud.google.com/datastore/docs/concepts/entities">Entities, Properties, * and Keys</a> for more information about {@code Entity} keys. * * <h3>Permissions</h3> * Permission requirements depend on the {@code PipelineRunner} that is used to execute the * pipeline. Please refer to the documentation of the corresponding {@code PipelineRunner}s for * more details. * * <p>Please see <a href="https://cloud.google.com/datastore/docs/activate">Cloud Datastore Sign Up * </a> for security and permission related information specific to Cloud Datastore. * * <p>Optionally, a Cloud Datastore v1 Emulator running locally can be used for testing purposes * by providing the host and port information through {@code withLocalhost("host:port")} for any of the * above transforms. In that case, all Cloud Datastore API calls are directed to the Emulator. * * @see PipelineRunner */ public class DatastoreV1 { // A package-private constructor to prevent direct instantiation from outside of this package DatastoreV1() {} /** * Cloud Datastore has a limit of 500 mutations per batch operation, so we flush * changes to Datastore every 500 entities. */ @VisibleForTesting static final int DATASTORE_BATCH_UPDATE_LIMIT = 500; /** * Cloud Datastore has a limit of 10MB per RPC, so we also flush if the total size of mutations * exceeds this limit. This is set lower than the 10MB limit on the RPC, as this only accounts for * the mutations themselves and not the CommitRequest wrapper around them. */ @VisibleForTesting static final int DATASTORE_BATCH_UPDATE_BYTES_LIMIT = 5_000_000; /** * Returns an empty {@link DatastoreV1.Read} builder. Configure the source {@code projectId}, * {@code query}, and optionally {@code namespace} and {@code numQuerySplits} using * {@link DatastoreV1.Read#withProjectId}, {@link DatastoreV1.Read#withQuery}, * {@link DatastoreV1.Read#withNamespace}, {@link DatastoreV1.Read#withNumQuerySplits}.
*/ public DatastoreV1.Read read() { return new AutoValue_DatastoreV1_Read.Builder().setNumQuerySplits(0).build(); } /** * A {@link PTransform} that reads the result rows of a Cloud Datastore query as {@code Entity} * objects. * * @see DatastoreIO */ @AutoValue public abstract static class Read extends PTransform<PBegin, PCollection<Entity>> { private static final Logger LOG = LoggerFactory.getLogger(Read.class); /** An upper bound on the number of splits for a query. */ public static final int NUM_QUERY_SPLITS_MAX = 50000; /** A lower bound on the number of splits for a query. */ static final int NUM_QUERY_SPLITS_MIN = 12; /** Default bundle size of 64MB. */ static final long DEFAULT_BUNDLE_SIZE_BYTES = 64 * 1024 * 1024; /** * Maximum number of results to request per query. * * <p>Must be set, or it may result in an I/O error when querying Cloud Datastore. */ static final int QUERY_BATCH_LIMIT = 500; @Nullable public abstract ValueProvider<String> getProjectId(); @Nullable public abstract Query getQuery(); @Nullable public abstract ValueProvider<String> getLiteralGqlQuery(); @Nullable public abstract ValueProvider<String> getNamespace(); public abstract int getNumQuerySplits(); @Nullable public abstract String getLocalhost(); @Override public abstract String toString(); abstract Builder toBuilder(); @AutoValue.Builder abstract static class Builder { abstract Builder setProjectId(ValueProvider<String> projectId); abstract Builder setQuery(Query query); abstract Builder setLiteralGqlQuery(ValueProvider<String> literalGqlQuery); abstract Builder setNamespace(ValueProvider<String> namespace); abstract Builder setNumQuerySplits(int numQuerySplits); abstract Builder setLocalhost(String localhost); abstract Read build(); } /** * Computes the number of splits to be performed on the given query by querying the estimated * size from Cloud Datastore. */ static int getEstimatedNumSplits(Datastore datastore, Query query, @Nullable String namespace) { int numSplits; try { long estimatedSizeBytes = getEstimatedSizeBytes(datastore, query, namespace); LOG.info("Estimated size bytes for the query is: {}", estimatedSizeBytes); numSplits = (int) Math.min(NUM_QUERY_SPLITS_MAX, Math.round(((double) estimatedSizeBytes) / DEFAULT_BUNDLE_SIZE_BYTES)); } catch (Exception e) { LOG.warn("Failed to fetch estimatedSizeBytes for query: {}", query, e); // Fallback in case estimated size is unavailable. numSplits = NUM_QUERY_SPLITS_MIN; } return Math.max(numSplits, NUM_QUERY_SPLITS_MIN); } /** * Cloud Datastore system tables with statistics are periodically updated. This method fetches * the latest timestamp (in microseconds) of the statistics update using the {@code __Stat_Total__} * table. */ private static long queryLatestStatisticsTimestamp(Datastore datastore, @Nullable String namespace) throws DatastoreException { Query.Builder query = Query.newBuilder(); // Note: namespace either being null or empty represents the default namespace, in which // case we treat it as not provided by the user.
if (Strings.isNullOrEmpty(namespace)) { query.addKindBuilder().setName("__Stat_Total__"); } else { query.addKindBuilder().setName("__Stat_Ns_Total__"); } query.addOrder(makeOrder("timestamp", DESCENDING)); query.setLimit(Int32Value.newBuilder().setValue(1)); RunQueryRequest request = makeRequest(query.build(), namespace); RunQueryResponse response = datastore.runQuery(request); QueryResultBatch batch = response.getBatch(); if (batch.getEntityResultsCount() == 0) { throw new NoSuchElementException( "Datastore total statistics unavailable"); } Entity entity = batch.getEntityResults(0).getEntity(); return entity.getProperties().get("timestamp").getTimestampValue().getSeconds() * 1000000; } /** * Get the estimated size of the data returned by the given query. * * <p>Cloud Datastore provides no way to get a good estimate of how large the result of a query * will be. As a rough approximation, we fetch the statistics of the entire * entity kind being queried, using the __Stat_Kind__ system table, assuming exactly 1 kind * is specified in the query. * * <p>See https://cloud.google.com/datastore/docs/concepts/stats. */ static long getEstimatedSizeBytes(Datastore datastore, Query query, @Nullable String namespace) throws DatastoreException { String ourKind = query.getKind(0).getName(); long latestTimestamp = queryLatestStatisticsTimestamp(datastore, namespace); LOG.info("Latest stats timestamp for kind {} is {}", ourKind, latestTimestamp); Query.Builder queryBuilder = Query.newBuilder(); if (Strings.isNullOrEmpty(namespace)) { queryBuilder.addKindBuilder().setName("__Stat_Kind__"); } else { queryBuilder.addKindBuilder().setName("__Stat_Ns_Kind__"); } queryBuilder.setFilter(makeAndFilter( makeFilter("kind_name", EQUAL, makeValue(ourKind).build()).build(), makeFilter("timestamp", EQUAL, makeValue(latestTimestamp).build()).build())); RunQueryRequest request = makeRequest(queryBuilder.build(), namespace); long now = System.currentTimeMillis(); RunQueryResponse response = datastore.runQuery(request); LOG.debug("Query for per-kind statistics took {}ms", System.currentTimeMillis() - now); QueryResultBatch batch = response.getBatch(); if (batch.getEntityResultsCount() == 0) { throw new NoSuchElementException( "Datastore statistics for kind " + ourKind + " unavailable"); } Entity entity = batch.getEntityResults(0).getEntity(); return entity.getProperties().get("entity_bytes").getIntegerValue(); } private static PartitionId.Builder forNamespace(@Nullable String namespace) { PartitionId.Builder partitionBuilder = PartitionId.newBuilder(); // Namespace either being null or empty represents the default namespace. // Datastore Client libraries expect users to not set the namespace proto field in // either of these cases. if (!Strings.isNullOrEmpty(namespace)) { partitionBuilder.setNamespaceId(namespace); } return partitionBuilder; } /** Builds a {@link RunQueryRequest} from the {@code query} and {@code namespace}. */ static RunQueryRequest makeRequest(Query query, @Nullable String namespace) { return RunQueryRequest.newBuilder() .setQuery(query) .setPartitionId(forNamespace(namespace)).build(); } /** Builds a {@link RunQueryRequest} from the {@code GqlQuery} and {@code namespace}. */ @VisibleForTesting static RunQueryRequest makeRequest(GqlQuery gqlQuery, @Nullable String namespace) { return RunQueryRequest.newBuilder() .setGqlQuery(gqlQuery) .setPartitionId(forNamespace(namespace)).build(); } /** * A helper function to get the split queries, taking into account the optional * {@code namespace}.
*/ private static List<Query> splitQuery(Query query, @Nullable String namespace, Datastore datastore, QuerySplitter querySplitter, int numSplits) throws DatastoreException { // If namespace is set, include it in the split request so splits are calculated accordingly. return querySplitter.getSplits(query, forNamespace(namespace).build(), numSplits, datastore); } /** * Translates a Cloud Datastore gql query string to {@link Query}. * * <p>Currently, the only way to translate a gql query string to a Query is to run the query * against Cloud Datastore and extract the {@code Query} from the response. To prevent reading * any data, we set the {@code LIMIT} to 0 but if the gql query already has a limit set, we * catch the exception with {@code INVALID_ARGUMENT} error code and retry the translation * without the zero limit. * * <p>Note: This may result in reading actual data from Cloud Datastore but the service has a * cap on the number of entities returned for a single rpc request, so this should not be a * problem in practice. */ @VisibleForTesting static Query translateGqlQueryWithLimitCheck(String gql, Datastore datastore, String namespace) throws DatastoreException { String gqlQueryWithZeroLimit = gql + " LIMIT 0"; try { Query translatedQuery = translateGqlQuery(gqlQueryWithZeroLimit, datastore, namespace); // Clear the limit that we set. return translatedQuery.toBuilder().clearLimit().build(); } catch (DatastoreException e) { // Note: There is no specific error code or message to detect if the query already has a // limit, so we just check for INVALID_ARGUMENT and assume that the query might have // a limit already set. if (e.getCode() == Code.INVALID_ARGUMENT) { LOG.warn("Failed to translate Gql query '{}': {}", gqlQueryWithZeroLimit, e.getMessage()); LOG.warn("User query might have a limit already set, so trying without zero limit"); // Retry without the zero limit. return translateGqlQuery(gql, datastore, namespace); } else { throw e; } } } /** Translates a gql query string to {@link Query}. */ private static Query translateGqlQuery(String gql, Datastore datastore, String namespace) throws DatastoreException { GqlQuery gqlQuery = GqlQuery.newBuilder().setQueryString(gql) .setAllowLiterals(true).build(); RunQueryRequest req = makeRequest(gqlQuery, namespace); return datastore.runQuery(req).getQuery(); } /** * Returns a new {@link DatastoreV1.Read} that reads from the Cloud Datastore for the specified * project. */ public DatastoreV1.Read withProjectId(String projectId) { checkNotNull(projectId, "projectId"); return toBuilder().setProjectId(StaticValueProvider.of(projectId)).build(); } /** * Same as {@link Read#withProjectId(String)} but with a {@link ValueProvider}. */ public DatastoreV1.Read withProjectId(ValueProvider<String> projectId) { checkNotNull(projectId, "projectId"); return toBuilder().setProjectId(projectId).build(); } /** * Returns a new {@link DatastoreV1.Read} that reads the results of the specified query. * * <p><b>Note:</b> Normally, {@code DatastoreIO} will read from Cloud Datastore in parallel * across many workers. However, when the {@link Query} is configured with a limit using * {@link Query.Builder#setLimit}, then all results will be read by a single worker in order * to ensure correct results.
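 *
 * <p>For example, a simple kind-only query can be built with the proto builders. This is a
 * minimal sketch; {@code "MyKind"} is a placeholder kind name:
 *
 * <pre> {@code
 * Query.Builder queryBuilder = Query.newBuilder();
 * queryBuilder.addKindBuilder().setName("MyKind");
 * Query query = queryBuilder.build();
 * } </pre>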
*/ public DatastoreV1.Read withQuery(Query query) { checkNotNull(query, "query"); checkArgument(!query.hasLimit() || query.getLimit().getValue() > 0, "Invalid query limit %s: must be positive", query.getLimit().getValue()); return toBuilder().setQuery(query).build(); } /** * Returns a new {@link DatastoreV1.Read} that reads the results of the specified GQL query. * See <a href="https://cloud.google.com/datastore/docs/reference/gql_reference">GQL Reference * </a> to learn more about the GQL grammar. * * <p><b><i>Note:</i></b> This query is executed with literals allowed, so users should * ensure that the query originates from trusted sources to avoid security * vulnerabilities such as SQL injection. * * <p><b><i>Experimental</i></b>: Cloud Datastore does not provide a clean way to translate * a gql query string to {@link Query}, so we end up making a query to the service for * translation but this may read the actual data, although it will be a small amount. * It needs more validation through production use cases before marking it as stable. */ @Experimental(Kind.SOURCE_SINK) public DatastoreV1.Read withLiteralGqlQuery(String gqlQuery) { checkNotNull(gqlQuery, "gqlQuery"); return toBuilder().setLiteralGqlQuery(StaticValueProvider.of(gqlQuery)).build(); } /** * Same as {@link Read#withLiteralGqlQuery(String)} but with a {@link ValueProvider}. */ @Experimental(Kind.SOURCE_SINK) public DatastoreV1.Read withLiteralGqlQuery(ValueProvider<String> gqlQuery) { checkNotNull(gqlQuery, "gqlQuery"); return toBuilder().setLiteralGqlQuery(gqlQuery).build(); } /** * Returns a new {@link DatastoreV1.Read} that reads from the given namespace. */ public DatastoreV1.Read withNamespace(String namespace) { return toBuilder().setNamespace(StaticValueProvider.of(namespace)).build(); } /** * Same as {@link Read#withNamespace(String)} but with a {@link ValueProvider}. */ public DatastoreV1.Read withNamespace(ValueProvider<String> namespace) { return toBuilder().setNamespace(namespace).build(); } /** * Returns a new {@link DatastoreV1.Read} that reads by splitting the given {@code query} into * {@code numQuerySplits}. * * <p>The semantics for query splitting are defined below: * <ul> * <li>Any value less than or equal to 0 will be ignored, and the number of splits will be * chosen dynamically at runtime based on the query data size. * <li>Any value greater than {@link Read#NUM_QUERY_SPLITS_MAX} will be capped at * {@code NUM_QUERY_SPLITS_MAX}. * <li>If the {@code query} has a user limit set, then {@code numQuerySplits} will be * ignored and no split will be performed. * <li>In some cases Cloud Datastore is unable to split a query into the requested number of * splits. In such cases we just use whatever Cloud Datastore returns. * </ul> */ public DatastoreV1.Read withNumQuerySplits(int numQuerySplits) { return toBuilder() .setNumQuerySplits(Math.min(Math.max(numQuerySplits, 0), NUM_QUERY_SPLITS_MAX)) .build(); } /** * Returns a new {@link DatastoreV1.Read} that reads from a Datastore Emulator running at the * given localhost address. */ public DatastoreV1.Read withLocalhost(String localhost) { return toBuilder() .setLocalhost(localhost) .build(); } @Override public PCollection<Entity> expand(PBegin input) { V1Options v1Options = V1Options.from(getProjectId(), getNamespace(), getLocalhost()); /* * This composite transform involves the following steps: * 1.
Create a singleton of the user-provided {@code query}, or, if a {@code gqlQuery} is * provided, apply a {@link ParDo} that translates the {@code gqlQuery} into a {@code query}. * * 2. A {@link ParDo} splits the resulting query into {@code numQuerySplits} and * assigns each split query a unique {@code Integer} as the key. The resulting output is * of the type {@code PCollection<KV<Integer, Query>>}. * * If the value of {@code numQuerySplits} is less than or equal to 0, then the number of * splits will be computed dynamically based on the size of the data for the {@code query}. * * 3. The resulting {@code PCollection} is sharded using a {@link GroupByKey} operation. The * queries are extracted from the {@code KV<Integer, Iterable<Query>>} and flattened to * output a {@code PCollection<Query>}. * * 4. Finally, a {@code ParDo} reads entities for each query and outputs * a {@code PCollection<Entity>}. */ PCollection<Query> inputQuery; if (getQuery() != null) { inputQuery = input.apply(Create.of(getQuery())); } else { inputQuery = input .apply(Create.of(getLiteralGqlQuery()) .withCoder(SerializableCoder.of(new TypeDescriptor<ValueProvider<String>>() {}))) .apply(ParDo.of(new GqlQueryTranslateFn(v1Options))); } PCollection<KV<Integer, Query>> splitQueries = inputQuery .apply(ParDo.of(new SplitQueryFn(v1Options, getNumQuerySplits()))); PCollection<Query> shardedQueries = splitQueries .apply(GroupByKey.<Integer, Query>create()) .apply(Values.<Iterable<Query>>create()) .apply(Flatten.<Query>iterables()); PCollection<Entity> entities = shardedQueries .apply(ParDo.of(new ReadFn(v1Options))); return entities; } @Override public void validate(PipelineOptions options) { checkNotNull(getProjectId(), "projectId"); if (getProjectId().isAccessible() && getProjectId().get() == null) { throw new IllegalArgumentException("Project id cannot be null"); } if (getQuery() == null && getLiteralGqlQuery() == null) { throw new IllegalArgumentException( "Either query or gql query ValueProvider should be provided"); } if (getQuery() != null && getLiteralGqlQuery() != null) { throw new IllegalArgumentException( "Only one of query or gql query ValueProvider should be provided"); } if (getLiteralGqlQuery() != null && getLiteralGqlQuery().isAccessible()) { checkNotNull(getLiteralGqlQuery().get(), "gqlQuery"); } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); String query = getQuery() == null ?
null : getQuery().toString(); builder .addIfNotNull(DisplayData.item("projectId", getProjectId()) .withLabel("ProjectId")) .addIfNotNull(DisplayData.item("namespace", getNamespace()) .withLabel("Namespace")) .addIfNotNull(DisplayData.item("query", query) .withLabel("Query")) .addIfNotNull(DisplayData.item("gqlQuery", getLiteralGqlQuery()) .withLabel("GqlQuery")); } @VisibleForTesting static class V1Options implements HasDisplayData, Serializable { private final ValueProvider<String> project; @Nullable private final ValueProvider<String> namespace; @Nullable private final String localhost; private V1Options(ValueProvider<String> project, ValueProvider<String> namespace, String localhost) { this.project = project; this.namespace = namespace; this.localhost = localhost; } public static V1Options from(String projectId, String namespace, String localhost) { return from(StaticValueProvider.of(projectId), StaticValueProvider.of(namespace), localhost); } public static V1Options from(ValueProvider<String> project, ValueProvider<String> namespace, String localhost) { return new V1Options(project, namespace, localhost); } public String getProjectId() { return project.get(); } @Nullable public String getNamespace() { return namespace == null ? null : namespace.get(); } public ValueProvider<String> getProjectValueProvider() { return project; } @Nullable public ValueProvider<String> getNamespaceValueProvider() { return namespace; } @Nullable public String getLocalhost() { return localhost; } @Override public void populateDisplayData(DisplayData.Builder builder) { builder .addIfNotNull(DisplayData.item("projectId", getProjectValueProvider()) .withLabel("ProjectId")) .addIfNotNull(DisplayData.item("namespace", getNamespaceValueProvider()) .withLabel("Namespace")); } } /** * A DoFn that translates a Cloud Datastore gql query string to {@code Query}. */ static class GqlQueryTranslateFn extends DoFn<ValueProvider<String>, Query> { private final V1Options v1Options; private transient Datastore datastore; private final V1DatastoreFactory datastoreFactory; GqlQueryTranslateFn(V1Options options) { this(options, new V1DatastoreFactory()); } GqlQueryTranslateFn(V1Options options, V1DatastoreFactory datastoreFactory) { this.v1Options = options; this.datastoreFactory = datastoreFactory; } @StartBundle public void startBundle(StartBundleContext c) throws Exception { datastore = datastoreFactory.getDatastore(c.getPipelineOptions(), v1Options.getProjectId()); } @ProcessElement public void processElement(ProcessContext c) throws Exception { ValueProvider<String> gqlQuery = c.element(); LOG.info("User query: '{}'", gqlQuery.get()); Query query = translateGqlQueryWithLimitCheck(gqlQuery.get(), datastore, v1Options.getNamespace()); LOG.info("User gql query translated to Query({})", query); c.output(query); } } /** * A {@link DoFn} that splits a given query into multiple sub-queries, assigns them unique * keys and outputs them as {@link KV}. 
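 *
 * <p>If the query has a user-specified limit, it is not split and is output as a single
 * {@code KV} pair.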
*/ @VisibleForTesting static class SplitQueryFn extends DoFn<Query, KV<Integer, Query>> { private final V1Options options; // number of splits to make for a given query private final int numSplits; private final V1DatastoreFactory datastoreFactory; // Datastore client private transient Datastore datastore; // Query splitter private transient QuerySplitter querySplitter; public SplitQueryFn(V1Options options, int numSplits) { this(options, numSplits, new V1DatastoreFactory()); } @VisibleForTesting SplitQueryFn(V1Options options, int numSplits, V1DatastoreFactory datastoreFactory) { this.options = options; this.numSplits = numSplits; this.datastoreFactory = datastoreFactory; } @StartBundle public void startBundle(StartBundleContext c) throws Exception { datastore = datastoreFactory.getDatastore(c.getPipelineOptions(), options.getProjectId(), options.getLocalhost()); querySplitter = datastoreFactory.getQuerySplitter(); } @ProcessElement public void processElement(ProcessContext c) throws Exception { int key = 1; Query query = c.element(); // If query has a user set limit, then do not split. if (query.hasLimit()) { c.output(KV.of(key, query)); return; } int estimatedNumSplits; // Compute the estimated numSplits if numSplits is not specified by the user. if (numSplits <= 0) { estimatedNumSplits = getEstimatedNumSplits(datastore, query, options.getNamespace()); } else { estimatedNumSplits = numSplits; } LOG.info("Splitting the query into {} splits", estimatedNumSplits); List<Query> querySplits; try { querySplits = splitQuery(query, options.getNamespace(), datastore, querySplitter, estimatedNumSplits); } catch (Exception e) { LOG.warn("Unable to parallelize the given query: {}", query, e); querySplits = ImmutableList.of(query); } // assign unique keys to query splits. for (Query subquery : querySplits) { c.output(KV.of(key++, subquery)); } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.include("options", options); if (numSplits > 0) { builder.add(DisplayData.item("numQuerySplits", numSplits) .withLabel("Requested number of Query splits")); } } } /** * A {@link DoFn} that reads entities from Cloud Datastore for each query. */ @VisibleForTesting static class ReadFn extends DoFn<Query, Entity> { private final V1Options options; private final V1DatastoreFactory datastoreFactory; // Datastore client private transient Datastore datastore; public ReadFn(V1Options options) { this(options, new V1DatastoreFactory()); } @VisibleForTesting ReadFn(V1Options options, V1DatastoreFactory datastoreFactory) { this.options = options; this.datastoreFactory = datastoreFactory; } @StartBundle public void startBundle(StartBundleContext c) throws Exception { datastore = datastoreFactory.getDatastore(c.getPipelineOptions(), options.getProjectId(), options.getLocalhost()); } /** Read and output entities for the given query. */ @ProcessElement public void processElement(ProcessContext context) throws Exception { Query query = context.element(); String namespace = options.getNamespace(); int userLimit = query.hasLimit() ? 
query.getLimit().getValue() : Integer.MAX_VALUE; boolean moreResults = true; QueryResultBatch currentBatch = null; while (moreResults) { Query.Builder queryBuilder = query.toBuilder().clone(); queryBuilder.setLimit(Int32Value.newBuilder().setValue( Math.min(userLimit, QUERY_BATCH_LIMIT))); if (currentBatch != null && !currentBatch.getEndCursor().isEmpty()) { queryBuilder.setStartCursor(currentBatch.getEndCursor()); } RunQueryRequest request = makeRequest(queryBuilder.build(), namespace); RunQueryResponse response = datastore.runQuery(request); currentBatch = response.getBatch(); // MORE_RESULTS_AFTER_LIMIT is not implemented yet: // https://groups.google.com/forum/#!topic/gcd-discuss/iNs6M1jA2Vw, so // use result count to determine if more results might exist. int numFetch = currentBatch.getEntityResultsCount(); if (query.hasLimit()) { verify(userLimit >= numFetch, "Expected userLimit %s >= numFetch %s, because query limit %s must be <= userLimit", userLimit, numFetch, query.getLimit()); userLimit -= numFetch; } // output all the entities from the current batch. for (EntityResult entityResult : currentBatch.getEntityResultsList()) { context.output(entityResult.getEntity()); } // Check if we have more entities to be read. moreResults = // User-limit does not exist (so userLimit == MAX_VALUE) and/or has not been satisfied (userLimit > 0) // All indications from the API are that there are/may be more results. && ((numFetch == QUERY_BATCH_LIMIT) || (currentBatch.getMoreResults() == NOT_FINISHED)); } } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder.include("options", options); } } } /** * Returns an empty {@link DatastoreV1.Write} builder. Configure the destination * {@code projectId} using {@link DatastoreV1.Write#withProjectId}. */ public Write write() { return new Write(null, null); } /** * Returns an empty {@link DeleteEntity} builder. Configure the destination * {@code projectId} using {@link DeleteEntity#withProjectId}. */ public DeleteEntity deleteEntity() { return new DeleteEntity(null, null); } /** * Returns an empty {@link DeleteKey} builder. Configure the destination * {@code projectId} using {@link DeleteKey#withProjectId}. */ public DeleteKey deleteKey() { return new DeleteKey(null, null); } /** * A {@link PTransform} that writes {@link Entity} objects to Cloud Datastore. * * @see DatastoreIO */ public static class Write extends Mutate<Entity> { /** * Note that {@code projectId} is only {@code @Nullable} as a matter of build order, but if * it is {@code null} at instantiation time, an error will be thrown. */ Write(@Nullable ValueProvider<String> projectId, @Nullable String localhost) { super(projectId, localhost, new UpsertFn()); } /** * Returns a new {@link Write} that writes to the Cloud Datastore for the specified project. */ public Write withProjectId(String projectId) { checkNotNull(projectId, "projectId"); return withProjectId(StaticValueProvider.of(projectId)); } /** * Same as {@link Write#withProjectId(String)} but with a {@link ValueProvider}. */ public Write withProjectId(ValueProvider<String> projectId) { checkNotNull(projectId, "projectId ValueProvider"); return new Write(projectId, localhost); } /** * Returns a new {@link Write} that writes to the Cloud Datastore Emulator running locally on * the specified host port. 
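 *
 * <p>For example, assuming an emulator listening on a placeholder address
 * {@code "localhost:8081"}:
 *
 * <pre> {@code
 * entities.apply(DatastoreIO.v1().write()
 *     .withProjectId(projectId)
 *     .withLocalhost("localhost:8081"));
 * } </pre>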
*/ public Write withLocalhost(String localhost) { checkNotNull(localhost, "localhost"); return new Write(projectId, localhost); } } /** * A {@link PTransform} that deletes {@link Entity Entities} from Cloud Datastore. * * @see DatastoreIO */ public static class DeleteEntity extends Mutate<Entity> { /** * Note that {@code projectId} is only {@code @Nullable} as a matter of build order, but if * it is {@code null} at instantiation time, an error will be thrown. */ DeleteEntity(@Nullable ValueProvider<String> projectId, @Nullable String localhost) { super(projectId, localhost, new DeleteEntityFn()); } /** * Returns a new {@link DeleteEntity} that deletes entities from the Cloud Datastore for the * specified project. */ public DeleteEntity withProjectId(String projectId) { checkNotNull(projectId, "projectId"); return withProjectId(StaticValueProvider.of(projectId)); } /** * Same as {@link DeleteEntity#withProjectId(String)} but with a {@link ValueProvider}. */ public DeleteEntity withProjectId(ValueProvider<String> projectId) { checkNotNull(projectId, "projectId ValueProvider"); return new DeleteEntity(projectId, localhost); } /** * Returns a new {@link DeleteEntity} that deletes entities from the Cloud Datastore Emulator * running locally on the specified host port. */ public DeleteEntity withLocalhost(String localhost) { checkNotNull(localhost, "localhost"); return new DeleteEntity(projectId, localhost); } } /** * A {@link PTransform} that deletes {@link Entity Entities} associated with the given * {@link Key Keys} from Cloud Datastore. * * @see DatastoreIO */ public static class DeleteKey extends Mutate<Key> { /** * Note that {@code projectId} is only {@code @Nullable} as a matter of build order, but if * it is {@code null} at instantiation time, an error will be thrown. */ DeleteKey(@Nullable ValueProvider<String> projectId, @Nullable String localhost) { super(projectId, localhost, new DeleteKeyFn()); } /** * Returns a new {@link DeleteKey} that deletes entities from the Cloud Datastore for the * specified project. */ public DeleteKey withProjectId(String projectId) { checkNotNull(projectId, "projectId"); return withProjectId(StaticValueProvider.of(projectId)); } /** * Returns a new {@link DeleteKey} that deletes entities from the Cloud Datastore Emulator * running locally on the specified host port. */ public DeleteKey withLocalhost(String localhost) { checkNotNull(localhost, "localhost"); return new DeleteKey(projectId, localhost); } /** * Same as {@link DeleteKey#withProjectId(String)} but with a {@link ValueProvider}. */ public DeleteKey withProjectId(ValueProvider<String> projectId) { checkNotNull(projectId, "projectId ValueProvider"); return new DeleteKey(projectId, localhost); } } /** * A {@link PTransform} that writes mutations to Cloud Datastore. * * <p>It requires a {@link DoFn} that transforms an object of type {@code T} to a {@link Mutation}. * {@code T} is usually either an {@link Entity} or a {@link Key}. * * <p><b>Note:</b> Only idempotent Cloud Datastore mutation operations (upsert and delete) should * be used by the {@code DoFn} provided, as the commits are retried when failures occur. */ private abstract static class Mutate<T> extends PTransform<PCollection<T>, PDone> { protected ValueProvider<String> projectId; @Nullable protected String localhost; /** A function that transforms each {@code T} into a mutation.
*/ private final SimpleFunction<T, Mutation> mutationFn; /** * Note that {@code projectId} is only {@code @Nullable} as a matter of build order, but if * it is {@code null} at instantiation time, an error will be thrown. */ Mutate(@Nullable ValueProvider<String> projectId, @Nullable String localhost, SimpleFunction<T, Mutation> mutationFn) { this.projectId = projectId; this.localhost = localhost; this.mutationFn = checkNotNull(mutationFn); } @Override public PDone expand(PCollection<T> input) { input.apply("Convert to Mutation", MapElements.via(mutationFn)) .apply("Write Mutation to Datastore", ParDo.of( new DatastoreWriterFn(projectId, localhost))); return PDone.in(input.getPipeline()); } @Override public void validate(PipelineOptions options) { checkNotNull(projectId, "projectId ValueProvider"); if (projectId.isAccessible()) { checkNotNull(projectId.get(), "projectId"); } checkNotNull(mutationFn, "mutationFn"); } @Override public String toString() { return MoreObjects.toStringHelper(getClass()) .add("projectId", projectId) .add("mutationFn", mutationFn.getClass().getName()) .toString(); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("projectId", projectId) .withLabel("Output Project")) .include("mutationFn", mutationFn); } public String getProjectId() { return projectId.get(); } } /** * {@link DoFn} that writes {@link Mutation}s to Cloud Datastore. Mutations are written in * batches, where the maximum batch size is {@link DatastoreV1#DATASTORE_BATCH_UPDATE_LIMIT}. * * <p>See <a * href="https://cloud.google.com/datastore/docs/concepts/entities"> * Datastore: Entities, Properties, and Keys</a> for information about entity keys and mutations. * * <p>Commits are non-transactional. If a commit fails because of a conflict over an entity * group, the commit will be retried (up to {@link DatastoreWriterFn#MAX_RETRIES} * times). This means that the mutation operation should be idempotent. Thus, the writer should * only be used for {@code upsert} and {@code delete} mutation operations, as these are the only * two Cloud Datastore mutations that are idempotent. */ @VisibleForTesting static class DatastoreWriterFn extends DoFn<Mutation, Void> { private static final Logger LOG = LoggerFactory.getLogger(DatastoreWriterFn.class); private final ValueProvider<String> projectId; @Nullable private final String localhost; private transient Datastore datastore; private final V1DatastoreFactory datastoreFactory; // Current batch of mutations to be written. private final List<Mutation> mutations = new ArrayList<>(); private int mutationsSize = 0; // Accumulated size of protos in mutations.
private static final int MAX_RETRIES = 5; private static final FluentBackoff BUNDLE_WRITE_BACKOFF = FluentBackoff.DEFAULT .withMaxRetries(MAX_RETRIES).withInitialBackoff(Duration.standardSeconds(5)); DatastoreWriterFn(String projectId, @Nullable String localhost) { this(StaticValueProvider.of(projectId), localhost, new V1DatastoreFactory()); } DatastoreWriterFn(ValueProvider<String> projectId, @Nullable String localhost) { this(projectId, localhost, new V1DatastoreFactory()); } @VisibleForTesting DatastoreWriterFn(ValueProvider<String> projectId, @Nullable String localhost, V1DatastoreFactory datastoreFactory) { this.projectId = checkNotNull(projectId, "projectId"); this.localhost = localhost; this.datastoreFactory = datastoreFactory; } @StartBundle public void startBundle(StartBundleContext c) { datastore = datastoreFactory.getDatastore(c.getPipelineOptions(), projectId.get(), localhost); } @ProcessElement public void processElement(ProcessContext c) throws Exception { Mutation write = c.element(); int size = write.getSerializedSize(); if (mutations.size() > 0 && mutationsSize + size >= DatastoreV1.DATASTORE_BATCH_UPDATE_BYTES_LIMIT) { flushBatch(); } mutations.add(c.element()); mutationsSize += size; if (mutations.size() >= DatastoreV1.DATASTORE_BATCH_UPDATE_LIMIT) { flushBatch(); } } @FinishBundle public void finishBundle() throws Exception { if (!mutations.isEmpty()) { flushBatch(); } } /** * Writes a batch of mutations to Cloud Datastore. * * <p>If a commit fails, it will be retried up to {@link #MAX_RETRIES} times. All * mutations in the batch will be committed again, even if the commit was partially * successful. If the retry limit is exceeded, the last exception from Cloud Datastore will be * thrown. * * @throws DatastoreException if the commit fails after all retries * @throws IOException if backing off between retries fails * @throws InterruptedException if backing off between retries is interrupted */ private void flushBatch() throws DatastoreException, IOException, InterruptedException { LOG.debug("Writing batch of {} mutations", mutations.size()); Sleeper sleeper = Sleeper.DEFAULT; BackOff backoff = BUNDLE_WRITE_BACKOFF.backoff(); while (true) { // Commit the current batch of mutations. try { CommitRequest.Builder commitRequest = CommitRequest.newBuilder(); commitRequest.addAllMutations(mutations); commitRequest.setMode(CommitRequest.Mode.NON_TRANSACTIONAL); datastore.commit(commitRequest.build()); // Break if the commit threw no exception. break; } catch (DatastoreException exception) { // Only log the code and message for potentially-transient errors. The entire exception // will be propagated upon the last retry. LOG.error("Error writing to the Datastore ({}): {}", exception.getCode(), exception.getMessage()); if (!BackOffUtils.next(sleeper, backoff)) { LOG.error("Aborting after {} retries.", MAX_RETRIES); throw exception; } } } LOG.debug("Successfully wrote {} mutations", mutations.size()); mutations.clear(); mutationsSize = 0; } @Override public void populateDisplayData(Builder builder) { super.populateDisplayData(builder); builder .addIfNotNull(DisplayData.item("projectId", projectId) .withLabel("Output Project")); } } /** * Returns true if a Cloud Datastore key is complete. A key is complete if its last element * has either an id or a name.
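 *
 * <p>For example (a sketch; {@code "MyKind"} and {@code "myName"} are placeholder values):
 *
 * <pre> {@code
 * Key complete = DatastoreHelper.makeKey("MyKind", "myName").build();   // last element has a name
 * Key incomplete = DatastoreHelper.makeKey("MyKind").build();           // last element has neither id nor name
 * } </pre>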
*/ static boolean isValidKey(Key key) { List<PathElement> elementList = key.getPathList(); if (elementList.isEmpty()) { return false; } PathElement lastElement = elementList.get(elementList.size() - 1); return (lastElement.getId() != 0 || !lastElement.getName().isEmpty()); } /** * A function that constructs an upsert {@link Mutation} from an {@link Entity}. */ @VisibleForTesting static class UpsertFn extends SimpleFunction<Entity, Mutation> { @Override public Mutation apply(Entity entity) { // Verify that the entity to write has a complete key. checkArgument(isValidKey(entity.getKey()), "Entities to be written to the Cloud Datastore must have complete keys:\n%s", entity); return makeUpsert(entity).build(); } @Override public void populateDisplayData(Builder builder) { builder.add(DisplayData.item("upsertFn", this.getClass()) .withLabel("Create Upsert Mutation")); } } /** * A function that constructs a delete {@link Mutation} from an {@link Entity}. */ @VisibleForTesting static class DeleteEntityFn extends SimpleFunction<Entity, Mutation> { @Override public Mutation apply(Entity entity) { // Verify that the entity to delete has a complete key. checkArgument(isValidKey(entity.getKey()), "Entities to be deleted from the Cloud Datastore must have complete keys:\n%s", entity); return makeDelete(entity.getKey()).build(); } @Override public void populateDisplayData(Builder builder) { builder.add(DisplayData.item("deleteEntityFn", this.getClass()) .withLabel("Create Delete Mutation")); } } /** * A function that constructs a delete {@link Mutation} from a {@link Key}. */ @VisibleForTesting static class DeleteKeyFn extends SimpleFunction<Key, Mutation> { @Override public Mutation apply(Key key) { // Verify that the key to delete is complete. checkArgument(isValidKey(key), "Keys to be deleted from the Cloud Datastore must be complete:\n%s", key); return makeDelete(key).build(); } @Override public void populateDisplayData(Builder builder) { builder.add(DisplayData.item("deleteKeyFn", this.getClass()) .withLabel("Create Delete Mutation")); } } /** * A wrapper factory class for the Cloud Datastore singleton classes {@link DatastoreFactory} and * {@link QuerySplitter}. * * <p>{@link DatastoreFactory} and {@link QuerySplitter} are not Java serializable, hence * they are wrapped in this class, which implements {@link Serializable}. */ @VisibleForTesting static class V1DatastoreFactory implements Serializable { /** Builds a Cloud Datastore client for the given pipeline options and project. */ public Datastore getDatastore(PipelineOptions pipelineOptions, String projectId) { return getDatastore(pipelineOptions, projectId, null); } /** * Builds a Cloud Datastore client for the given pipeline options, project and an optional * localhost. */ public Datastore getDatastore(PipelineOptions pipelineOptions, String projectId, @Nullable String localhost) { Credentials credential = pipelineOptions.as(GcpOptions.class).getGcpCredential(); HttpRequestInitializer initializer; if (credential != null) { initializer = new ChainingHttpRequestInitializer( new HttpCredentialsAdapter(credential), new RetryHttpRequestInitializer()); } else { initializer = new RetryHttpRequestInitializer(); } DatastoreOptions.Builder builder = new DatastoreOptions.Builder() .projectId(projectId) .initializer(initializer); if (localhost != null) { builder.localHost(localhost); } else { builder.host("batch-datastore.googleapis.com"); } return DatastoreFactory.get().create(builder.build()); } /** Builds a Cloud Datastore {@link QuerySplitter}.
*/ public QuerySplitter getQuerySplitter() { return DatastoreHelper.getQuerySplitter(); } } }