/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.mongodb;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.SerializableCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * IO to read and write data on MongoDB.
 *
 * <h3>Reading from MongoDB</h3>
 *
 * <p>MongoDbIO source returns a bounded collection of MongoDB {@link Document} as a
 * {@code PCollection<Document>}.
 *
 * <p>To configure the MongoDB source, you have to provide the connection URI, the database name
 * and the collection name. The following example illustrates various options for configuring the
 * source:
 *
 * <pre>{@code
 *
 * pipeline.apply(MongoDbIO.read()
 *   .withUri("mongodb://localhost:27017")
 *   .withDatabase("my-database")
 *   .withCollection("my-collection"))
 *   // the above three are required configuration, returns PCollection<Document>
 *
 *   // the rest of the settings are optional
 *
 * }</pre>
 *
 * <p>The source also accepts an optional configuration: {@code withFilter()} allows you to
 * define a JSON filter to read a subset of the data.
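 *
 * <p>For example (an illustrative sketch: the filter uses standard MongoDB query JSON, and
 * the {@code age} field is only an example):
 *
 * <pre>{@code
 *
 * pipeline.apply(MongoDbIO.read()
 *   .withUri("mongodb://localhost:27017")
 *   .withDatabase("my-database")
 *   .withCollection("my-collection")
 *   .withFilter("{\"age\": {\"$gt\": 18}}"))
 *
 * }</pre>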
 *
 * <h3>Writing to MongoDB</h3>
 *
 * <p>MongoDB sink supports writing {@link Document} objects into a MongoDB collection.
 *
 * <p>To configure a MongoDB sink, you must specify a connection {@code URI}, a {@code Database}
 * name and a {@code Collection} name. For instance:
 *
 * <pre>{@code
 *
 * pipeline
 *   .apply(...)
 *   .apply(MongoDbIO.write()
 *     .withUri("mongodb://localhost:27017")
 *     .withDatabase("my-database")
 *     .withCollection("my-collection")
 *     .withBatchSize(1024L))
 *
 * }</pre>
 */
@Experimental
public class MongoDbIO {

  private static final Logger LOG = LoggerFactory.getLogger(MongoDbIO.class);

  /** Read data from MongoDB. */
  public static Read read() {
    return new AutoValue_MongoDbIO_Read.Builder().setNumSplits(0).build();
  }

  /** Write data to MongoDB. */
  public static Write write() {
    return new AutoValue_MongoDbIO_Write.Builder().setBatchSize(1024L).build();
  }

  private MongoDbIO() {
  }

  /**
   * A {@link PTransform} to read data from MongoDB.
   */
  @AutoValue
  public abstract static class Read extends PTransform<PBegin, PCollection<Document>> {
    @Nullable abstract String uri();
    @Nullable abstract String database();
    @Nullable abstract String collection();
    @Nullable abstract String filter();
    abstract int numSplits();
    abstract Builder toBuilder();

    @AutoValue.Builder
    abstract static class Builder {
      abstract Builder setUri(String uri);
      abstract Builder setDatabase(String database);
      abstract Builder setCollection(String collection);
      abstract Builder setFilter(String filter);
      abstract Builder setNumSplits(int numSplits);
      abstract Read build();
    }

    /**
     * Define the connection URI of the MongoDB instance(s) to read from,
     * e.g. {@code mongodb://localhost:27017}.
     */
    public Read withUri(String uri) {
      checkNotNull(uri);
      return toBuilder().setUri(uri).build();
    }

    /** Define the name of the database to read from. */
    public Read withDatabase(String database) {
      checkNotNull(database);
      return toBuilder().setDatabase(database).build();
    }

    /** Define the name of the collection to read from. */
    public Read withCollection(String collection) {
      checkNotNull(collection);
      return toBuilder().setCollection(collection).build();
    }

    /** Define a JSON filter selecting the subset of documents to read. */
    public Read withFilter(String filter) {
      checkNotNull(filter);
      return toBuilder().setFilter(filter).build();
    }

    /**
     * Define the desired number of splits for the source. When unset (or set to 0), the
     * number of splits is derived from the bundle size desired by the runner.
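     *
     * <p>For instance (an illustrative sketch, the value 30 being arbitrary), requesting
     * 30 splits means each split covers roughly 1/30th of the estimated collection size:
     *
     * <pre>{@code
     * MongoDbIO.read()
     *   .withUri("mongodb://localhost:27017")
     *   .withDatabase("my-database")
     *   .withCollection("my-collection")
     *   .withNumSplits(30)
     * }</pre>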
     */
    public Read withNumSplits(int numSplits) {
      checkArgument(numSplits >= 0);
      return toBuilder().setNumSplits(numSplits).build();
    }

    @Override
    public PCollection<Document> expand(PBegin input) {
      return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedMongoDbSource(this)));
    }

    @Override
    public void validate(PipelineOptions options) {
      checkNotNull(uri(), "uri");
      checkNotNull(database(), "database");
      checkNotNull(collection(), "collection");
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      super.populateDisplayData(builder);
      builder.add(DisplayData.item("uri", uri()));
      builder.add(DisplayData.item("database", database()));
      builder.add(DisplayData.item("collection", collection()));
      builder.addIfNotNull(DisplayData.item("filter", filter()));
      builder.add(DisplayData.item("numSplit", numSplits()));
    }
  }

  /**
   * A MongoDB {@link BoundedSource} reading {@link Document} from a given instance.
   */
  @VisibleForTesting
  static class BoundedMongoDbSource extends BoundedSource<Document> {
    private final Read spec;

    private BoundedMongoDbSource(Read spec) {
      this.spec = spec;
    }

    @Override
    public Coder<Document> getDefaultOutputCoder() {
      return SerializableCoder.of(Document.class);
    }

    @Override
    public void validate() {
      spec.validate(null);
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      spec.populateDisplayData(builder);
    }

    @Override
    public BoundedReader<Document> createReader(PipelineOptions options) {
      return new BoundedMongoDbReader(this);
    }

    @Override
    public long getEstimatedSizeBytes(PipelineOptions pipelineOptions) {
      MongoClient mongoClient = new MongoClient(new MongoClientURI(spec.uri()));
      try {
        MongoDatabase mongoDatabase = mongoClient.getDatabase(spec.database());

        // get the Mongo collStats object
        // it gives the size for the entire collection
        BasicDBObject stat = new BasicDBObject();
        stat.append("collStats", spec.collection());
        Document stats = mongoDatabase.runCommand(stat);
        return stats.get("size", Number.class).longValue();
      } finally {
        // close the client to avoid leaking connections
        mongoClient.close();
      }
    }

    @Override
    public List<BoundedSource<Document>> split(long desiredBundleSizeBytes,
                                               PipelineOptions options) {
      MongoClient mongoClient = new MongoClient(new MongoClientURI(spec.uri()));
      try {
        MongoDatabase mongoDatabase = mongoClient.getDatabase(spec.database());
        List<Document> splitKeys;
        if (spec.numSplits() > 0) {
          // the user defined a desired number of splits:
          // derive the bundle size from the estimated collection size
          long estimatedSizeBytes = getEstimatedSizeBytes(options);
          desiredBundleSizeBytes = estimatedSizeBytes / spec.numSplits();
        }
        // the desired bundle size is small: use the default chunk size of 1 MB
        if (desiredBundleSizeBytes < 1024 * 1024) {
          desiredBundleSizeBytes = 1024 * 1024;
        }

        // now that we have the bundle size (provided by the user or by the runner),
        // we use the Mongo splitVector command to get the split keys
        BasicDBObject splitVectorCommand = new BasicDBObject();
        splitVectorCommand.append("splitVector", spec.database() + "." + spec.collection());
        splitVectorCommand.append("keyPattern", new BasicDBObject().append("_id", 1));
        splitVectorCommand.append("force", false);
        // maxChunkSize is the Mongo partition size in MB
        LOG.debug("Splitting in chunks of {} MB", desiredBundleSizeBytes / 1024 / 1024);
        splitVectorCommand.append("maxChunkSize", desiredBundleSizeBytes / 1024 / 1024);
        Document splitVectorCommandResult = mongoDatabase.runCommand(splitVectorCommand);
        splitKeys = (List<Document>) splitVectorCommandResult.get("splitKeys");

        List<BoundedSource<Document>> sources = new ArrayList<>();
        if (splitKeys.size() < 1) {
          LOG.debug("No split keys returned, using a single source");
          sources.add(this);
          return sources;
        }

        LOG.debug("Number of splits is {}", splitKeys.size());
        for (String shardFilter : splitKeysToFilters(splitKeys, spec.filter())) {
          sources.add(new BoundedMongoDbSource(spec.withFilter(shardFilter)));
        }

        return sources;
      } finally {
        // close the client to avoid leaking connections
        mongoClient.close();
      }
    }

    /**
     * Transform a list of split keys into a list of filters covering the corresponding ranges.
     *
     * <p>The list of split keys contains BSON Documents, for example:
     * <ul>
     *   <li>_id: 56</li>
     *   <li>_id: 109</li>
     *   <li>_id: 256</li>
     * </ul>
     *
     * <p>This method will generate a list of range filters performing the following splits:
     * <ul>
     *   <li>from the beginning of the collection up to _id 56, i.e. data with _id lower than
     *   or equal to 56</li>
     *   <li>from _id 57 up to _id 109</li>
     *   <li>from _id 110 up to _id 256</li>
     *   <li>from _id 257 up to the end of the collection, i.e. data with _id greater
     *   than 256</li>
     * </ul>
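     *
     * <p>Concretely (an illustrative sketch assuming the split keys above and no additional
     * user filter), the generated range filters look like:
     *
     * <pre>{@code
     * { $and: [ {"_id":{$lte:ObjectId("56")}} ]}
     * { $and: [ {"_id":{$gt:ObjectId("56"),$lte:ObjectId("109")}} ]}
     * { $and: [ {"_id":{$gt:ObjectId("109"),$lte:ObjectId("256")}} ]}
     * { $and: [ {"_id":{$gt:ObjectId("256")}} ]}
     * }</pre>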
     *
     * @param splitKeys The list of split keys.
     * @param additionalFilter A custom (user) additional filter to append to the range filters.
     * @return A list of filters containing the ranges.
     */
    @VisibleForTesting
    static List<String> splitKeysToFilters(List<Document> splitKeys, String additionalFilter) {
      ArrayList<String> filters = new ArrayList<>();
      String lowestBound = null; // lower boundary (previous split in the iteration)
      for (int i = 0; i < splitKeys.size(); i++) {
        String splitKey = splitKeys.get(i).get("_id").toString();
        String rangeFilter;
        if (i == 0) {
          // this is the first split in the list: the filter defines
          // the range from the beginning up to this split
          rangeFilter = String.format("{ $and: [ {\"_id\":{$lte:ObjectId(\"%s\")}}",
              splitKey);
          filters.add(formatFilter(rangeFilter, additionalFilter));
        } else if (i == splitKeys.size() - 1) {
          // this is the last split in the list: the filters define
          // the range from the previous split to the current split, and also
          // from the current split to the end
          rangeFilter = String.format("{ $and: [ {\"_id\":{$gt:ObjectId(\"%s\"),"
              + "$lte:ObjectId(\"%s\")}}", lowestBound, splitKey);
          filters.add(formatFilter(rangeFilter, additionalFilter));
          rangeFilter = String.format("{ $and: [ {\"_id\":{$gt:ObjectId(\"%s\")}}",
              splitKey);
          filters.add(formatFilter(rangeFilter, additionalFilter));
        } else {
          // we are between two splits
          rangeFilter = String.format("{ $and: [ {\"_id\":{$gt:ObjectId(\"%s\"),"
              + "$lte:ObjectId(\"%s\")}}", lowestBound, splitKey);
          filters.add(formatFilter(rangeFilter, additionalFilter));
        }
        lowestBound = splitKey;
      }
      return filters;
    }

    /**
     * Cleanly format a range filter, optionally appending the user's filter if specified.
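     *
     * <p>For example (illustrative; the additional filter is arbitrary user-supplied JSON):
     *
     * <pre>
     * formatFilter("{ $and: [ {\"_id\":{$gt:ObjectId(\"56\")}}", "{\"count\": 10}")
     *   returns { $and: [ {"_id":{$gt:ObjectId("56")}},{"count": 10} ]}
     * </pre>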
     *
     * @param filter The range filter.
     * @param additionalFilter The user's filter. Null if unspecified.
     * @return The cleanly formatted range filter.
     */
    private static String formatFilter(String filter, @Nullable String additionalFilter) {
      if (additionalFilter != null && !additionalFilter.isEmpty()) {
        // the user provided a filter: append it to the range filter
        return String.format("%s,%s ]}", filter, additionalFilter);
      } else {
        // the user didn't provide a filter: just cleanly close the range filter
        return String.format("%s ]}", filter);
      }
    }
  }

  private static class BoundedMongoDbReader extends BoundedSource.BoundedReader<Document> {
    private final BoundedMongoDbSource source;

    private MongoClient client;
    private MongoCursor<Document> cursor;
    private Document current;

    public BoundedMongoDbReader(BoundedMongoDbSource source) {
      this.source = source;
    }

    @Override
    public boolean start() {
      Read spec = source.spec;
      client = new MongoClient(new MongoClientURI(spec.uri()));
      MongoDatabase mongoDatabase = client.getDatabase(spec.database());
      MongoCollection<Document> mongoCollection = mongoDatabase.getCollection(spec.collection());
      if (spec.filter() == null) {
        cursor = mongoCollection.find().iterator();
      } else {
        Document bson = Document.parse(spec.filter());
        cursor = mongoCollection.find(bson).iterator();
      }
      return advance();
    }

    @Override
    public boolean advance() {
      if (cursor.hasNext()) {
        current = cursor.next();
        return true;
      } else {
        return false;
      }
    }

    @Override
    public BoundedMongoDbSource getCurrentSource() {
      return source;
    }

    @Override
    public Document getCurrent() {
      return current;
    }

    @Override
    public void close() {
      try {
        if (cursor != null) {
          cursor.close();
        }
      } catch (Exception e) {
        LOG.warn("Error closing MongoDB cursor", e);
      }
      try {
        client.close();
      } catch (Exception e) {
        LOG.warn("Error closing MongoDB client", e);
      }
    }
  }

  /**
   * A {@link PTransform} to write to a MongoDB database.
   */
  @AutoValue
  public abstract static class Write extends PTransform<PCollection<Document>, PDone> {
    @Nullable abstract String uri();
    @Nullable abstract String database();
    @Nullable abstract String collection();
    abstract long batchSize();
    abstract Builder toBuilder();

    @AutoValue.Builder
    abstract static class Builder {
      abstract Builder setUri(String uri);
      abstract Builder setDatabase(String database);
      abstract Builder setCollection(String collection);
      abstract Builder setBatchSize(long batchSize);
      abstract Write build();
    }

    /** Define the connection URI of the MongoDB instance(s) to write to. */
    public Write withUri(String uri) {
      return toBuilder().setUri(uri).build();
    }

    /** Define the name of the database to write to. */
    public Write withDatabase(String database) {
      return toBuilder().setDatabase(database).build();
    }

    /** Define the name of the collection to write to. */
    public Write withCollection(String collection) {
      return toBuilder().setCollection(collection).build();
    }

    /** Define the number of documents buffered before each bulk insert. */
    public Write withBatchSize(long batchSize) {
      return toBuilder().setBatchSize(batchSize).build();
    }

    @Override
    public PDone expand(PCollection<Document> input) {
      input.apply(ParDo.of(new WriteFn(this)));
      return PDone.in(input.getPipeline());
    }

    @Override
    public void validate(PipelineOptions options) {
      checkNotNull(uri(), "uri");
      checkNotNull(database(), "database");
      checkNotNull(collection(), "collection");
      checkArgument(batchSize() > 0, "batchSize must be greater than 0, but was %s",
          batchSize());
    }

    private static class WriteFn extends DoFn<Document, Void> {
      private final Write spec;
      private transient MongoClient client;
      private List<Document> batch;

      public WriteFn(Write spec) {
        this.spec = spec;
      }

      @Setup
      public void createMongoClient() throws Exception {
        client = new MongoClient(new MongoClientURI(spec.uri()));
      }

      @StartBundle
      public void startBundle() throws Exception {
        batch = new ArrayList<>();
      }
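
      // Elements are buffered in memory and written with insertMany() once the buffer
      // reaches the configured batch size; finishBundle() flushes any remainder.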
      @ProcessElement
      public void processElement(ProcessContext ctx) throws Exception {
        // Need to copy the document because mongoCollection.insertMany() will mutate it
        // before inserting (will assign an id).
        batch.add(new Document(ctx.element()));
        if (batch.size() >= spec.batchSize()) {
          flush();
        }
      }

      @FinishBundle
      public void finishBundle() throws Exception {
        flush();
      }

      private void flush() {
        if (batch.isEmpty()) {
          // insertMany() rejects an empty list, so skip the write when nothing is buffered
          // (e.g. an empty bundle)
          return;
        }
        MongoDatabase mongoDatabase = client.getDatabase(spec.database());
        MongoCollection<Document> mongoCollection = mongoDatabase.getCollection(spec.collection());
        mongoCollection.insertMany(batch);
        batch.clear();
      }

      @Teardown
      public void closeMongoClient() throws Exception {
        client.close();
        client = null;
      }
    }
  }
}