/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.mongodb;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.auto.value.AutoValue;
import com.mongodb.DB;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoURI;
import com.mongodb.gridfs.GridFS;
import com.mongodb.gridfs.GridFSDBFile;
import com.mongodb.gridfs.GridFSInputFile;
import com.mongodb.util.JSON;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.SerializableCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.bson.types.ObjectId;
import org.joda.time.Duration;
import org.joda.time.Instant;

/**
 * IO to read and write data on MongoDB GridFS.
 *
 * <h3>Reading from MongoDB via GridFS</h3>
 *
 * <p>MongoDbGridFSIO source returns a bounded collection of objects as {@code PCollection<T>}.
 *
 * <p>To configure the MongoDB GridFS source, you can provide the connection URI, the database
 * name and the bucket name. If unspecified, the default values from the GridFS driver are used.
 *
 * <p>The following example illustrates various options for configuring the source:
 *
 * <pre>{@code
 * pipeline.apply(MongoDbGridFSIO.<String>read()
 *   .withUri("mongodb://localhost:27017")
 *   .withDatabase("my-database")
 *   .withBucket("my-bucket"))
 * }</pre>
 *
 * <p>The source also accepts an optional configuration: {@code withFilter()} allows you to
 * define a JSON filter to get a subset of the files in the database.
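 *
 * <p>For example (a sketch; the filter string is a JSON query document over the GridFS
 * files collection, and the filename value is a placeholder):
 *
 * <pre>{@code
 * pipeline.apply(MongoDbGridFSIO.<String>read()
 *   .withUri("mongodb://localhost:27017")
 *   .withFilter("{ \"filename\": \"my-file.txt\" }"))
 * }</pre>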
 *
 * <p>There is also an optional {@code Parser} (and associated {@code Coder}) that can be
 * specified to parse the file's InputStream into objects usable with Beam. By default,
 * MongoDbGridFSIO parses into Strings, splitting on line breaks and using the uploadDate of
 * the file as the timestamp. When using a parser that outputs elements with custom timestamps,
 * you may also need to specify the allowed timestamp skew via {@code withSkew()}.
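 *
 * <p>A custom parser might be wired in as follows (a sketch; {@code MyObject},
 * {@code MyObjectParser} and {@code MyObjectCoder} are hypothetical user-supplied
 * implementations of {@code Parser<MyObject>} and {@code Coder<MyObject>}):
 *
 * <pre>{@code
 * pipeline.apply(MongoDbGridFSIO.read()
 *   .withUri("mongodb://localhost:27017")
 *   .withParser(new MyObjectParser())
 *   .withCoder(MyObjectCoder.of())
 *   .withSkew(Duration.standardMinutes(5)))
 * }</pre>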
 *
 * <h3>Writing to MongoDB via GridFS</h3>
 *
 * <p>MongoDbGridFSIO supports writing data to a file in a MongoDB GridFS collection.
 *
 * <p>To configure a MongoDB GridFS sink, you can provide the connection URI, the database name
 * and the bucket name. You must also provide the filename to write to. Another optional
 * parameter is the GridFS file chunkSize.
 *
 * <p>For instance:
 *
 * <pre>{@code
 * pipeline
 *   .apply(...)
 *   .apply(MongoDbGridFSIO.write()
 *     .withUri("mongodb://localhost:27017")
 *     .withDatabase("my-database")
 *     .withBucket("my-bucket")
 *     .withChunkSize(256000L)
 *     .withFilename("my-output.txt"))
 * }</pre>
 *
 * <p>There is also a variant of the {@code write()} method that takes a {@code WriteFn} used
 * to write the data to the OutputStream. By default, UTF-8 strings are written to the file,
 * separated by line feeds.
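 *
 * <p>For example (a sketch; {@code MyObject} is a hypothetical element type, and the body
 * mirrors the default String writer):
 *
 * <pre>{@code
 * pipeline
 *   .apply(...)
 *   .apply(MongoDbGridFSIO.write(new MongoDbGridFSIO.WriteFn<MyObject>() {
 *       @Override
 *       public void write(MyObject output, OutputStream outStream) throws IOException {
 *         outStream.write(output.toString().getBytes("utf-8"));
 *         outStream.write('\n');
 *       }
 *     })
 *     .withUri("mongodb://localhost:27017")
 *     .withFilename("my-output.txt"))
 * }</pre>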
 */
@Experimental
public class MongoDbGridFSIO {

  /**
   * Callback for the parser to use to submit data.
   */
  public interface ParserCallback<T> extends Serializable {
    /**
     * Output the object. The default timestamp will be the GridFSDBFile
     * creation timestamp.
     */
    void output(T output);

    /**
     * Output the object using the specified timestamp.
     */
    void output(T output, Instant timestamp);
  }

  /**
   * Interface for the parser that is used to parse the GridFSDBFile into
   * the appropriate types.
   */
  public interface Parser<T> extends Serializable {
    void parse(GridFSDBFile input, ParserCallback<T> callback) throws IOException;
  }

  /**
   * For the default {@code Read<String>} case, this is the parser that is used to
   * split the input file into Strings. It uses the timestamp of the file
   * for the event timestamp.
   */
  private static final Parser<String> TEXT_PARSER = new Parser<String>() {
    @Override
    public void parse(GridFSDBFile input, ParserCallback<String> callback)
        throws IOException {
      final Instant time = new Instant(input.getUploadDate().getTime());
      try (BufferedReader reader =
          new BufferedReader(new InputStreamReader(input.getInputStream()))) {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
          callback.output(line, time);
        }
      }
    }
  };

  /** Read data from GridFS. Default behavior with String. */
  public static Read<String> read() {
    return new AutoValue_MongoDbGridFSIO_Read.Builder<String>()
        .setParser(TEXT_PARSER)
        .setCoder(StringUtf8Coder.of())
        .setConnectionConfiguration(ConnectionConfiguration.create())
        .setSkew(Duration.ZERO)
        .build();
  }

  /** Write data to GridFS. Default behavior with String. */
  public static Write<String> write() {
    return new AutoValue_MongoDbGridFSIO_Write.Builder<String>()
        .setConnectionConfiguration(ConnectionConfiguration.create())
        .setWriteFn(new WriteFn<String>() {
          @Override
          public void write(String output, OutputStream outStream) throws IOException {
            outStream.write(output.getBytes("utf-8"));
            outStream.write('\n');
          }
        })
        .build();
  }

  public static <T> Write<T> write(WriteFn<T> fn) {
    return new AutoValue_MongoDbGridFSIO_Write.Builder<T>()
        .setWriteFn(fn)
        .setConnectionConfiguration(ConnectionConfiguration.create())
        .build();
  }

  /**
   * Encapsulates the MongoDB GridFS connection logic.
   */
  @AutoValue
  public abstract static class ConnectionConfiguration implements Serializable {

    @Nullable abstract String uri();
    @Nullable abstract String database();
    @Nullable abstract String bucket();

    static ConnectionConfiguration create() {
      return new AutoValue_MongoDbGridFSIO_ConnectionConfiguration(null, null, null);
    }

    static ConnectionConfiguration create(String uri, String database, String bucket) {
      return new AutoValue_MongoDbGridFSIO_ConnectionConfiguration(uri, database, bucket);
    }

    Mongo setupMongo() {
      return uri() == null ? new Mongo() : new Mongo(new MongoURI(uri()));
    }

    GridFS setupGridFS(Mongo mongo) {
      DB db = database() == null ? mongo.getDB("gridfs") : mongo.getDB(database());
      return bucket() == null ? new GridFS(db) : new GridFS(db, bucket());
    }
  }

  /**
   * A {@link PTransform} to read data from MongoDB GridFS.
   */
  @AutoValue
  public abstract static class Read<T> extends PTransform<PBegin, PCollection<T>> {

    abstract ConnectionConfiguration connectionConfiguration();
    @Nullable abstract Parser<T> parser();
    @Nullable abstract Coder<T> coder();
    @Nullable abstract Duration skew();
    @Nullable abstract String filter();

    abstract Builder<T> toBuilder();

    @AutoValue.Builder
    abstract static class Builder<T> {
      abstract Builder<T> setConnectionConfiguration(ConnectionConfiguration connection);
      abstract Builder<T> setParser(Parser<T> parser);
      abstract Builder<T> setCoder(Coder<T> coder);
      abstract Builder<T> setSkew(Duration skew);
      abstract Builder<T> setFilter(String filter);
      abstract Read<T> build();
    }

    public Read<T> withUri(String uri) {
      checkNotNull(uri);
      ConnectionConfiguration config = ConnectionConfiguration
          .create(uri, connectionConfiguration().database(), connectionConfiguration().bucket());
      return toBuilder().setConnectionConfiguration(config).build();
    }

    public Read<T> withDatabase(String database) {
      checkNotNull(database);
      ConnectionConfiguration config = ConnectionConfiguration
          .create(connectionConfiguration().uri(), database, connectionConfiguration().bucket());
      return toBuilder().setConnectionConfiguration(config).build();
    }

    public Read<T> withBucket(String bucket) {
      checkNotNull(bucket);
      ConnectionConfiguration config = ConnectionConfiguration
          .create(connectionConfiguration().uri(), connectionConfiguration().database(), bucket);
      return toBuilder().setConnectionConfiguration(config).build();
    }

    public <X> Read<X> withParser(Parser<X> parser) {
      checkNotNull(parser);
      @SuppressWarnings("unchecked")
      Builder<X> builder = (Builder<X>) toBuilder();
      return builder.setParser(parser).setCoder(null).build();
    }

    public Read<T> withCoder(Coder<T> coder) {
      checkNotNull(coder);
      return toBuilder().setCoder(coder).build();
    }

    public Read<T> withSkew(Duration skew) {
      return toBuilder().setSkew(skew == null ? Duration.ZERO : skew).build();
    }

    public Read<T> withFilter(String filter) {
      return toBuilder().setFilter(filter).build();
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      super.populateDisplayData(builder);
      builder.addIfNotNull(DisplayData.item("uri", connectionConfiguration().uri()));
      builder.addIfNotNull(DisplayData.item("database", connectionConfiguration().database()));
      builder.addIfNotNull(DisplayData.item("bucket", connectionConfiguration().bucket()));
      builder.addIfNotNull(DisplayData.item("parser", parser().getClass().getName()));
      // The coder may have been cleared by withParser(), so guard against an NPE here.
      if (coder() != null) {
        builder.addIfNotNull(DisplayData.item("coder", coder().getClass().getName()));
      }
      builder.addIfNotNull(DisplayData.item("skew", skew()));
      builder.addIfNotNull(DisplayData.item("filter", filter()));
    }

    @Override
    public PCollection<T> expand(PBegin input) {
      final BoundedGridFSSource source = new BoundedGridFSSource(this, null);
      org.apache.beam.sdk.io.Read.Bounded<ObjectId> objectIds =
          org.apache.beam.sdk.io.Read.from(source);
      PCollection<T> output = input.getPipeline().apply(objectIds)
          .apply(ParDo.of(new DoFn<ObjectId, T>() {
            Mongo mongo;
            GridFS gridfs;

            @Setup
            public void setup() {
              mongo = source.spec.connectionConfiguration().setupMongo();
              gridfs = source.spec.connectionConfiguration().setupGridFS(mongo);
            }

            @Teardown
            public void teardown() {
              mongo.close();
            }

            @ProcessElement
            public void processElement(final ProcessContext c) throws IOException {
              ObjectId oid = c.element();
              GridFSDBFile file = gridfs.find(oid);
              parser().parse(file, new ParserCallback<T>() {
                @Override
                public void output(T output, Instant timestamp) {
                  checkNotNull(timestamp);
                  c.outputWithTimestamp(output, timestamp);
                }

                @Override
                public void output(T output) {
                  c.output(output);
                }
              });
            }

            @Override
            public Duration getAllowedTimestampSkew() {
              return skew();
            }
          }));
      if (coder() != null) {
        output.setCoder(coder());
      }
      return output;
    }

    /**
     * A {@link BoundedSource} for MongoDB GridFS.
     */
    protected static class BoundedGridFSSource extends BoundedSource<ObjectId> {

      private Read<?> spec;
      @Nullable
      private List<ObjectId> objectIds;

      BoundedGridFSSource(Read<?> spec, List<ObjectId> objectIds) {
        this.spec = spec;
        this.objectIds = objectIds;
      }

      private DBCursor createCursor(GridFS gridfs) {
        if (spec.filter() != null) {
          DBObject query = (DBObject) JSON.parse(spec.filter());
          return gridfs.getFileList(query).sort(null);
        }
        return gridfs.getFileList().sort(null);
      }

      @Override
      public List<? extends BoundedSource<ObjectId>> split(
          long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
        Mongo mongo = spec.connectionConfiguration().setupMongo();
        try {
          GridFS gridfs = spec.connectionConfiguration().setupGridFS(mongo);
          DBCursor cursor = createCursor(gridfs);
          long size = 0;
          List<BoundedGridFSSource> list = new ArrayList<>();
          List<ObjectId> objects = new ArrayList<>();
          while (cursor.hasNext()) {
            GridFSDBFile file = (GridFSDBFile) cursor.next();
            long len = file.getLength();
            // Start a new bundle once the current one would exceed the desired size.
            if ((size + len) > desiredBundleSizeBytes && !objects.isEmpty()) {
              list.add(new BoundedGridFSSource(spec, objects));
              size = 0;
              objects = new ArrayList<>();
            }
            objects.add((ObjectId) file.getId());
            size += len;
          }
          if (!objects.isEmpty() || list.isEmpty()) {
            list.add(new BoundedGridFSSource(spec, objects));
          }
          return list;
        } finally {
          mongo.close();
        }
      }

      @Override
      public long getEstimatedSizeBytes(PipelineOptions options) throws Exception {
        Mongo mongo = spec.connectionConfiguration().setupMongo();
        try {
          GridFS gridfs = spec.connectionConfiguration().setupGridFS(mongo);
          DBCursor cursor = createCursor(gridfs);
          long size = 0;
          while (cursor.hasNext()) {
            GridFSDBFile file = (GridFSDBFile) cursor.next();
            size += file.getLength();
          }
          return size;
        } finally {
          mongo.close();
        }
      }

      @Override
      public BoundedSource.BoundedReader<ObjectId> createReader(PipelineOptions options)
          throws IOException {
        return new GridFSReader(this, objectIds);
      }

      @Override
      public void validate() {
      }

      @Override
      public void populateDisplayData(DisplayData.Builder builder) {
        spec.populateDisplayData(builder);
      }

      @Override
      public Coder<ObjectId> getDefaultOutputCoder() {
        return SerializableCoder.of(ObjectId.class);
      }

      static class GridFSReader extends BoundedSource.BoundedReader<ObjectId> {

        final BoundedGridFSSource source;

        /* When split into bundles, this records the ObjectIds of the files for
         * this bundle. Otherwise, this is null. When null, a DBCursor of the
         * files is used directly to avoid having the ObjectIds queried and
         * loaded ahead of time, saving time and memory. */
        @Nullable
        final List<ObjectId> objects;

        Mongo mongo;
        DBCursor cursor;
        Iterator<ObjectId> iterator;
        ObjectId current;

        GridFSReader(BoundedGridFSSource source, List<ObjectId> objects) {
          this.source = source;
          this.objects = objects;
        }

        @Override
        public BoundedSource<ObjectId> getCurrentSource() {
          return source;
        }

        @Override
        public boolean start() throws IOException {
          if (objects == null) {
            mongo = source.spec.connectionConfiguration().setupMongo();
            GridFS gridfs = source.spec.connectionConfiguration().setupGridFS(mongo);
            cursor = source.createCursor(gridfs);
          } else {
            iterator = objects.iterator();
          }
          return advance();
        }

        @Override
        public boolean advance() throws IOException {
          if (iterator != null && iterator.hasNext()) {
            current = iterator.next();
            return true;
          } else if (cursor != null && cursor.hasNext()) {
            GridFSDBFile file = (GridFSDBFile) cursor.next();
            current = (ObjectId) file.getId();
            return true;
          }
          current = null;
          return false;
        }

        @Override
        public ObjectId getCurrent() throws NoSuchElementException {
          if (current == null) {
            throw new NoSuchElementException();
          }
          return current;
        }

        @Override
        public Instant getCurrentTimestamp() throws NoSuchElementException {
          if (current == null) {
            throw new NoSuchElementException();
          }
          // The ObjectId encodes its creation time in seconds; convert to milliseconds.
          long time = current.getTimestamp();
          time *= 1000L;
          return new Instant(time);
        }

        @Override
        public void close() throws IOException {
          if (mongo != null) {
            mongo.close();
          }
        }
      }
    }
  }

  /**
   * Function that is called to write the data to the given GridFS OutputStream.
   */
  public interface WriteFn<T> extends Serializable {
    /**
     * Output the object to the given OutputStream.
     *
     * @param output The data to output
     * @param outStream The OutputStream
     */
    void write(T output, OutputStream outStream) throws IOException;
  }

  /**
   * A {@link PTransform} to write data to MongoDB GridFS.
   */
  @AutoValue
  public abstract static class Write<T> extends PTransform<PCollection<T>, PDone> {

    abstract ConnectionConfiguration connectionConfiguration();
    @Nullable abstract Long chunkSize();
    abstract WriteFn<T> writeFn();
    @Nullable abstract String filename();

    abstract Builder<T> toBuilder();

    @AutoValue.Builder
    abstract static class Builder<T> {
      abstract Builder<T> setConnectionConfiguration(ConnectionConfiguration connection);
      abstract Builder<T> setFilename(String filename);
      abstract Builder<T> setChunkSize(Long chunkSize);
      abstract Builder<T> setWriteFn(WriteFn<T> fn);
      abstract Write<T> build();
    }

    public Write<T> withUri(String uri) {
      checkNotNull(uri);
      ConnectionConfiguration config = ConnectionConfiguration
          .create(uri, connectionConfiguration().database(), connectionConfiguration().bucket());
      return toBuilder().setConnectionConfiguration(config).build();
    }

    public Write<T> withDatabase(String database) {
      checkNotNull(database);
      ConnectionConfiguration config = ConnectionConfiguration
          .create(connectionConfiguration().uri(), database, connectionConfiguration().bucket());
      return toBuilder().setConnectionConfiguration(config).build();
    }

    public Write<T> withBucket(String bucket) {
      checkNotNull(bucket);
      ConnectionConfiguration config = ConnectionConfiguration
          .create(connectionConfiguration().uri(), connectionConfiguration().database(), bucket);
      return toBuilder().setConnectionConfiguration(config).build();
    }

    public Write<T> withFilename(String filename) {
      checkNotNull(filename);
      return toBuilder().setFilename(filename).build();
    }

    public Write<T> withChunkSize(Long chunkSize) {
      checkNotNull(chunkSize);
      checkArgument(chunkSize > 1, "Chunk size must be greater than 1, but was %s", chunkSize);
      return toBuilder().setChunkSize(chunkSize).build();
    }

    public void validate(PCollection<T> input) {
      checkNotNull(filename(), "filename");
      checkNotNull(writeFn(), "writeFn");
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      super.populateDisplayData(builder);
      builder.addIfNotNull(DisplayData.item("uri", connectionConfiguration().uri()));
      builder.addIfNotNull(DisplayData.item("database", connectionConfiguration().database()));
      builder.addIfNotNull(DisplayData.item("bucket", connectionConfiguration().bucket()));
      builder.addIfNotNull(DisplayData.item("chunkSize", chunkSize()));
      builder.addIfNotNull(DisplayData.item("filename", filename()));
    }

    @Override
    public PDone expand(PCollection<T> input) {
      input.apply(ParDo.of(new GridFsWriteFn<T>(this)));
      return PDone.in(input.getPipeline());
    }
  }

  private static class GridFsWriteFn<T> extends DoFn<T, Void> {

    private final Write<T> spec;

    private transient Mongo mongo;
    private transient GridFS gridfs;
    private transient GridFSInputFile gridFsFile;
    private transient OutputStream outputStream;

    public GridFsWriteFn(Write<T> spec) {
      this.spec = spec;
    }

    @Setup
    public void setup() throws Exception {
      mongo = spec.connectionConfiguration().setupMongo();
      gridfs = spec.connectionConfiguration().setupGridFS(mongo);
    }

    @StartBundle
    public void startBundle() {
      gridFsFile = gridfs.createFile(spec.filename());
      if (spec.chunkSize() != null) {
        gridFsFile.setChunkSize(spec.chunkSize());
      }
      outputStream = gridFsFile.getOutputStream();
    }

    @ProcessElement
    public void processElement(ProcessContext context) throws Exception {
      T record = context.element();
      spec.writeFn().write(record, outputStream);
    }

    @FinishBundle
    public void finishBundle() throws Exception {
      if (gridFsFile != null) {
        outputStream.flush();
        outputStream.close();
        outputStream = null;
        gridFsFile = null;
      }
    }

    @Teardown
    public void teardown() throws Exception {
      try {
        if (gridFsFile != null) {
          outputStream.flush();
          outputStream.close();
          outputStream = null;
          gridFsFile = null;
        }
      } finally {
        if (mongo != null) {
          mongo.close();
          mongo = null;
          gridfs = null;
        }
      }
    }
  }
}