/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.crunch;

import java.net.URI;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PGroupedTable;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Target;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.avro.Avros;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.AbstractRefinableView;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.DataModelUtil;
import org.kitesdk.data.spi.EntityAccessor;
import org.kitesdk.data.spi.PartitionStrategyParser;
import org.kitesdk.data.spi.SchemaUtil;
import org.kitesdk.data.spi.StorageKey;

/**
 * <p>
 * A helper class for exposing {@link Dataset}s and {@link View}s as Crunch
 * {@link ReadableSource}s or {@link Target}s.
 * </p>
 */
public class CrunchDatasets {

  /**
   * Expose the given {@link View} as a Crunch {@link ReadableSource}.
   *
   * @param view the view to read from
   * @param <E> the type of entity produced by the source
   * @return a {@link ReadableSource} for the view
   *
   * @since 0.14.0
   */
  public static <E> ReadableSource<E> asSource(View<E> view) {
    return new DatasetSourceTarget<E>(view);
  }

  /**
   * Expose the {@link View} or {@link Dataset} represented by the URI
   * as a Crunch {@link ReadableSource}.
   *
   * @param uri the URI of the view or dataset to read from
   * @param type the Java type of the entities in the dataset
   * @param <E> the type of entity produced by the source
   * @return a {@link ReadableSource} for the view
   *
   * @since 0.15.0
   */
  public static <E> ReadableSource<E> asSource(URI uri, Class<E> type) {
    return new DatasetSourceTarget<E>(uri, type);
  }

  /**
   * Expose the {@link View} or {@link Dataset} represented by the URI
   * as a Crunch {@link ReadableSource}.
   *
   * @param uri the URI of the view or dataset to read from
   * @param type the Java type of the entities in the dataset
   * @param <E> the type of entity produced by the source
   * @return a {@link ReadableSource} for the view
   *
   * @since 0.15.0
   */
  public static <E> ReadableSource<E> asSource(String uri, Class<E> type) {
    return asSource(URI.create(uri), type);
  }

  /**
   * Expose the given {@link View} as a Crunch {@link Target}.
   *
   * @param view the view to write to
   * @param <E> the type of entity stored in the view
   * @return a {@link Target} for the view
   *
   * @since 0.14.0
   */
  public static <E> Target asTarget(View<E> view) {
    return new DatasetTarget<E>(view);
  }
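  /*
   * Usage sketch (hypothetical names, for illustration only): reading a
   * dataset as a source and writing a collection back out through a target
   * in a Crunch pipeline. The URIs, the MyTool class, and the choice of
   * GenericRecord are assumptions, not part of this class.
   *
   *   Pipeline pipeline = new MRPipeline(MyTool.class);
   *   PCollection<GenericRecord> events = pipeline.read(
   *       CrunchDatasets.asSource("dataset:hdfs:/data/events", GenericRecord.class));
   *   pipeline.write(events,
   *       CrunchDatasets.asTarget("dataset:hdfs:/data/events_copy"));
   */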
  /**
   * Expose the {@link Dataset} or {@link View} represented by the given
   * URI as a Crunch {@link Target}.
   *
   * @param uri the dataset or view URI
   * @return a {@link Target} for the dataset or view
   *
   * @since 0.15.0
   */
  public static Target asTarget(String uri) {
    return asTarget(URI.create(uri));
  }

  /**
   * Expose the {@link Dataset} or {@link View} represented by the given
   * URI as a Crunch {@link Target}.
   *
   * @param uri the dataset or view URI
   * @return a {@link Target} for the dataset or view
   *
   * @since 0.15.0
   */
  public static Target asTarget(URI uri) {
    return new DatasetTarget<Object>(uri);
  }

  /**
   * Partitions {@code collection} to be stored efficiently in {@code view}.
   * <p>
   * This restructures the parallel collection so that all of the entities that
   * will be stored in a given partition will be processed by the same writer.
   *
   * @param collection a collection of entities
   * @param view a {@link View} of a dataset to partition the collection for
   * @param <E> the type of entities in the collection and underlying dataset
   * @return an equivalent collection of entities partitioned for the view
   *
   * @since 0.16.0
   */
  public static <E> PCollection<E> partition(PCollection<E> collection,
                                             View<E> view) {
    // pass the view itself, not view.getDataset(), so that any constraints
    // on the view are preserved when deriving partition keys
    return partition(collection, view, -1);
  }

  /**
   * Partitions {@code collection} to be stored efficiently in {@code dataset}.
   * <p>
   * This restructures the parallel collection so that all of the entities that
   * will be stored in a given partition will be processed by the same writer.
   *
   * @param collection a collection of entities
   * @param dataset a dataset to partition the collection for
   * @param <E> the type of entities in the collection and underlying dataset
   * @return an equivalent collection of entities partitioned for the dataset
   *
   * @since 0.16.0
   */
  public static <E> PCollection<E> partition(PCollection<E> collection,
                                             Dataset<E> dataset) {
    return partition(collection, dataset, -1);
  }

  /**
   * Partitions {@code collection} to be stored efficiently in {@code view}.
   * <p>
   * This restructures the parallel collection so that all of the entities that
   * will be stored in a given partition will be processed by the same writer.
   * <p>
   * If the dataset is not partitioned, then this will structure all of the
   * entities to produce a number of files equal to {@code numWriters}.
   *
   * @param collection a collection of entities
   * @param view a {@link View} of a dataset to partition the collection for
   * @param numWriters the number of writers that should be used
   * @param <E> the type of entities in the collection and underlying dataset
   * @return an equivalent collection of entities partitioned for the view
   * @see #partition(PCollection, View)
   *
   * @since 0.16.0
   */
  public static <E> PCollection<E> partition(PCollection<E> collection,
                                             View<E> view, int numWriters) {
    return partition(collection, view, numWriters, 1);
  }
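  /*
   * Usage sketch (hypothetical names): repartitioning a collection to match a
   * partitioned dataset's layout before writing, so that each dataset
   * partition is produced by a single writer. The "events" collection and the
   * dataset URI are assumptions for illustration.
   *
   *   Dataset<GenericRecord> dataset = Datasets.load(
   *       "dataset:hdfs:/data/events", GenericRecord.class);
   *   PCollection<GenericRecord> partitioned =
   *       CrunchDatasets.partition(events, dataset);
   *   pipeline.write(partitioned, CrunchDatasets.asTarget(dataset));
   */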
  /**
   * Partitions {@code collection} to be stored efficiently in {@code view}.
   * <p>
   * This restructures the parallel collection so that all of the entities that
   * will be stored in a given partition will be evenly distributed across the
   * specified {@code numPartitionWriters}.
   * <p>
   * If the dataset is not partitioned, then this will structure all of the
   * entities to produce a number of files equal to {@code numWriters}.
   *
   * @param collection a collection of entities
   * @param view a {@link View} of a dataset to partition the collection for
   * @param numWriters the number of writers that should be used
   * @param numPartitionWriters the number of writers that data for a single
   *          partition will be distributed across
   * @param <E> the type of entities in the collection and underlying dataset
   * @return an equivalent collection of entities partitioned for the view
   * @see #partition(PCollection, View)
   *
   * @since 1.1.0
   */
  public static <E> PCollection<E> partition(PCollection<E> collection,
                                             View<E> view, int numWriters,
                                             int numPartitionWriters) {
    // ensure the number of writers is honored, whether it is per partition
    // or total
    DatasetDescriptor descriptor = view.getDataset().getDescriptor();
    if (descriptor.isPartitioned()) {
      GetStorageKey<E> getKey = new GetStorageKey<E>(view, numPartitionWriters);
      // key each entity by (storage key, round-robin marker) and group, so
      // entities for one partition land on the same (or a bounded set of)
      // writers
      PTable<Pair<GenericData.Record, Integer>, E> table = collection
          .by(getKey, Avros.pairs(Avros.generics(getKey.schema()), Avros.ints()));
      PGroupedTable<Pair<GenericData.Record, Integer>, E> grouped =
          numWriters > 0 ? table.groupByKey(numWriters) : table.groupByKey();
      return grouped.ungroup().values();
    } else {
      return partition(collection, numWriters);
    }
  }

  private static <E> PCollection<E> partition(PCollection<E> collection,
                                              int numReducers) {
    PType<E> type = collection.getPType();
    PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
    PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
    PGroupedTable<E, Void> grouped = numReducers > 0 ?
        table.groupByKey(numReducers) : table.groupByKey();
    return grouped.ungroup().keys();
  }

  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="SE_NO_SERIALVERSIONID",
      justification="Purposely not supported across versions")
  private static class AsKeyTable<E> extends DoFn<E, Pair<E, Void>> {
    @Override
    public void process(E entity, Emitter<Pair<E, Void>> emitter) {
      emitter.emit(Pair.of(entity, (Void) null));
    }
  }
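  /*
   * Usage sketch (hypothetical values): bounding the total number of writers
   * while spreading each partition's records over more than one writer.
   * GetStorageKey (below) pairs each entity's partition key with a
   * round-robin marker in [0, numPartitionWriters), so grouping by
   * (key, marker) splits a hot partition across that many writers and files.
   *
   *   PCollection<GenericRecord> partitioned =
   *       CrunchDatasets.partition(events, view, 10, 2);
   */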
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value={"SE_NO_SERIALVERSIONID","SE_TRANSIENT_FIELD_NOT_RESTORED"},
      justification="Purposely not supported across versions, fields properly initialized")
  private static class GetStorageKey<E>
      extends MapFn<E, Pair<GenericData.Record, Integer>> {
    private final String strategyString;
    private final String schemaString;
    private final Class<E> type;
    private final Map<String, String> constraints;
    private final int numPartitionWriters;
    private transient AvroStorageKey key = null;
    private transient EntityAccessor<E> accessor = null;
    private transient Map<String, Object> provided = null;
    private transient int count;

    private GetStorageKey(View<E> view, int numPartitionWriters) {
      DatasetDescriptor descriptor = view.getDataset().getDescriptor();
      // get serializable versions of transient objects
      this.strategyString = descriptor.getPartitionStrategy()
          .toString(false /* no white space */);
      this.schemaString = descriptor.getSchema()
          .toString(false /* no white space */);
      this.type = view.getType();
      if (view instanceof AbstractRefinableView) {
        this.constraints = ((AbstractRefinableView) view).getConstraints()
            .toQueryMap();
      } else {
        this.constraints = null;
      }
      this.numPartitionWriters = numPartitionWriters > 0 ? numPartitionWriters : 1;
    }

    public Schema schema() {
      initialize(); // make sure the key is not null
      return key.getSchema();
    }

    @Override
    public void initialize() {
      if (key == null) {
        // restore transient objects from serializable versions
        PartitionStrategy strategy = PartitionStrategyParser.parse(strategyString);
        Schema schema = new Schema.Parser().parse(schemaString);
        this.key = new AvroStorageKey(strategy, schema);
        this.accessor = DataModelUtil.accessor(type, schema);
        if (constraints != null) {
          this.provided = Constraints
              .fromQueryMap(schema, strategy, constraints)
              .getProvidedValues();
        }
      }
      count = 0;
    }

    @Override
    public Pair<GenericData.Record, Integer> map(E entity) {
      // pair the entity's storage key with a round-robin marker so one
      // partition's entities can be spread across multiple writers
      int marker = count % numPartitionWriters;
      count += 1;
      return Pair.<GenericData.Record, Integer>of(
          key.reuseFor(entity, provided, accessor), marker);
    }
  }

  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="EQ_DOESNT_OVERRIDE_EQUALS",
      justification="StorageKey equals is correct, compares the values")
  private static class AvroStorageKey extends GenericData.Record {
    private final StorageKey key;

    private AvroStorageKey(PartitionStrategy strategy, Schema schema) {
      super(SchemaUtil.keySchema(schema, strategy));
      this.key = new StorageKey(strategy);
    }

    public <E> AvroStorageKey reuseFor(E entity,
                                       @Nullable Map<String, Object> provided,
                                       EntityAccessor<E> accessor) {
      key.reuseFor(entity, provided, accessor);
      return this;
    }

    @Override
    public void put(int i, Object v) {
      key.replace(i, v);
    }

    @Override
    public Object get(int i) {
      return key.get(i);
    }
  }
}