// Copyright 2011 Google Inc. All Rights Reserved.
package com.google.appengine.tools.mapreduce.inputs;
import static com.google.appengine.api.datastore.FetchOptions.Builder.withLimit;
import com.google.appengine.api.datastore.DatastoreService;
import com.google.appengine.api.datastore.DatastoreServiceFactory;
import com.google.appengine.api.datastore.Entity;
import com.google.appengine.api.datastore.Key;
import com.google.appengine.api.datastore.Query;
import com.google.appengine.tools.mapreduce.Input;
import com.google.appengine.tools.mapreduce.InputReader;
import com.google.appengine.tools.mapreduce.MapperJobContext;
import com.google.common.base.Preconditions;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
/**
 * An {@code Input} that reads all entities of a single datastore kind, splitting the key
 * range into shards by sampling the reserved {@code __scatter__} property to find
 * approximately balanced split points.
 */
public class DatastoreInput extends Input<Key, Entity> {

  // --------------------------- STATIC FIELDS ---------------------------

  private static final Logger logger = Logger.getLogger(DatastoreInput.class.getName());

  // Reserved datastore property populated (pseudo-randomly) on a fraction of entities;
  // sorting by it yields a uniform random sample of keys for choosing split points.
  private static final String SCATTER_RESERVED_PROPERTY = "__scatter__";

  // Sample this many times more scatter entities than shards so the chosen split
  // points lie closer to the true quantiles of the key distribution.
  private static final int SCATTER_OVERSAMPLE_FACTOR = 32;

  // Shard count used by the single-argument constructor.
  private static final int DEFAULT_SHARD_COUNT = 4;

  private static final long serialVersionUID = -3939543473076385308L;

  // ------------------------------ FIELDS ------------------------------

  // Datastore kind whose entities this input reads. Never null.
  private final String entityKind;

  // Number of parallel shards to split the key range into. Always positive.
  private final int shardCount;

  // --------------------------- CONSTRUCTORS ---------------------------

  /**
   * Creates an input over all entities of {@code entityKind} using the default
   * shard count ({@value #DEFAULT_SHARD_COUNT}).
   *
   * @param entityKind the datastore kind to read; must not be null
   */
  public DatastoreInput(String entityKind) {
    this(entityKind, DEFAULT_SHARD_COUNT);
  }

  /**
   * Creates an input over all entities of {@code entityKind} split into
   * {@code shardCount} shards.
   *
   * @param entityKind the datastore kind to read; must not be null
   * @param shardCount number of shards; must be positive
   * @throws NullPointerException if {@code entityKind} is null
   * @throws IllegalArgumentException if {@code shardCount} is not positive
   */
  public DatastoreInput(String entityKind, int shardCount) {
    // Fail fast: previously a null kind only surfaced inside split(), and a
    // non-positive shard count surfaced even later as a divide-by-zero in
    // chooseSplitPoints().
    this.entityKind = Preconditions.checkNotNull(entityKind, "entityKind");
    Preconditions.checkArgument(shardCount > 0, "shardCount must be positive: %s", shardCount);
    this.shardCount = shardCount;
  }

  // ------------------------ IMPLEMENTING METHODS ------------------------

  /**
   * Splits the key range of {@link #entityKind} into up to {@link #shardCount}
   * readers. Returns an empty list when the kind has no entities. The final
   * reader receives a null end key, which is special-cased to mean
   * [lastKey, Infinity).
   */
  @Override
  public List<? extends InputReader<Key, Entity>> split(
      MapperJobContext<Key, Entity, ?, ?> context) {
    logger.info("Getting input splits for: " + entityKind);
    DatastoreService datastoreService = DatastoreServiceFactory.getDatastoreService();

    Key startKey = getStartKey(entityKind, datastoreService);
    if (startKey == null) {
      // No entities of this kind exist; nothing to read.
      return Collections.emptyList();
    }

    Key lastKey = startKey;
    List<DatastoreInputReader> result = new ArrayList<DatastoreInputReader>();
    for (Key currentKey : chooseSplitPoints(datastoreService)) {
      DatastoreInputReader source = new DatastoreInputReader(entityKind, lastKey, currentKey);
      result.add(source);
      logger.info(
          String.format("Added DatastoreInputSplit %s %s %s", source, lastKey, currentKey));
      lastKey = currentKey;
    }
    // Add in the final split. null is special cased so this split contains
    // [lastKey, Infinity).
    result.add(new DatastoreInputReader(entityKind, lastKey, null));
    return result;
  }

  // -------------------------- INSTANCE METHODS --------------------------

  /**
   * Samples scatter-property entities and returns at most {@code shardCount - 1}
   * keys to use as shard boundaries, in ascending key order.
   */
  private Collection<Key> chooseSplitPoints(DatastoreService datastoreService) {
    int desiredScatterResultCount = shardCount * SCATTER_OVERSAMPLE_FACTOR;
    // Sorting by __scatter__ returns a pseudo-random sample of entities of this kind.
    Query scatter = new Query(entityKind)
        .addSort(SCATTER_RESERVED_PROPERTY)
        .setKeysOnly();
    List<Entity> scatterList = datastoreService.prepare(scatter).asList(
        withLimit(desiredScatterResultCount));
    // Re-sort the random sample by key so evenly spaced picks become split points.
    Collections.sort(scatterList, new Comparator<Entity>() {
      @Override
      public int compare(Entity o1, Entity o2) {
        return o1.getKey().compareTo(o2.getKey());
      }
    });
    Collection<Key> splitKeys = new ArrayList<Key>(shardCount);
    // Possibly use a lower oversampling factor if there aren't enough scatter
    // property-containing entities to fill out the list.
    int usedOversampleFactor = Math.max(1, scatterList.size() / shardCount);
    logger.info("Requested " + desiredScatterResultCount + " scatter entities. Got "
        + scatterList.size() + " so using oversample factor " + usedOversampleFactor);
    // We expect the points to be uniformly randomly distributed. So we
    // act like the first point is the start key (which we already know) and
    // omit it. This converges on correct as the number of samples goes
    // to infinity.
    for (int i = 1; i < shardCount; i++) {
      // This can happen if we don't have as many scatter properties as we want.
      if (i * usedOversampleFactor >= scatterList.size()) {
        break;
      }
      splitKeys.add(scatterList.get(i * usedOversampleFactor).getKey());
    }
    return splitKeys;
  }

  // -------------------------- STATIC METHODS --------------------------

  /**
   * Returns the smallest key of the given kind, or null if the kind has no entities.
   */
  private static Key getStartKey(String entityKind, DatastoreService datastoreService) {
    Query ascending = new Query(entityKind)
        .addSort(Entity.KEY_RESERVED_PROPERTY)
        .setKeysOnly();
    Iterator<Entity> ascendingIt = datastoreService.prepare(ascending).asIterator(withLimit(1));
    if (!ascendingIt.hasNext()) {
      return null;
    }
    return ascendingIt.next().getKey();
  }
}