/*
* Copyright 2011-2014 Proofpoint, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.proofpoint.event.collector.combiner;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.proofpoint.event.client.EventClient;
import com.proofpoint.event.collector.EventPartition;
import com.proofpoint.event.collector.ServerConfig;
import com.proofpoint.log.Logger;
import com.proofpoint.node.NodeInfo;
import com.proofpoint.units.DataSize;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.weakref.jmx.Managed;
import javax.inject.Inject;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.TimeUnit;
import static com.google.common.collect.Maps.newHashMap;
import static com.google.common.collect.Sets.newHashSet;
import static com.proofpoint.event.collector.combiner.CombinedGroup.createInitialCombinedGroup;
import static com.proofpoint.event.collector.combiner.S3StorageHelper.appendSuffix;
import static com.proofpoint.event.collector.combiner.S3StorageHelper.buildS3Location;
import static com.proofpoint.event.collector.combiner.S3StorageHelper.getS3Directory;
import static com.proofpoint.event.collector.combiner.S3StorageHelper.getS3FileName;
import static java.lang.System.currentTimeMillis;
import static org.joda.time.DateTimeZone.UTC;
public class StoredObjectCombiner
{
private static final Logger log = Logger.get(StoredObjectCombiner.class);
// Size threshold used to split staged objects into the "small" and "large" groups.
// The split exists because S3 enforces a minimum object size for server-side combines.
private static final DataSize S3_MINIMUM_COMBINABLE_SIZE = new DataSize(5, DataSize.Unit.MEGABYTE);
// ISO date formatter pinned to UTC; renders the start/end thresholds that are
// compared (as strings) against the date bucket directory names.
private static final DateTimeFormatter DATE_FORMAT = ISODateTimeFormat.date().withZone(UTC);
// Location prefixes of manifests whose recorded source objects did not match the
// objects found in staging. Populated while building combined groups and exposed
// through getBadManifests().
private final Set<URI> badManifests = new ConcurrentSkipListSet<>();
// Identity of this node; compared against a manifest's creator and passed along
// when manifests are created or updated.
private final String nodeId;
// Reads and conditionally replaces the per-partition combined group manifests.
private final CombineObjectMetadataStore metadataStore;
// Lists directories/objects and performs the actual combine operation.
private final StorageSystem storageSystem;
// Receives a CombineCompleted event once an event type has been processed.
private final EventClient eventClient;
// Root under which staged (not yet combined) objects are listed.
private final URI stagingBaseUri;
// Root under which combined target objects are written.
private final URI targetBaseUri;
// Size in bytes at which a combined object stops accepting additional parts.
private final long targetFileSize;
// When true, exceptions thrown while creating a combined object are logged and
// swallowed; when false they propagate to the caller.
private final boolean ignoreErrors;
// When true, the start/end date window check is skipped and every date bucket is processed.
private final boolean disableStartEndFiltering;
// Window bounds, expressed as days before today's UTC midnight. Buckets dated
// before the start bound, or on/after the end bound, are skipped.
private final int startDaysAgo;
private final int endDaysAgo;
// Identifier reported in the CombineCompleted event.
private final String groupId;
@Inject
public StoredObjectCombiner(
NodeInfo nodeInfo,
CombineObjectMetadataStore metadataStore,
StorageSystem storageSystem,
ServerConfig config,
EventClient eventClient)
{
Preconditions.checkNotNull(nodeInfo, "nodeInfo is null");
Preconditions.checkNotNull(metadataStore, "metadataStore is null");
Preconditions.checkNotNull(storageSystem, "storageSystem is null");
Preconditions.checkNotNull(eventClient, "eventClient is null");
Preconditions.checkNotNull(config, "config is null");
Preconditions.checkArgument(config.isCombinerStartEndDaysSane(), "combinerStartDaysAgo must be greater than endDaysAgo");
this.nodeId = nodeInfo.getNodeId();
this.metadataStore = metadataStore;
this.storageSystem = storageSystem;
this.eventClient = eventClient;
this.stagingBaseUri = URI.create(config.getS3StagingLocation());
this.targetBaseUri = URI.create(config.getS3DataLocation());
this.targetFileSize = config.getTargetFileSize().toBytes();
this.ignoreErrors = true;
this.disableStartEndFiltering = config.isCombinerDateRangeLimitDisabled();
this.startDaysAgo = config.getCombinerStartDaysAgo();
this.endDaysAgo = config.getCombinerEndDaysAgo();
this.groupId = config.getCombinerGroupId();
}
/**
 * Creates a combiner from explicit values rather than server configuration.
 * <p>
 * Instances built this way propagate failures raised while creating combined
 * objects, and always apply the start/end date window.
 *
 * @param nodeId id of this node
 * @param metadataStore store holding the combined group manifests
 * @param storageSystem storage used to list and combine objects
 * @param eventClient client used to announce completed combines
 * @param stagingBaseUri root of the staging area
 * @param targetBaseUri root of the combined output area
 * @param targetFileSize size at which a combined object stops accepting parts
 * @param startDaysAgo start of the window, in days before today; must exceed {@code endDaysAgo}
 * @param endDaysAgo end of the window, in days before today
 * @param groupId identifier reported in completion events (not validated here)
 * @throws NullPointerException if any of the checked reference arguments is null
 * @throws IllegalArgumentException if {@code startDaysAgo <= endDaysAgo}
 */
public StoredObjectCombiner(
        String nodeId,
        CombineObjectMetadataStore metadataStore,
        StorageSystem storageSystem,
        EventClient eventClient,
        URI stagingBaseUri,
        URI targetBaseUri,
        DataSize targetFileSize,
        int startDaysAgo,
        int endDaysAgo,
        String groupId)
{
    // Argument validation, in declaration order.
    Preconditions.checkNotNull(nodeId, "nodeId is null");
    Preconditions.checkNotNull(metadataStore, "metadataStore is null");
    Preconditions.checkNotNull(storageSystem, "storageSystem is null");
    Preconditions.checkNotNull(eventClient, "eventClient is null");
    Preconditions.checkNotNull(stagingBaseUri, "stagingBaseUri is null");
    Preconditions.checkNotNull(targetBaseUri, "targetBaseUri is null");
    Preconditions.checkNotNull(targetFileSize, "targetFileSize is null");
    Preconditions.checkArgument(startDaysAgo > endDaysAgo, "startDaysAgo must be greater than endDaysAgo");

    // Fixed behaviour for explicitly constructed instances: fail loudly, always filter by date.
    this.ignoreErrors = false;
    this.disableStartEndFiltering = false;

    // Collaborators.
    this.nodeId = nodeId;
    this.metadataStore = metadataStore;
    this.storageSystem = storageSystem;
    this.eventClient = eventClient;

    // Locations and sizing.
    this.stagingBaseUri = stagingBaseUri;
    this.targetBaseUri = targetBaseUri;
    this.targetFileSize = targetFileSize.toBytes();

    // Date window and reporting id.
    this.startDaysAgo = startDaysAgo;
    this.endDaysAgo = endDaysAgo;
    this.groupId = groupId;
}
/**
 * Returns the location prefixes of manifests whose recorded source objects
 * were found not to match the objects in the staging area.
 * <p>
 * The result is an immutable point-in-time snapshot: callers cannot alter the
 * combiner's internal bookkeeping through it, and it does not change while
 * being read even if a combine is running concurrently.
 *
 * @return immutable copy of the bad manifest locations, in the internal set's iteration order
 */
@Managed
public Set<URI> getBadManifests()
{
    return ImmutableSet.copyOf(badManifests);
}
/**
 * Combines staged objects for every event type found directly under the
 * staging base location.
 * <p>
 * The date window is computed once and then applied to each event type
 * directory via {@link #combineObjects(URI, String, String)}.
 */
public void combineAllObjects()
{
    // Fix the window before listing so every event type sees the same bounds.
    final String windowStart = createPartitionForDate(getStartDate());
    final String windowEnd = createPartitionForDate(getEndDate());

    List<URI> eventTypeDirectories = storageSystem.listDirectories(stagingBaseUri);
    for (URI eventTypeDirectory : eventTypeDirectories) {
        combineObjects(eventTypeDirectory, windowStart, windowEnd);
    }
}
/**
 * Lists the event types that currently have a directory in the staging area.
 * <p>
 * Each entry is the name component (as extracted by {@code getS3FileName}) of
 * one directory found directly under the staging base location. The result is
 * a transformed view over the directory listing, not a list of URIs.
 *
 * @return event type names, one per staging directory
 */
public List<String> listEventTypes()
{
    List<URI> eventTypeDirectories = storageSystem.listDirectories(stagingBaseUri);

    // Maps a staging directory URI to its event type name.
    Function<URI, String> toEventTypeName = new Function<URI, String>()
    {
        @Override
        public String apply(URI directory)
        {
            return getS3FileName(directory);
        }
    };

    return Lists.transform(eventTypeDirectories, toEventTypeName);
}
/**
 * Combines staged objects for a single event type, using the configured
 * start/end date window.
 *
 * @param eventType name of the event type; it is resolved to the directory
 * {@code <stagingBase>/<eventType>/}
 */
public void combineObjects(String eventType)
{
    URI eventTypeBaseUri = buildS3Location(stagingBaseUri, eventType + "/");
    String windowStart = createPartitionForDate(getStartDate());
    String windowEnd = createPartitionForDate(getEndDate());

    combineObjects(eventTypeBaseUri, windowStart, windowEnd);
}
/**
 * Partitions the staged objects of one event partition by size and combines
 * each partition separately through {@link #combineObjectGroup}.
 * <p>
 * Two groups are required because combines are performed server-side and S3
 * imposes a minimum object size for that feature. Zero-length objects are
 * dropped from both groups.
 *
 * @param eventPartition the event partition to combine
 * @param baseURI base target filename
 * @param stagedObjects list of all staged objects in partition
 */
@VisibleForTesting
void combineObjects(EventPartition eventPartition, URI baseURI, List<StoredObject> stagedObjects)
{
    int capacity = stagedObjects.size();
    List<StoredObject> belowMinimum = Lists.newArrayListWithCapacity(capacity);
    List<StoredObject> atOrAboveMinimum = Lists.newArrayListWithCapacity(capacity);

    long minimumCombinableBytes = S3_MINIMUM_COMBINABLE_SIZE.toBytes();
    for (StoredObject candidate : stagedObjects) {
        long size = candidate.getSize();
        if (size == 0) {
            // nothing to contribute; empty objects are never combined
            continue;
        }
        List<StoredObject> destination = (size < minimumCombinableBytes) ? belowMinimum : atOrAboveMinimum;
        destination.add(candidate);
    }

    combineObjectGroup(eventPartition, "small", baseURI, belowMinimum);
    combineObjectGroup(eventPartition, "large", baseURI, atOrAboveMinimum);
}
/**
 * Combines the staged objects of one event type, walking its date buckets and
 * the hour buckets beneath each of them.
 * <p>
 * Unless date filtering is disabled, a date bucket is processed only when its
 * name is on or after {@code startDate} and strictly before {@code endDate}
 * (plain string comparison). A {@code CombineCompleted} event is posted once
 * the whole event type has been walked.
 *
 * @param eventTypeBaseUri The base URI containing all of the events of a single type to combine.
 * @param startDate The earliest date bucket to combine (inclusive).
 * @param endDate The first date bucket that is too recent to combine (exclusive).
 */
private void combineObjects(URI eventTypeBaseUri, String startDate, String endDate)
{
    String eventType = getS3FileName(eventTypeBaseUri);
    log.info("starting combining objects of type %s", eventType);

    for (URI dateBaseUri : storageSystem.listDirectories(eventTypeBaseUri)) {
        String dateBucket = getS3FileName(dateBaseUri);

        // outside the [startDate, endDate) window: leave this bucket alone
        if (!disableStartEndFiltering
                && (olderThanThreshold(dateBucket, startDate) || newerThanOrEqualToThreshold(dateBucket, endDate))) {
            continue;
        }

        for (URI hourBaseUri : storageSystem.listDirectories(dateBaseUri)) {
            log.info("combining staging bucket: %s", hourBaseUri);

            String hour = getS3FileName(hourBaseUri);
            URI hourStagingArea = buildS3Location(dateBaseUri, hour + "/");
            List<StoredObject> stagedObjects = storageSystem.listObjects(hourStagingArea);
            if (stagedObjects.isEmpty()) {
                continue;
            }

            EventPartition partition = new EventPartition(eventType, dateBucket, hour);
            URI targetLocation = buildS3Location(targetBaseUri, eventType, dateBucket, hour);
            combineObjects(partition, targetLocation, stagedObjects);
        }
    }

    log.info("finished combining objects of type %s", eventType);
    eventClient.post(new CombineCompleted(groupId, eventType));
}
/**
 * Brings one size group of an event partition up to date.
 * <p>
 * Steps, in order: load (or create) the group's manifest; bail out if another
 * node created it and it was updated less than five minutes ago; extend the
 * manifest with the newly staged objects using {@link #buildCombinedGroup};
 * persist the new manifest; finally call {@link #createCombinedObject} for
 * every combined object that is missing on the target or whose size differs
 * from the manifest, provided all of its source parts are still staged.
 *
 * @param eventPartition the event partition to combine
 * @param sizeName either "small" or "large"
 * @param baseURI base target filename
 * @param stagedObjects list of all staged objects in partition for size group
 */
private void combineObjectGroup(EventPartition eventPartition, String sizeName, URI baseURI, List<StoredObject> stagedObjects)
{
    if (stagedObjects.isEmpty()) {
        return;
    }

    URI groupBaseUri = appendSuffix(baseURI, sizeName);

    CombinedGroup manifest = metadataStore.getCombinedGroupManifest(eventPartition, sizeName);
    if (manifest == null) {
        manifest = createInitialCombinedGroup(groupBaseUri, nodeId);
        log.info("creating new combined group: %s", manifest);
    }

    // a foreign manifest may only be taken over once it has been idle for a while
    boolean createdByThisNode = nodeId.equals(manifest.getCreator());
    if (!createdByThisNode && !groupIsMinutesOld(manifest, 5)) {
        log.warn("this node cannot update this group: %s", manifest);
        return;
    }

    // derive the next manifest and try to swap it in for the one we read
    CombinedGroup updatedManifest = buildCombinedGroup(manifest, stagedObjects);
    boolean manifestWritten = metadataStore.replaceCombinedGroupManifest(eventPartition, sizeName, manifest, updatedManifest);
    if (!manifestWritten) {
        log.warn("failed to write manifest: %s.%s", eventPartition, sizeName);
        return;
    }

    // index what is already present on the target by location
    Map<URI, StoredObject> targetObjectsByLocation = newHashMap();
    for (StoredObject targetObject : storageSystem.listObjects(getS3Directory(groupBaseUri))) {
        targetObjectsByLocation.put(targetObject.getLocation(), targetObject);
    }

    for (CombinedStoredObject combined : updatedManifest.getCombinedObjects()) {
        // a combine is only possible while every source part is still staged
        if (!allPartsAvailable(stagedObjects, combined.getSourceParts())) {
            continue;
        }

        StoredObject onTarget = targetObjectsByLocation.get(combined.getLocation());
        boolean upToDate = (onTarget != null) && (onTarget.getSize() == combined.getSize());
        if (upToDate) {
            continue;
        }

        log.info("creating combined object: %s", combined);
        createCombinedObject(combined);
    }
}
/**
 * Add new staged objects to the existing manifest and return the new
 * manifest. Staged objects are added to an existing combined object
 * if possible, otherwise a new one is created.
 * <p>
 * A staged object counts as new when its location is not among the source
 * parts of any combined object already in the manifest. An existing combined
 * object accepts a new part only if it is below the target file size, all of
 * its source part locations are still staged, and its source parts equal the
 * staged objects; a mismatch on the last check records the manifest location
 * prefix as a bad manifest (logged the first time only).
 * <p>
 * The lookup sets over {@code stagedObjects} are built once per call because
 * the staged list does not change while the manifest is being extended.
 *
 * @param group the existing manifest
 * @param stagedObjects list of staged objects to add to manifest
 * @return the new manifest, or {@code group} itself when there is nothing new to add
 */
private CombinedGroup buildCombinedGroup(CombinedGroup group, List<StoredObject> stagedObjects)
{
    // get all objects that have already been combined
    Set<StoredObject> alreadyCombinedObjects = newHashSet();
    for (CombinedStoredObject combinedObject : group.getCombinedObjects()) {
        alreadyCombinedObjects.addAll(combinedObject.getSourceParts());
    }

    // get new objects that still need to be assigned to a combined object
    List<StoredObject> newObjects = getNewObjects(stagedObjects, alreadyCombinedObjects);
    if (newObjects.isEmpty()) {
        return group;
    }

    // staged objects are loop-invariant: index them once instead of per candidate
    Set<StoredObject> stagedSet = ImmutableSet.copyOf(stagedObjects);
    Set<URI> stagedLocations = ImmutableSet.copyOf(Iterables.transform(stagedObjects, StoredObject.GET_LOCATION_FUNCTION));

    // add each new object to a combined object
    for (StoredObject newObject : newObjects) {
        boolean added = false;

        // try to find an open combined object; note that group is re-read here on
        // every pass, so objects added for earlier new objects are candidates too
        for (CombinedStoredObject combinedObject : group.getCombinedObjects()) {
            // skip if combined object is at target size
            if (combinedObject.getSize() >= targetFileSize) {
                continue;
            }

            Collection<StoredObject> sourceParts = combinedObject.getSourceParts();

            // skip if any parts are no longer available
            if (!stagedLocations.containsAll(Collections2.transform(sourceParts, StoredObject.GET_LOCATION_FUNCTION))) {
                continue;
            }

            // skip if objects in staging do not match the current combined object
            if (!stagedSet.containsAll(sourceParts)) {
                if (badManifests.add(group.getLocationPrefix())) {
                    log.error("manifest source objects do not match objects in staging area: %s", group.getLocationPrefix());
                }
                continue;
            }

            // add object to combined object
            group = group.updateCombinedObject(nodeId, combinedObject.addPart(newObject));
            added = true;
            break;
        }

        // create new combined object if necessary
        if (!added) {
            group = group.addNewCombinedObject(nodeId, ImmutableList.of(newObject));
        }
    }

    return group;
}
/**
 * Asks the storage system to create the given combined object, honouring this
 * instance's error policy: failures propagate unless {@code ignoreErrors} is
 * set, in which case they are logged and swallowed.
 *
 * @param object the combined object to create
 */
private void createCombinedObject(CombinedStoredObject object)
{
    if (!ignoreErrors) {
        storageSystem.createCombinedObject(object);
        return;
    }
    createCombinedObjectIgnoringErrors(object);
}
/**
 * Best-effort variant of the combine call: any {@link Exception} raised by
 * the storage system is logged at error level and not rethrown.
 *
 * @param object the combined object to create
 */
private void createCombinedObjectIgnoringErrors(CombinedStoredObject object)
{
    try {
        storageSystem.createCombinedObject(object);
    }
    catch (Exception failure) {
        // deliberate: a failed combine must not abort the remaining work
        log.error(failure, "create combined object failed");
    }
}
/**
 * Selects the staged objects that have not been combined yet.
 * <p>
 * Matching is done on location only: a staged object is considered new when
 * no already-combined object shares its location.
 *
 * @param stagedObjects objects currently in staging
 * @param alreadyCombinedObjects source parts already referenced by the manifest
 * @return the staged objects whose location is not yet combined, in staging order
 */
private static List<StoredObject> getNewObjects(List<StoredObject> stagedObjects, Set<StoredObject> alreadyCombinedObjects)
{
    Set<URI> combinedLocations = newHashSet(Iterables.transform(alreadyCombinedObjects, StoredObject.GET_LOCATION_FUNCTION));

    List<StoredObject> pending = Lists.newArrayList();
    for (StoredObject candidate : stagedObjects) {
        if (combinedLocations.contains(candidate.getLocation())) {
            continue;
        }
        pending.add(candidate);
    }
    return pending;
}
/**
 * Checks, by location only, that every source part is present among the
 * staged objects.
 *
 * @param stagedObjects objects currently in staging
 * @param sourceParts parts that a combined object is built from
 * @return true if each source part's location is also a staged object's location
 */
private static boolean allPartsAvailable(Collection<StoredObject> stagedObjects, Collection<StoredObject> sourceParts)
{
    return containsAll(
            Collections2.transform(stagedObjects, StoredObject.GET_LOCATION_FUNCTION),
            Collections2.transform(sourceParts, StoredObject.GET_LOCATION_FUNCTION));
}
/**
 * Set-based containment test: true when every element of {@code target} is
 * equal to some element of {@code source}. Both arguments are copied into
 * immutable sets first, so duplicates and ordering are irrelevant.
 *
 * @param source the elements that are available
 * @param target the elements that are required
 * @return true if {@code source} covers {@code target}
 */
private static boolean containsAll(Iterable<?> source, Iterable<?> target)
{
    ImmutableSet<?> available = ImmutableSet.copyOf(source);
    ImmutableSet<?> required = ImmutableSet.copyOf(target);
    return available.containsAll(required);
}
/**
 * Tells whether at least {@code minutes} minutes of wall-clock time have
 * passed since the group's updated timestamp.
 *
 * @param group the manifest whose age is checked
 * @param minutes minimum age, in minutes
 * @return true if the elapsed time is greater than or equal to the given number of minutes
 */
private static boolean groupIsMinutesOld(CombinedGroup group, int minutes)
{
    long elapsedMillis = currentTimeMillis() - group.getUpdatedTimestamp();
    long requiredMillis = TimeUnit.MINUTES.toMillis(minutes);
    return elapsedMillis >= requiredMillis;
}
/**
 * Compares a date bucket name with a threshold using plain string ordering.
 *
 * @param dateBucket name of the date bucket
 * @param thresholdDate threshold to compare against
 * @return true if {@code dateBucket} sorts strictly before {@code thresholdDate}
 */
private static boolean olderThanThreshold(String dateBucket, String thresholdDate)
{
    int ordering = dateBucket.compareTo(thresholdDate);
    return ordering < 0;
}
/**
 * Compares a date bucket name with a threshold using plain string ordering.
 *
 * @param dateBucket name of the date bucket
 * @param thresholdDate threshold to compare against
 * @return true if {@code dateBucket} equals or sorts after {@code thresholdDate}
 */
private static boolean newerThanOrEqualToThreshold(String dateBucket, String thresholdDate)
{
    int ordering = dateBucket.compareTo(thresholdDate);
    return ordering >= 0;
}
/**
 * Renders a date as the partition name used for date bucket comparison,
 * using the class-wide UTC ISO date formatter.
 *
 * @param startDateMidnight the instant to render
 * @return the formatted date string
 */
private String createPartitionForDate(DateTime startDateMidnight)
{
    String partitionName = DATE_FORMAT.print(startDateMidnight);
    return partitionName;
}
/**
 * Computes the end bound of the combine window: the start of the current UTC
 * day, moved back by {@code endDaysAgo} days.
 *
 * @return the end bound of the window
 */
private DateTime getEndDate()
{
    DateTime startOfTodayUtc = DateTime.now(UTC).withTimeAtStartOfDay();
    return startOfTodayUtc.minusDays(endDaysAgo);
}
/**
 * Computes the start bound of the combine window: the start of the current
 * UTC day, moved back by {@code startDaysAgo} days.
 *
 * @return the start bound of the window
 */
private DateTime getStartDate()
{
    DateTime startOfTodayUtc = DateTime.now(UTC).withTimeAtStartOfDay();
    return startOfTodayUtc.minusDays(startDaysAgo);
}
}