/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.task.map; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import com.addthis.bundle.core.Bundle; import com.addthis.bundle.value.ValueObject; import com.addthis.codec.annotations.FieldConfig; import com.addthis.hydra.data.filter.bundle.BundleFilter; import com.yammer.metrics.Metrics; import com.yammer.metrics.core.Counter; /** * This builder gathers, sorts, and de-duplicates incoming bundles. * <p/> * NOTE: this is a lossy operation. The {@code batchSize} field is used * to gather n bundles into a batch. Bundles in that batch are sorted * and de-duplicated based on the {@code field} provided as input to this * class. * <p/> * Because bundles are not emitted until the {@code batchSize} is reached * it is possible that the system will go into shutdown mode and fail to emit * up to {@code batchSize} elements that are in the map but have not yet been * emitted to the processor. (FIXME: StreamBuilder can now shutdown()) */ public class SortedDeDupBuilder extends StreamBuilder { private final ConcurrentSkipListMap<String, Bundle> sortedMap = new ConcurrentSkipListMap<>(); private final Counter dropCounter = Metrics.newCounter(this.getClass(), "dropCounter"); private final Lock flushLock = new ReentrantLock(); @FieldConfig(codable = true) private String field; @FieldConfig(codable = true) private int batchSize = 100; @FieldConfig(codable = true) private BundleFilter filter; @Override public void init() { } @Override public void process(Bundle bundle, StreamEmitter emitter) { if (filter == null || filter.filter(bundle)) { ValueObject valueObject = bundle.getValue(bundle.getFormat().getField(field)); if (valueObject == null) { return; } String keyValue = valueObject.asString().toString(); if (sortedMap.put(keyValue, bundle) != null) { dropCounter.inc(); } if (sortedMap.size() >= batchSize) { if (flushLock.tryLock()) { try { for (Bundle sortedDeDupedBundle : sortedMap.values()) { emitter.emit(sortedDeDupedBundle); } sortedMap.clear(); } finally { flushLock.unlock(); } } } } } }