package com.scopely.infrastructure.kinesis;

import com.amazonaws.AmazonClientException;
import com.amazonaws.services.kinesis.AmazonKinesis;
import com.amazonaws.services.kinesis.model.DescribeStreamResult;
import com.amazonaws.services.kinesis.model.ProvisionedThroughputExceededException;
import com.amazonaws.services.kinesis.model.PutRecordsRequest;
import com.amazonaws.services.kinesis.model.PutRecordsRequestEntry;
import com.amazonaws.services.kinesis.model.PutRecordsResult;
import com.amazonaws.services.kinesis.model.PutRecordsResultEntry;
import com.amazonaws.services.kinesis.model.ResourceNotFoundException;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.amazonaws.util.IOUtils;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import rx.Observable;
import rx.Subscriber;
import rx.schedulers.Schedulers;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.Predicate;

import static java.util.stream.Collectors.toList;

/**
 * Replays records stored in S3 into a target Kinesis stream.
 *
 * <p>Objects are listed under {@code <sourceStream>/<formatted date>} in the configured bucket,
 * split into newline-delimited Base64 payloads, decoded, batched and written to the target
 * stream with {@code PutRecords}.
 */
public class KinesisPlayer {
    private static final Logger LOGGER = LoggerFactory.getLogger(KinesisPlayer.class);

    /** Maximum number of records handed to the batching operator for a single put request. */
    private static final int MAX_KINESIS_BATCH_SIZE = 500;
    /** Maximum batch weight handed to the batching operator (presumably bytes; confirm in OperatorBufferKinesisBatch). */
    private static final int MAX_KINESIS_BATCH_WEIGHT = 1_000_000;
    /** Time budget, in seconds, for retrying a single put request. */
    private static final int KINESIS_PUT_BATCH_RETRIES_TIMEOUT = 30;

    private final VcrConfiguration vcrConfiguration;
    private final AmazonS3 s3;
    private final AmazonKinesis kinesis;
    private final ExecutorService kinesisWriter = Executors.newFixedThreadPool(10);
    private final int numberOfShards;

    /**
     * Creates a player and validates that both the S3 bucket and the target stream exist.
     *
     * @param vcrConfiguration provides the bucket, source stream and target stream names
     * @param s3               client used to list and read recorded objects
     * @param kinesis          client used to describe and write to the target stream
     * @throws IllegalArgumentException  if the configured bucket does not exist
     * @throws ResourceNotFoundException if the target stream does not exist
     */
    public KinesisPlayer(VcrConfiguration vcrConfiguration, AmazonS3 s3, AmazonKinesis kinesis) {
        this.vcrConfiguration = vcrConfiguration;
        this.s3 = s3;
        this.kinesis = kinesis;

        // Check everything: S3 and Kinesis
        if (!s3.doesBucketExist(vcrConfiguration.bucket)) {
            LOGGER.error("Specified S3 bucket '{}' does not exist", vcrConfiguration.bucket);
            throw new IllegalArgumentException("Bucket not found");
        }

        try {
            DescribeStreamResult describeStreamResult = kinesis.describeStream(vcrConfiguration.targetStream);
            numberOfShards = (int) describeStreamResult
                    .getStreamDescription()
                    .getShards()
                    .stream()
                    /* only take into account open shards */
                    .filter(shard -> shard.getSequenceNumberRange().getEndingSequenceNumber() == null)
                    .count();
        } catch (ResourceNotFoundException e) {
            LOGGER.error("Specified Kinesis stream '{}' not found", vcrConfiguration.targetStream);
            throw e;
        }
    }

    /**
     * Replays every recorded payload between {@code start} and {@code end} into the target stream.
     *
     * <p>Each record is written with a random UUID partition key. A failure while writing a batch
     * is delivered to the subscriber through {@code onError}.
     *
     * @param start beginning of the replay window
     * @param end   end of the replay window; when {@code null} one day starting at {@code start} is replayed
     * @return an observable emitting one result entry per record returned by Kinesis
     */
    public Observable<PutRecordsResultEntry> play(LocalDateTime start, @Nullable LocalDateTime end) {
        return playableObjects(start, end)
                .onBackpressureBuffer()
                .observeOn(Schedulers.io())
                .flatMap(this::objectToPayloads)
                .map(ByteBuffer::wrap)
                .lift(new OperatorBufferKinesisBatch(MAX_KINESIS_BATCH_SIZE, MAX_KINESIS_BATCH_WEIGHT))
                .onBackpressureBuffer()
                .map(byteBuffers -> byteBuffers.stream()
                        .map(buffer -> new PutRecordsRequestEntry()
                                .withData(buffer)
                                .withPartitionKey(UUID.randomUUID().toString()))
                        .collect(toList()))
                .map(entries -> new PutRecordsRequest()
                        .withStreamName(vcrConfiguration.targetStream)
                        .withRecords(entries))
                .observeOn(Schedulers.io())
                .flatMap(putRecordsRequest -> Observable.create((Observable.OnSubscribe<List<PutRecordsResult>>) os -> {
                    // NOTE(review): explicit onStart() call retained; confirm whether it is required here.
                    os.onStart();
                    kinesisWriter.submit(() -> {
                        try {
                            os.onNext(putWithRetry(putRecordsRequest)
                                    .orElse(Collections.<PutRecordsResult>emptyList()));
                            os.onCompleted();
                        } catch (Throwable failure) {
                            // Without this the exception would be captured by the discarded Future and
                            // the subscriber would never receive a terminal event.
                            os.onError(failure);
                        }
                    });
                }))
                .flatMap(Observable::from)
                .flatMap(putRecordsResult -> Observable.from(putRecordsResult.getRecords()))
                .doOnNext(result -> LOGGER.debug("Wrote record. Seq {}, shard {}",
                        result.getSequenceNumber(), result.getShardId()));
    }

    /**
     * Stops the internal writer pool from accepting new work; already submitted batches still run.
     * Call this once replaying is finished so the pool's threads can terminate.
     */
    public void shutdown() {
        kinesisWriter.shutdown();
    }

    /**
     * Tries to send the provided request to kinesis, retrying records that failed to be processed.
     *
     * <p>The request is mutated between attempts: after a partial failure its record list is
     * replaced by the entries that still need to be written.
     *
     * @param putRecordsRequest the batch to write
     * @return the results of every attempt, or an empty optional when the retry runner produced no result
     * @throws RuntimeException wrapping anything thrown by the retry runner
     */
    private Optional<List<PutRecordsResult>> putWithRetry(PutRecordsRequest putRecordsRequest) {
        long totalSize = putRecordsRequest.getRecords().stream().mapToLong(record -> record.getData().limit()).sum();
        LOGGER.info("Sending {} records ({} bytes)", putRecordsRequest.getRecords().size(), totalSize);

        try {
            List<PutRecordsResult> resultSetList = new ArrayList<>();
            Optional<List<PutRecordsResult>> outcome = ExponentialBackoffRunner.run(() -> {
                        PutRecordsResult putRecordsResult = kinesis.putRecords(putRecordsRequest);
                        resultSetList.add(putRecordsResult);

                        if (putRecordsResult.getFailedRecordCount() > 0) {
                            // Result entries are positionally aligned with the request entries.
                            List<PutRecordsRequestEntry> entriesForRetry = new ArrayList<>();
                            for (int i = 0; i < putRecordsResult.getRecords().size(); i++) {
                                PutRecordsResultEntry resultEntry = putRecordsResult.getRecords().get(i);
                                if (resultEntry.getErrorCode() != null) {
                                    entriesForRetry.add(putRecordsRequest.getRecords().get(i));
                                }
                            }

                            putRecordsRequest.withRecords(entriesForRetry);

                            if (!entriesForRetry.isEmpty()) {
                                LOGGER.warn("Retrying {} records", entriesForRetry.size());
                                throw new PartialFailureException();
                            }
                        }
                        return resultSetList;
                    },
                    throwable -> throwable instanceof ProvisionedThroughputExceededException
                            || throwable instanceof AmazonClientException
                            || throwable instanceof PartialFailureException,
                    TimeUnit.SECONDS.toMillis(KINESIS_PUT_BATCH_RETRIES_TIMEOUT));

            if (!outcome.isPresent()) {
                // Make the otherwise silent loss visible to operators.
                LOGGER.error("Put did not complete within {} seconds; {} records were still pending",
                        KINESIS_PUT_BATCH_RETRIES_TIMEOUT, putRecordsRequest.getRecords().size());
            }
            return outcome;
        } catch (Throwable throwable) {
            throw new RuntimeException("Unhandled exception from Kinesis put", throwable);
        }
    }

    /** Signals that some records of a batch were rejected and the remainder must be retried. */
    private static final class PartialFailureException extends RuntimeException {
    }

    /**
     * Reads an S3 object and emits each of its newline-delimited, Base64-encoded payloads decoded.
     *
     * <p>Empty lines are skipped. If the object cannot be read the error is logged and no
     * payloads are emitted.
     *
     * @param summary identifies the object to read
     * @return an observable of decoded payloads, subscribed on the IO scheduler
     */
    public Observable<byte[]> objectToPayloads(S3ObjectSummary summary) {
        LOGGER.info("Found playable object from {} at key '{}'", summary.getLastModified(), summary.getKey());
        List<byte[]> kinesisPayloads = new ArrayList<>();
        try (S3Object s3Object = s3.getObject(summary.getBucketName(), summary.getKey())) {
            byte[] contents = IOUtils.toByteArray(s3Object.getObjectContent());
            int blockStart = 0;
            for (int position = 0; position < contents.length; position++) {
                if (contents[position] != '\n') {
                    continue;
                }
                if (position > blockStart) {
                    // Copy out the range exclusive of our one-byte delimiter
                    kinesisPayloads.add(Arrays.copyOfRange(contents, blockStart, position));
                }
                // Always step past the delimiter, so an empty line never leaks a '\n'
                // into the beginning of the following payload.
                blockStart = position + 1;
            }
            if (blockStart < contents.length) {
                // Trailing payload without a terminating newline
                kinesisPayloads.add(Arrays.copyOfRange(contents, blockStart, contents.length));
            }
        } catch (IOException e) {
            LOGGER.error("Error reading object at key: " + summary.getKey(), e);
        }
        LOGGER.debug("Read {} records from object at key {}", kinesisPayloads.size(), summary.getKey());
        return Observable.from(kinesisPayloads)
                .map(b64Payload -> Base64.getDecoder().decode(b64Payload))
                .subscribeOn(Schedulers.io());
    }

    /**
     * Returns an observable that emits all the S3 objects between the provided start and end date.
     *
     * <p>Only objects whose last-modified time lies strictly between {@code start} and the end of
     * the window (compared at second precision, UTC) are emitted.
     *
     * @param start beginning of the window
     * @param end   end of the window; when {@code null} the window is one day minus one second long
     * @return the matching object summaries
     * @throws IllegalArgumentException if {@code start} is after {@code end}
     */
    public Observable<S3ObjectSummary> playableObjects(@NotNull LocalDateTime start, @Nullable LocalDateTime end) {
        if (end != null && start.isAfter(end)) {
            throw new IllegalArgumentException("startDate > endDate");
        }

        if (end == null) {
            // Since end was not provided, let's assume we want data for the provided start day.
            // That means we must match s3 objects between start and start + 1 day.
            // We subtract 1 second in order to avoid checking for files in the next day folder.
            end = start.plusDays(1).minusSeconds(1);
        }

        final LocalDateTime finalEnd = end;
        final long startEpochSecond = start.atOffset(ZoneOffset.UTC).toEpochSecond();
        final long endEpochSecond = finalEnd.atOffset(ZoneOffset.UTC).toEpochSecond();

        Predicate<Date> dateFilter = date -> {
            long modifiedEpochSecond = date.getTime() / 1000;
            return startEpochSecond < modifiedEpochSecond && endEpochSecond > modifiedEpochSecond;
        };

        return Observable.create(new Observable.OnSubscribe<Observable<S3ObjectSummary>>() {
            @Override
            public void call(Subscriber<? super Observable<S3ObjectSummary>> subscriber) {
                // get all S3 objects for each date between start and end
                LocalDateTime currentDate = start;
                while (!subscriber.isUnsubscribed() && !finalEnd.isBefore(currentDate)) {
                    subscriber.onNext(playableObjects(currentDate));
                    // Advance to the following midnight rather than by 24h from start, so the
                    // folder of the end day is visited even when start is late in its day.
                    // NOTE(review): assumes folders are per day, as the docs here state; confirm against S3RecorderPipeline.FORMATTER.
                    currentDate = currentDate.truncatedTo(ChronoUnit.DAYS).plus(1, ChronoUnit.DAYS);
                }
                subscriber.onCompleted();
            }
        })
                .subscribeOn(Schedulers.io())
                .flatMap(x -> x)
                .filter(x -> dateFilter.test(x.getLastModified()))
                .onBackpressureBuffer();
    }

    /**
     * Returns all objects saved under the provided date folder.
     *
     * <p>Listing pages are fetched until an empty page is returned; fetching a following page is
     * retried on any error for up to two minutes.
     *
     * @param date date whose formatted value forms the key prefix after the source stream name
     * @return the object summaries found under that prefix
     */
    private Observable<S3ObjectSummary> playableObjects(LocalDateTime date) {
        return Observable.create(new Observable.OnSubscribe<Observable<S3ObjectSummary>>() {
            @Override
            public void call(Subscriber<? super Observable<S3ObjectSummary>> subscriber) {
                // list objects under the currentDate folder
                String prefix = vcrConfiguration.sourceStream + "/" + date.format(S3RecorderPipeline.FORMATTER);
                ObjectListing listing = s3.listObjects(vcrConfiguration.bucket, prefix);

                while (!subscriber.isUnsubscribed() && !listing.getObjectSummaries().isEmpty()) {
                    subscriber.onNext(Observable.from(listing.getObjectSummaries()));

                    // Retry on everything!
                    final ObjectListing finalListing = listing;
                    try {
                        listing = ExponentialBackoffRunner.run(() -> s3.listNextBatchOfObjects(finalListing),
                                t -> true,
                                TimeUnit.MINUTES.toMillis(2))
                                .orElseThrow(() -> new TimeoutException("Failed to get a listing in the allotted time"));
                    } catch (Throwable throwable) {
                        throw new RuntimeException(throwable);
                    }
                }
                subscriber.onCompleted();
            }
        }).onBackpressureBuffer().flatMap(x -> x).subscribeOn(Schedulers.io());
    }

    /**
     * @return number of open shards of the target stream, as counted when this player was created
     */
    public int getNumberOfShards() {
        return numberOfShards;
    }
}