/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.provenance.serialization;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Optional;
import java.util.zip.GZIPInputStream;

import org.apache.nifi.provenance.ProvenanceEventRecord;
import org.apache.nifi.provenance.StandardProvenanceEventRecord;
import org.apache.nifi.provenance.toc.TocReader;
import org.apache.nifi.stream.io.ByteCountingInputStream;
import org.apache.nifi.stream.io.LimitingInputStream;
import org.apache.nifi.stream.io.StreamUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for {@link RecordReader} implementations that read Provenance Event logs
 * which may be GZIP-compressed on a per-block basis. The reader uses an optional
 * Table-of-Contents ({@link TocReader}) to locate compression-block boundaries so that
 * it can skip directly to a given block or event without decompressing everything
 * before it.
 *
 * <p>Subclasses supply the serialization-format-specific logic by implementing
 * {@link #nextRecord(DataInputStream, int)} and optionally overriding
 * {@link #readHeader(DataInputStream, int)}.</p>
 *
 * <p>NOTE(review): this class is not thread-safe; it maintains mutable stream state
 * (current block, pushback event) and is intended for single-threaded use.</p>
 */
public abstract class CompressableRecordReader implements RecordReader {
    private static final Logger logger = LoggerFactory.getLogger(CompressableRecordReader.class);

    /** Counts bytes consumed from the raw (compressed) stream; used with the TOC to map offsets to blocks. */
    private final ByteCountingInputStream rawInputStream;
    private final String filename;
    private final int serializationVersion;
    private final boolean compressed;
    private final TocReader tocReader;
    /** Length of the header (repo class name + serialization version) that precedes the first record. */
    private final int headerLength;
    private final int maxAttributeChars;

    // Re-created for each compression block; see resetStreamForNextBlock().
    private DataInputStream dis;
    private ByteCountingInputStream byteCountingIn;

    // Event read ahead by skipToEvent() and returned by the next call to nextRecord().
    private StandardProvenanceEventRecord pushbackEvent = null;

    public CompressableRecordReader(final InputStream in, final String filename, final int maxAttributeChars) throws IOException {
        this(in, filename, null, maxAttributeChars);
    }

    /**
     * Creates a reader over the given stream.
     *
     * @param in the raw (possibly compressed) event log stream
     * @param filename the name of the log file; a ".gz" suffix indicates per-block GZIP compression
     * @param tocReader optional Table-of-Contents used to bound reads to a single compression block; may be null
     * @param maxAttributeChars maximum number of characters to retain for any FlowFile attribute
     * @throws IOException if the header cannot be read
     */
    public CompressableRecordReader(final InputStream in, final String filename, final TocReader tocReader, final int maxAttributeChars) throws IOException {
        logger.trace("Creating RecordReader for {}", filename);

        rawInputStream = new ByteCountingInputStream(in);
        this.maxAttributeChars = maxAttributeChars;

        // Limit the readable region to the first compression block, if the TOC knows where block 1 starts.
        final InputStream limitedStream;
        if (tocReader == null) {
            limitedStream = rawInputStream;
        } else {
            final long offset1 = tocReader.getBlockOffset(1);
            if (offset1 < 0) {
                limitedStream = rawInputStream;
            } else {
                limitedStream = new LimitingInputStream(rawInputStream, offset1 - rawInputStream.getBytesConsumed());
            }
        }

        final InputStream readableStream;
        if (filename.endsWith(".gz")) {
            readableStream = new BufferedInputStream(new GZIPInputStream(limitedStream));
            compressed = true;
        } else {
            readableStream = new BufferedInputStream(limitedStream);
            compressed = false;
        }

        byteCountingIn = new ByteCountingInputStream(readableStream);
        dis = new DataInputStream(byteCountingIn);

        final String repoClassName = dis.readUTF();
        final int serializationVersion = dis.readInt();
        headerLength = repoClassName.getBytes(StandardCharsets.UTF_8).length + 2 + 4; // 2 bytes for string length, 4 for integer.

        this.serializationVersion = serializationVersion;
        this.filename = filename;
        this.tocReader = tocReader;

        readHeader(dis, serializationVersion);
    }

    @Override
    public void skipToBlock(final int blockIndex) throws IOException {
        if (tocReader == null) {
            throw new IllegalStateException("Cannot skip to block " + blockIndex + " for Provenance Log " + filename
                + " because no Table-of-Contents file was found for this Log");
        }

        if (blockIndex < 0) {
            throw new IllegalArgumentException("Cannot skip to block " + blockIndex + " because the value is negative");
        }

        if (blockIndex == getBlockIndex()) {
            return;
        }

        final long offset = tocReader.getBlockOffset(blockIndex);
        if (offset < 0) {
            throw new IOException("Unable to find block " + blockIndex + " in Provenance Log " + filename);
        }

        final long curOffset = rawInputStream.getBytesConsumed();

        final long bytesToSkip = offset - curOffset;
        if (bytesToSkip >= 0) {
            try {
                StreamUtils.skip(rawInputStream, bytesToSkip);
                logger.debug("Skipped stream from offset {} to {} ({} bytes skipped)", curOffset, offset, bytesToSkip);
            } catch (final EOFException eof) {
                // EOFException has no cause constructor, so chain the original via initCause()
                // rather than dropping it.
                final EOFException ex = new EOFException("Attempted to skip to byte offset " + offset + " for " + filename
                    + " but file does not have that many bytes (TOC Reader=" + getTocReader() + ")");
                ex.initCause(eof);
                throw ex;
            } catch (final IOException e) {
                throw new IOException("Failed to skip to offset " + offset + " for block " + blockIndex + " of Provenance Log " + filename, e);
            }

            resetStreamForNextBlock();
        }
    }

    /**
     * Rebuilds the decompressing/buffering stream stack starting at the raw stream's current
     * position, bounded (when a TOC is available) at the start of the following block. Each
     * compression block is an independent GZIP member, so a fresh GZIPInputStream is required
     * per block.
     */
    private void resetStreamForNextBlock() throws IOException {
        final InputStream limitedStream;
        if (tocReader == null) {
            limitedStream = rawInputStream;
        } else {
            final long offset = tocReader.getBlockOffset(1 + getBlockIndex());
            if (offset < 0) {
                limitedStream = rawInputStream;
            } else {
                limitedStream = new LimitingInputStream(rawInputStream, offset - rawInputStream.getBytesConsumed());
            }
        }

        final InputStream readableStream;
        if (compressed) {
            readableStream = new BufferedInputStream(new GZIPInputStream(limitedStream));
        } else {
            readableStream = new BufferedInputStream(limitedStream);
        }

        // Seed the new counter with the raw position so getBytesConsumed() stays monotonic across blocks.
        byteCountingIn = new ByteCountingInputStream(readableStream, rawInputStream.getBytesConsumed());
        dis = new DataInputStream(byteCountingIn);
    }

    @Override
    public TocReader getTocReader() {
        return tocReader;
    }

    @Override
    public boolean isBlockIndexAvailable() {
        return tocReader != null;
    }

    @Override
    public int getBlockIndex() {
        if (tocReader == null) {
            throw new IllegalStateException("Cannot determine Block Index because no Table-of-Contents could be found for Provenance Log " + filename);
        }

        return tocReader.getBlockIndex(rawInputStream.getBytesConsumed());
    }

    @Override
    public long getBytesConsumed() {
        return byteCountingIn.getBytesConsumed();
    }

    /**
     * Indicates whether any more data is available to read. If the current block is exhausted,
     * attempts to advance to the next block before answering.
     *
     * @return true if at least one more byte can be read; false on end-of-data or I/O failure
     */
    @Override
    public boolean isData() {
        try {
            byteCountingIn.mark(1);
            int nextByte = byteCountingIn.read();
            byteCountingIn.reset();

            if (nextByte < 0) {
                try {
                    resetStreamForNextBlock();
                } catch (final EOFException eof) {
                    return false;
                }

                byteCountingIn.mark(1);
                nextByte = byteCountingIn.read();
                byteCountingIn.reset();
            }

            return nextByte >= 0;
        } catch (final IOException ioe) {
            return false;
        }
    }

    /**
     * Returns the largest Event ID in this log, or -1 if the log contains no complete events.
     * Uses the TOC (when available) to jump directly to the last block instead of scanning
     * the whole file.
     */
    @Override
    public long getMaxEventId() throws IOException {
        if (tocReader != null) {
            final long lastBlockOffset = tocReader.getLastBlockOffset();
            skipToBlock(tocReader.getBlockIndex(lastBlockOffset));
        }

        ProvenanceEventRecord record;
        ProvenanceEventRecord lastRecord = null;
        try {
            while ((record = nextRecord()) != null) {
                lastRecord = record;
            }
        } catch (final EOFException eof) {
            // This can happen if we stop NiFi while the record is being written.
            // This is OK, we just ignore this record. The session will not have been
            // committed, so we can just process the FlowFile again.
        }

        return lastRecord == null ? -1L : lastRecord.getEventId();
    }

    @Override
    public void close() throws IOException {
        logger.trace("Closing Record Reader for {}", filename);

        // Close all resources even if an earlier close() throws.
        try {
            dis.close();
        } finally {
            try {
                rawInputStream.close();
            } finally {
                if (tocReader != null) {
                    tocReader.close();
                }
            }
        }
    }

    @Override
    public void skip(final long bytesToSkip) throws IOException {
        StreamUtils.skip(dis, bytesToSkip);
    }

    @Override
    public void skipTo(final long position) throws IOException {
        // we are subtracting headerLength from the number of bytes consumed because we used to
        // consider the offset of the first record "0" - now we consider it whatever position
        // it really is in the stream.
        final long currentPosition = byteCountingIn.getBytesConsumed() - headerLength;
        if (currentPosition == position) {
            return;
        }

        if (currentPosition > position) {
            throw new IOException("Cannot skip to byte offset " + position + " in stream because already at byte offset " + currentPosition);
        }

        final long toSkip = position - currentPosition;
        StreamUtils.skip(dis, toSkip);
    }

    protected String getFilename() {
        return filename;
    }

    protected int getMaxAttributeLength() {
        return maxAttributeChars;
    }

    /**
     * Reads the next event, returning any event pushed back by {@link #skipToEvent(long)}
     * first. Events that fail to deserialize due to non-I/O errors are logged and skipped so
     * that one corrupt event does not prevent iteration over the rest of the log.
     *
     * @return the next event, or null if no more data is available
     */
    @Override
    public StandardProvenanceEventRecord nextRecord() throws IOException {
        if (pushbackEvent != null) {
            final StandardProvenanceEventRecord toReturn = pushbackEvent;
            pushbackEvent = null;
            return toReturn;
        }

        if (isData()) {
            while (true) {
                try {
                    return nextRecord(dis, serializationVersion);
                } catch (final IOException ioe) {
                    throw ioe;
                } catch (final Exception e) {
                    // This would only happen if a bug were to exist such that an 'invalid' event were written
                    // out. For example an Event that has no FlowFile UUID. While there is in fact an underlying
                    // cause that would need to be sorted out in this case, the Provenance Repository should be
                    // resilient enough to handle this. Otherwise, we end up throwing an Exception, which may
                    // prevent iterating over additional events in the repository.
                    logger.error("Failed to read Provenance Event from {}; will skip this event and continue reading subsequent events", filename, e);
                }
            }
        } else {
            return null;
        }
    }

    /**
     * Looks up the compression block containing the given Event ID via the TOC.
     *
     * @return the block index, or empty if no TOC exists or the TOC has no entry for this event
     */
    protected Optional<Integer> getBlockIndex(final long eventId) {
        final TocReader tocReader = getTocReader();
        if (tocReader == null) {
            return Optional.empty();
        } else {
            final Integer blockIndex = tocReader.getBlockIndexForEventId(eventId);
            return Optional.ofNullable(blockIndex);
        }
    }

    /**
     * Advances the reader to the first event whose ID is &gt;= the given Event ID. The matched
     * event is pushed back so the next call to {@link #nextRecord()} returns it.
     *
     * @return the matched event, or empty if no such event exists in this log
     */
    @Override
    public Optional<ProvenanceEventRecord> skipToEvent(final long eventId) throws IOException {
        if (pushbackEvent != null) {
            final StandardProvenanceEventRecord previousPushBack = pushbackEvent;
            if (previousPushBack.getEventId() >= eventId) {
                return Optional.of(previousPushBack);
            } else {
                pushbackEvent = null;
            }
        }

        final Optional<Integer> blockIndex = getBlockIndex(eventId);
        if (blockIndex.isPresent()) {
            // Skip to the appropriate block index and then read until we've found an Event
            // that has an ID >= the event id.
            skipToBlock(blockIndex.get());
        }

        try {
            boolean read = true;
            while (read) {
                final Optional<StandardProvenanceEventRecord> eventOptional = this.readToEvent(eventId, dis, serializationVersion);
                if (eventOptional.isPresent()) {
                    pushbackEvent = eventOptional.get();
                    return Optional.of(pushbackEvent);
                } else {
                    read = isData();
                }
            }

            return Optional.empty();
        } catch (final EOFException eof) {
            // This can occur if we run out of data and attempt to read the next event ID.
            logger.error("Unexpectedly reached end of File when looking for Provenance Event with ID {} in {}", eventId, filename);
            return Optional.empty();
        }
    }

    /**
     * Reads forward until an event with ID &gt;= eventId is found. The stream and version
     * parameters are unused by this default implementation (it delegates to
     * {@link #nextRecord()}) but are provided so subclasses can override with a
     * format-aware fast path.
     */
    protected Optional<StandardProvenanceEventRecord> readToEvent(final long eventId, final DataInputStream dis, final int serializationVersion) throws IOException {
        StandardProvenanceEventRecord event;
        while ((event = nextRecord()) != null) {
            if (event.getEventId() >= eventId) {
                return Optional.of(event);
            }
        }

        return Optional.empty();
    }

    /**
     * Deserializes the next event from the given stream according to the given serialization
     * version. Implemented by format-specific subclasses.
     */
    protected abstract StandardProvenanceEventRecord nextRecord(DataInputStream in, int serializationVersion) throws IOException;

    /**
     * Hook for subclasses to consume any format-specific header that follows the common
     * header (repo class name + serialization version). Default implementation reads nothing.
     */
    protected void readHeader(DataInputStream in, int serializationVersion) throws IOException {
    }
}