/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi.stream.io.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; /** * Implementation of demarcator of text lines in the provided * {@link InputStream}. It works similar to the {@link BufferedReader} and its * {@link BufferedReader#readLine()} methods except that it does not create a * String representing the text line and instead returns the offset info for the * computed text line. See {@link #nextOffsetInfo()} and * {@link #nextOffsetInfo(byte[])} for more details. * <p> * NOTE: Not intended for multi-thread usage hence not Thread-safe. * </p> */ public class TextLineDemarcator extends AbstractDemarcator { private static int CR = 13; // \r private static int LF = 10; // \n /** * Constructs an instance of demarcator with provided {@link InputStream} * and default buffer size. */ public TextLineDemarcator(InputStream is) { this(is, INIT_BUFFER_SIZE); } /** * Constructs an instance of demarcator with provided {@link InputStream} * and initial buffer size. */ public TextLineDemarcator(InputStream is, int initialBufferSize) { super(is, Integer.MAX_VALUE, initialBufferSize); } /** * Will compute the next <i>offset info</i> for a text line (line terminated * by either '\r', '\n' or '\r\n'). <br> * The <i>offset info</i> computed and returned as {@link OffsetInfo} where * {@link OffsetInfo#isStartsWithMatch()} will always return true. * * @return offset info */ public OffsetInfo nextOffsetInfo() throws IOException { return this.nextOffsetInfo(null); } /** * Will compute the next <i>offset info</i> for a text line (line terminated * by either '\r', '\n' or '\r\n'). <br> * The <i>offset info</i> computed and returned as {@link OffsetInfo} where * {@link OffsetInfo#isStartsWithMatch()} will return true if * <code>startsWith</code> was successfully matched with the starting bytes * of the text line. * * NOTE: The reason for 2 'nextOffsetInfo(..)' operations is that the * 'startsWith' argument will force the actual token to be extracted and * then matched introducing the overhead for System.arrayCopy and matching * logic which is an optional scenario and is avoided all together if * 'startsWith' is not provided (i.e., null). * * @return offset info */ public OffsetInfo nextOffsetInfo(byte[] startsWith) throws IOException { OffsetInfo offsetInfo = null; byte previousByteVal = 0; byte[] data = null; nextTokenLoop: while (data == null && this.availableBytesLength != -1) { if (this.index >= this.availableBytesLength) { this.fill(); } int delimiterSize = 0; if (this.availableBytesLength != -1) { byte byteVal; int i; for (i = this.index; i < this.availableBytesLength; i++) { byteVal = this.buffer[i]; if (byteVal == LF) { delimiterSize = previousByteVal == CR ? 2 : 1; } else if (previousByteVal == CR) { delimiterSize = 1; i--; } previousByteVal = byteVal; if (delimiterSize > 0) { this.index = i + 1; int size = Math.max(1, this.index - this.mark); offsetInfo = new OffsetInfo(this.offset, size, delimiterSize); this.offset += size; if (startsWith != null) { data = this.extractDataToken(size); } this.mark = this.index; break nextTokenLoop; } } this.index = i; } else { delimiterSize = previousByteVal == CR || previousByteVal == LF ? 1 : 0; if (offsetInfo == null) { int size = this.index - this.mark; if (size > 0) { offsetInfo = new OffsetInfo(this.offset, size, delimiterSize); this.offset += size; } } if (startsWith != null) { data = this.extractDataToken(this.index - this.mark); } } } if (startsWith != null && data != null) { if (startsWith.length > data.length) { offsetInfo.setStartsWithMatch(false); } else { for (int i = 0; i < startsWith.length; i++) { byte sB = startsWith[i]; if (sB != data[i]) { offsetInfo.setStartsWithMatch(false); break; } } } } return offsetInfo; } /** * Container to hold offset and meta info for a computed text line. * The offset and meta info is represented with the following 4 values: * <ul> * <li><i>startOffset</i> - the offset in the overall stream which represents the beginning of the text line</li> * <li><i>length</i> - length of the text line including CRLF characters</li> * <li><i>crlfLength</i> - the length of the CRLF. * Value 0 is returned if text line represents the last text line in the * {@link InputStream} (i.e., EOF) and such line does not terminate with CR or LF or the combination of the two. * Value 1 is returned if text line ends with '\n' or '\r'. * Value 2 is returned if line ends with '\r\n').</li> * <li><i>startsWithMatch</i> - <code>true</code> by default unless <code>startWith</code> bytes are provided and not matched. * See {@link #nextOffsetInfo(byte[])} for more info.</li> * </ul> **/ public static class OffsetInfo { private final long startOffset, length; private final int crlfLength; private boolean startsWithMatch = true; private OffsetInfo(long startOffset, long length, int crlfLength) { this.startOffset = startOffset; this.length = length; this.crlfLength = crlfLength; } public long getStartOffset() { return startOffset; } public long getLength() { return length; } public int getCrlfLength() { return this.crlfLength; } public boolean isStartsWithMatch() { return this.startsWithMatch; } void setStartsWithMatch(boolean startsWithMatch) { this.startsWithMatch = startsWithMatch; } @Override public String toString() { return "offset:" + this.startOffset + "; length:" + this.length + "; crlfLength:" + this.crlfLength; } } }