CodecUtil.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;


import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;

/**
 * Utility class for reading and writing versioned headers.
 * <p>
 * Writing codec headers is useful to ensure that a file is in 
 * the format you think it is.
 * 
 * @lucene.experimental
 */

public final class CodecUtil {
  private CodecUtil() {} // no instance

  /**
   * Constant to identify the start of a codec header.
   */
  public final static int CODEC_MAGIC = 0x3fd76c17;
  /**
   * Constant to identify the start of a codec footer.
   */
  public final static int FOOTER_MAGIC = ~CODEC_MAGIC;

  /**
   * Writes a codec header, which records both a string to
   * identify the file and a version number. This header can
   * be parsed and validated with 
   * {@link #checkHeader(DataInput, String, int, int) checkHeader()}.
   * <p>
   * CodecHeader --> Magic,CodecName,Version
   * <ul>
   *    <li>Magic --> {@link DataOutput#writeInt Uint32}. This
   *        identifies the start of the header. It is always {@value #CODEC_MAGIC}.
   *    <li>CodecName --> {@link DataOutput#writeString String}. This
   *        is a string to identify this file.
   *    <li>Version --> {@link DataOutput#writeInt Uint32}. Records
   *        the version of the file.
   * </ul>
   * <p>
   * Note that the length of a codec header depends only upon the
   * name of the codec, so this length can be computed at any time
   * with {@link #headerLength(String)}.
   * 
   * @param out Output stream
   * @param codec String to identify this file. It should be simple ASCII, 
   *              less than 128 characters in length.
   * @param version Version number
   * @throws IOException If there is an I/O error writing to the underlying medium.
   * @throws IllegalArgumentException If the codec name is not simple ASCII, or is more than 127 characters in length
   */
  public static void writeHeader(DataOutput out, String codec, int version) throws IOException {
    BytesRef bytes = new BytesRef(codec);
    if (bytes.length != codec.length() || bytes.length >= 128) {
      throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]");
    }
    out.writeInt(CODEC_MAGIC);
    out.writeString(codec);
    out.writeInt(version);
  }
  
  /**
   * Writes a codec header for an index file, which records both a string to
   * identify the format of the file, a version number, and data to identify
   * the file instance (ID and auxiliary suffix such as generation).
   * <p>
   * This header can be parsed and validated with 
   * {@link #checkIndexHeader(DataInput, String, int, int, byte[], String) checkIndexHeader()}.
   * <p>
   * IndexHeader --> CodecHeader,ObjectID,ObjectSuffix
   * <ul>
   *    <li>CodecHeader   --> {@link #writeHeader}
   *    <li>ObjectID     --> {@link DataOutput#writeByte byte}<sup>16</sup>
   *    <li>ObjectSuffix --> SuffixLength,SuffixBytes
   *    <li>SuffixLength  --> {@link DataOutput#writeByte byte}
   *    <li>SuffixBytes   --> {@link DataOutput#writeByte byte}<sup>SuffixLength</sup>
   * </ul>
   * <p>
   * Note that the length of an index header depends only upon the
   * name of the codec and suffix, so this length can be computed at any time
   * with {@link #indexHeaderLength(String,String)}.
   * 
   * @param out Output stream
   * @param codec String to identify the format of this file. It should be simple ASCII, 
   *              less than 128 characters in length.
   * @param id Unique identifier for this particular file instance.
   * @param suffix auxiliary suffix information for the file. It should be simple ASCII,
   *              less than 256 characters in length.
   * @param version Version number
   * @throws IOException If there is an I/O error writing to the underlying medium.
   * @throws IllegalArgumentException If the codec name is not simple ASCII, or 
   *         is more than 127 characters in length, or if id is invalid,
   *         or if the suffix is not simple ASCII, or more than 255 characters
   *         in length.
   */
  public static void writeIndexHeader(DataOutput out, String codec, int version, byte[] id, String suffix) throws IOException {
    if (id.length != StringHelper.ID_LENGTH) {
      throw new IllegalArgumentException("Invalid id: " + StringHelper.idToString(id));
    }
    writeHeader(out, codec, version);
    out.writeBytes(id, 0, id.length);
    BytesRef suffixBytes = new BytesRef(suffix);
    if (suffixBytes.length != suffix.length() || suffixBytes.length >= 256) {
      throw new IllegalArgumentException("suffix must be simple ASCII, less than 256 characters in length [got " + suffix + "]");
    }
    out.writeByte((byte) suffixBytes.length);
    out.writeBytes(suffixBytes.bytes, suffixBytes.offset, suffixBytes.length);
  }

  /**
   * Computes the length of a codec header.
   * 
   * @param codec Codec name.
   * @return length of the entire codec header.
   * @see #writeHeader(DataOutput, String, int)
   */
  public static int headerLength(String codec) {
    return 9+codec.length();
  }
  
  /**
   * Computes the length of an index header.
   * 
   * @param codec Codec name.
   * @return length of the entire index header.
   * @see #writeIndexHeader(DataOutput, String, int, byte[], String)
   */
  public static int indexHeaderLength(String codec, String suffix) {
    return headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffix.length();
  }

  /**
   * Reads and validates a header previously written with 
   * {@link #writeHeader(DataOutput, String, int)}.
   * <p>
   * When reading a file, supply the expected <code>codec</code> and
   * an expected version range (<code>minVersion to maxVersion</code>).
   * 
   * @param in Input stream, positioned at the point where the
   *        header was previously written. Typically this is located
   *        at the beginning of the file.
   * @param codec The expected codec name.
   * @param minVersion The minimum supported expected version number.
   * @param maxVersion The maximum supported expected version number.
   * @return The actual version found, when a valid header is found 
   *         that matches <code>codec</code>, with an actual version 
   *         where {@code minVersion <= actual <= maxVersion}.
   *         Otherwise an exception is thrown.
   * @throws CorruptIndexException If the first four bytes are not
   *         {@link #CODEC_MAGIC}, or if the actual codec found is
   *         not <code>codec</code>.
   * @throws IndexFormatTooOldException If the actual version is less 
   *         than <code>minVersion</code>.
   * @throws IndexFormatTooNewException If the actual version is greater 
   *         than <code>maxVersion</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   * @see #writeHeader(DataOutput, String, int)
   */
  public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
    // Safety to guard against reading a bogus string:
    final int actualHeader = in.readInt();
    if (actualHeader != CODEC_MAGIC) {
      throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC, in);
    }
    return checkHeaderNoMagic(in, codec, minVersion, maxVersion);
  }

  /** Like {@link
   *  #checkHeader(DataInput,String,int,int)} except this
   *  version assumes the first int has already been read
   *  and validated from the input. */
  public static int checkHeaderNoMagic(DataInput in, String codec, int minVersion, int maxVersion) throws IOException {
    final String actualCodec = in.readString();
    if (!actualCodec.equals(codec)) {
      throw new CorruptIndexException("codec mismatch: actual codec=" + actualCodec + " vs expected codec=" + codec, in);
    }

    final int actualVersion = in.readInt();
    if (actualVersion < minVersion) {
      throw new IndexFormatTooOldException(in, actualVersion, minVersion, maxVersion);
    }
    if (actualVersion > maxVersion) {
      throw new IndexFormatTooNewException(in, actualVersion, minVersion, maxVersion);
    }

    return actualVersion;
  }
  
  /**
   * Reads and validates a header previously written with 
   * {@link #writeIndexHeader(DataOutput, String, int, byte[], String)}.
   * <p>
   * When reading a file, supply the expected <code>codec</code>,
   * expected version range (<code>minVersion to maxVersion</code>),
   * and object ID and suffix.
   * 
   * @param in Input stream, positioned at the point where the
   *        header was previously written. Typically this is located
   *        at the beginning of the file.
   * @param codec The expected codec name.
   * @param minVersion The minimum supported expected version number.
   * @param maxVersion The maximum supported expected version number.
   * @param expectedID The expected object identifier for this file.
   * @param expectedSuffix The expected auxiliary suffix for this file.
   * @return The actual version found, when a valid header is found 
   *         that matches <code>codec</code>, with an actual version 
   *         where {@code minVersion <= actual <= maxVersion}, 
   *         and matching <code>expectedID</code> and <code>expectedSuffix</code>
   *         Otherwise an exception is thrown.
   * @throws CorruptIndexException If the first four bytes are not
   *         {@link #CODEC_MAGIC}, or if the actual codec found is
   *         not <code>codec</code>, or if the <code>expectedID</code>
   *         or <code>expectedSuffix</code> do not match.
   * @throws IndexFormatTooOldException If the actual version is less 
   *         than <code>minVersion</code>.
   * @throws IndexFormatTooNewException If the actual version is greater 
   *         than <code>maxVersion</code>.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   * @see #writeIndexHeader(DataOutput, String, int, byte[],String)
   */
  public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] expectedID, String expectedSuffix) throws IOException {
    int version = checkHeader(in, codec, minVersion, maxVersion);
    checkIndexHeaderID(in, expectedID);
    checkIndexHeaderSuffix(in, expectedSuffix);
    return version;
  }

  /**
   * Expert: verifies the incoming {@link IndexInput} has an index header
   * and that its segment ID matches the expected one, and then copies
   * that index header into the provided {@link DataOutput}.  This is
   * useful when building compound files.
   *
   * @param in Input stream, positioned at the point where the
   *        index header was previously written. Typically this is located
   *        at the beginning of the file.
   * @param out Output stream, where the header will be copied to.
   * @param expectedID Expected segment ID
   * @throws CorruptIndexException If the first four bytes are not
   *         {@link #CODEC_MAGIC}, or if the <code>expectedID</code>
   *         does not match.
   * @throws IOException If there is an I/O error reading from the underlying medium.
   *
   * @lucene.internal 
   */
  public static void verifyAndCopyIndexHeader(IndexInput in, DataOutput out, byte[] expectedID) throws IOException {
    // make sure it's large enough to have a header and footer
    if (in.length() < footerLength() + headerLength("")) {
      throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: file is too small (" + in.length() + " bytes)", in);
    }

    int actualHeader = in.readInt();
    if (actualHeader != CODEC_MAGIC) {
      throw new CorruptIndexException("compound sub-files must have a valid codec header and footer: codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CodecUtil.CODEC_MAGIC, in);
    }

    // we can't verify these, so we pass-through:
    String codec = in.readString();
    int version = in.readInt();

    // verify id:
    checkIndexHeaderID(in, expectedID);

    // we can't verify extension either, so we pass-through:
    int suffixLength = in.readByte() & 0xFF;
    byte[] suffixBytes = new byte[suffixLength];
    in.readBytes(suffixBytes, 0, suffixLength);

    // now write the header we just verified
    out.writeInt(CodecUtil.CODEC_MAGIC);
    out.writeString(codec);
    out.writeInt(version);
    out.writeBytes(expectedID, 0, expectedID.length);
    out.writeByte((byte) suffixLength);
    out.writeBytes(suffixBytes, 0, suffixLength);
  }


  /** Retrieves the full index header from the provided {@link IndexInput}.
   *  This throws {@link CorruptIndexException} if this file does
   * not appear to be an index file. */
  public static byte[] readIndexHeader(IndexInput in) throws IOException {
    in.seek(0);
    final int actualHeader = in.readInt();
    if (actualHeader != CODEC_MAGIC) {
      throw new CorruptIndexException("codec header mismatch: actual header=" + actualHeader + " vs expected header=" + CODEC_MAGIC, in);
    }
    String codec = in.readString();
    in.readInt();
    in.seek(in.getFilePointer() + StringHelper.ID_LENGTH);
    int suffixLength = in.readByte() & 0xFF;
    byte[] bytes = new byte[headerLength(codec) + StringHelper.ID_LENGTH + 1 + suffixLength];
    in.seek(0);
    in.readBytes(bytes, 0, bytes.length);
    return bytes;
  }

  /** Retrieves the full footer from the provided {@link IndexInput}.  This throws
   *  {@link CorruptIndexException} if this file does not have a valid footer. */
  public static byte[] readFooter(IndexInput in) throws IOException {
    if (in.length() < footerLength()) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in);
    }
    in.seek(in.length() - footerLength());
    validateFooter(in);
    in.seek(in.length() - footerLength());
    byte[] bytes = new byte[footerLength()];
    in.readBytes(bytes, 0, bytes.length);
    return bytes;
  }
  
  /** Expert: just reads and verifies the object ID of an index header */
  public static byte[] checkIndexHeaderID(DataInput in, byte[] expectedID) throws IOException {
    byte id[] = new byte[StringHelper.ID_LENGTH];
    in.readBytes(id, 0, id.length);
    if (!Arrays.equals(id, expectedID)) {
      throw new CorruptIndexException("file mismatch, expected id=" + StringHelper.idToString(expectedID) 
                                                         + ", got=" + StringHelper.idToString(id), in);
    }
    return id;
  }
  
  /** Expert: just reads and verifies the suffix of an index header */
  public static String checkIndexHeaderSuffix(DataInput in, String expectedSuffix) throws IOException {
    int suffixLength = in.readByte() & 0xFF;
    byte suffixBytes[] = new byte[suffixLength];
    in.readBytes(suffixBytes, 0, suffixBytes.length);
    String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8);
    if (!suffix.equals(expectedSuffix)) {
      throw new CorruptIndexException("file mismatch, expected suffix=" + expectedSuffix
                                                             + ", got=" + suffix, in);
    }
    return suffix;
  }
  
  /**
   * Writes a codec footer, which records both a checksum
   * algorithm ID and a checksum. This footer can
   * be parsed and validated with 
   * {@link #checkFooter(ChecksumIndexInput) checkFooter()}.
   * <p>
   * CodecFooter --> Magic,AlgorithmID,Checksum
   * <ul>
   *    <li>Magic --> {@link DataOutput#writeInt Uint32}. This
   *        identifies the start of the footer. It is always {@value #FOOTER_MAGIC}.
   *    <li>AlgorithmID --> {@link DataOutput#writeInt Uint32}. This
   *        indicates the checksum algorithm used. Currently this is always 0,
   *        for zlib-crc32.
   *    <li>Checksum --> {@link DataOutput#writeLong Uint64}. The
   *        actual checksum value for all previous bytes in the stream, including
   *        the bytes from Magic and AlgorithmID.
   * </ul>
   * 
   * @param out Output stream
   * @throws IOException If there is an I/O error writing to the underlying medium.
   */
  public static void writeFooter(IndexOutput out) throws IOException {
    out.writeInt(FOOTER_MAGIC);
    out.writeInt(0);
    writeCRC(out);
  }
  
  /**
   * Computes the length of a codec footer.
   * 
   * @return length of the entire codec footer.
   * @see #writeFooter(IndexOutput)
   */
  public static int footerLength() {
    return 16;
  }
  
  /** 
   * Validates the codec footer previously written by {@link #writeFooter}. 
   * @return actual checksum value
   * @throws IOException if the footer is invalid, if the checksum does not match, 
   *                     or if {@code in} is not properly positioned before the footer
   *                     at the end of the stream.
   */
  public static long checkFooter(ChecksumIndexInput in) throws IOException {
    validateFooter(in);
    long actualChecksum = in.getChecksum();
    long expectedChecksum = readCRC(in);
    if (expectedChecksum != actualChecksum) {
      throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + Long.toHexString(expectedChecksum) +  
                                                       " actual=" + Long.toHexString(actualChecksum), in);
    }
    return actualChecksum;
  }
  
  /** 
   * Validates the codec footer previously written by {@link #writeFooter}, optionally
   * passing an unexpected exception that has already occurred.
   * <p>
   * When a {@code priorException} is provided, this method will add a suppressed exception 
   * indicating whether the checksum for the stream passes, fails, or cannot be computed, and 
   * rethrow it. Otherwise it behaves the same as {@link #checkFooter(ChecksumIndexInput)}.
   * <p>
   * Example usage:
   * <pre class="prettyprint">
   * try (ChecksumIndexInput input = ...) {
   *   Throwable priorE = null;
   *   try {
   *     // ... read a bunch of stuff ... 
   *   } catch (Throwable exception) {
   *     priorE = exception;
   *   } finally {
   *     CodecUtil.checkFooter(input, priorE);
   *   }
   * }
   * </pre>
   */
  public static void checkFooter(ChecksumIndexInput in, Throwable priorException) throws IOException {
    if (priorException == null) {
      checkFooter(in);
    } else {
      try {
        long remaining = in.length() - in.getFilePointer();
        if (remaining < footerLength()) {
          // corruption caused us to read into the checksum footer already: we can't proceed
          priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: remaining=" + remaining +
                                                                 ", please run checkindex for more details", in));
        } else {
          // otherwise, skip any unread bytes.
          in.skipBytes(remaining - footerLength());
          
          // now check the footer
          try {
            long checksum = checkFooter(in);
            priorException.addSuppressed(new CorruptIndexException("checksum passed (" + Long.toHexString(checksum) + 
                                                                   "). possibly transient resource issue, or a Lucene or JVM bug", in));
          } catch (CorruptIndexException t) {
            priorException.addSuppressed(t);
          }
        }
      } catch (Throwable t) {
        // catch-all for things that shouldn't go wrong (e.g. OOM during readInt) but could...
        priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: unexpected exception", in, t));
      }
      throw IOUtils.rethrowAlways(priorException);
    }
  }
  
  /** 
   * Returns (but does not validate) the checksum previously written by {@link #checkFooter}.
   * @return actual checksum value
   * @throws IOException if the footer is invalid
   */
  public static long retrieveChecksum(IndexInput in) throws IOException {
    if (in.length() < footerLength()) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in);
    }
    in.seek(in.length() - footerLength());
    validateFooter(in);
    return readCRC(in);
  }
  
  private static void validateFooter(IndexInput in) throws IOException {
    long remaining = in.length() - in.getFilePointer();
    long expected = footerLength();
    if (remaining < expected) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in);
    } else if (remaining > expected) {
      throw new CorruptIndexException("misplaced codec footer (file extended?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in);
    }
    
    final int magic = in.readInt();
    if (magic != FOOTER_MAGIC) {
      throw new CorruptIndexException("codec footer mismatch (file truncated?): actual footer=" + magic + " vs expected footer=" + FOOTER_MAGIC, in);
    }
    
    final int algorithmID = in.readInt();
    if (algorithmID != 0) {
      throw new CorruptIndexException("codec footer mismatch: unknown algorithmID: " + algorithmID, in);
    }
  }
  
  /** 
   * Clones the provided input, reads all bytes from the file, and calls {@link #checkFooter} 
   * <p>
   * Note that this method may be slow, as it must process the entire file.
   * If you just need to extract the checksum value, call {@link #retrieveChecksum}.
   */
  public static long checksumEntireFile(IndexInput input) throws IOException {
    IndexInput clone = input.clone();
    clone.seek(0);
    ChecksumIndexInput in = new BufferedChecksumIndexInput(clone);
    assert in.getFilePointer() == 0;
    if (in.length() < footerLength()) {
      throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), input);
    }
    in.seek(in.length() - footerLength());
    return checkFooter(in);
  }
  
  /**
   * Reads CRC32 value as a 64-bit long from the input.
   * @throws CorruptIndexException if CRC is formatted incorrectly (wrong bits set)
   * @throws IOException if an i/o error occurs
   */
  static long readCRC(IndexInput input) throws IOException {
    long value = input.readLong();
    if ((value & 0xFFFFFFFF00000000L) != 0) {
      throw new CorruptIndexException("Illegal CRC-32 checksum: " + value, input);
    }
    return value;
  }
  
  /**
   * Writes CRC32 value as a 64-bit long to the output.
   * @throws IllegalStateException if CRC is formatted incorrectly (wrong bits set)
   * @throws IOException if an i/o error occurs
   */
  static void writeCRC(IndexOutput output) throws IOException {
    long value = output.getChecksum();
    if ((value & 0xFFFFFFFF00000000L) != 0) {
      throw new IllegalStateException("Illegal CRC-32 checksum: " + value + " (resource=" + output + ")");
    }
    output.writeLong(value);
  }
}