/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.update;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.lang.invoke.MethodHandles;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.DataInputInputStream;
import org.apache.solr.common.util.FastInputStream;
import org.apache.solr.common.util.FastOutputStream;
import org.apache.solr.common.util.JavaBinCodec;
import org.apache.solr.common.util.ObjectReleaseTracker;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Log Format: List{Operation, Version, ...}
 * ADD, VERSION, DOC
 * DELETE, VERSION, ID_BYTES
 * DELETE_BY_QUERY, VERSION, String
 *
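 * Each record is a JavaBin-encoded value followed by a 4-byte length suffix
 * (see {@code endRecord}), so a log file looks, schematically, like:
 *   [header map][size] [record][size] ... [COMMIT record ending in END_MESSAGE][size]
 * The trailing sizes are what allow the log to be read backwards (see FSReverseReader).
 *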
 * TODO: keep two files, one for [operation, version, id] and the other for the actual
 * document data.  That way we could throw away document log files more readily
 * while retaining the smaller operation log files longer (and we can retrieve
 * the stored fields from the latest documents from the index).
 *
 * This would require keeping all source fields stored, of course.
 *
 * This would also allow us to not log document data for requests with commit=true
 * in them (since we know that if the request succeeds, all docs will be committed).
 */
public class TransactionLog implements Closeable {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
  private boolean debug = log.isDebugEnabled();
  private boolean trace = log.isTraceEnabled();

  public final static String END_MESSAGE = "SOLR_TLOG_END";

  long id;
  File tlogFile;
  RandomAccessFile raf;
  FileChannel channel;
  OutputStream os;
  FastOutputStream fos;  // all accesses to this stream should be synchronized on "this" (the TransactionLog)
  int numRecords;

  protected volatile boolean deleteOnClose = true;  // we can delete old tlogs since they are currently only used for real-time-get (and in the future, recovery)

  AtomicInteger refcount = new AtomicInteger(1);
  Map<String,Integer> globalStringMap = new HashMap<>();
  List<String> globalStringList = new ArrayList<>();

  long snapshot_size;
  int snapshot_numRecords;

  // write a BytesRef as a byte array
  static final JavaBinCodec.ObjectResolver resolver = new JavaBinCodec.ObjectResolver() {
    @Override
    public Object resolve(Object o, JavaBinCodec codec) throws IOException {
      if (o instanceof BytesRef) {
        BytesRef br = (BytesRef)o;
        codec.writeByteArray(br.bytes, br.offset, br.length);
        return null;
      }
      // Fallback: we have no idea how to serialize this.  Be noisy to prevent insidious bugs.
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "TransactionLog doesn't know how to serialize " + o.getClass() + "; try implementing ObjectResolver?");
    }
  };

  public class LogCodec extends JavaBinCodec {
    public LogCodec(JavaBinCodec.ObjectResolver resolver) {
      super(resolver);
    }

    @Override
    public void writeExternString(String s) throws IOException {
      if (s == null) {
        writeTag(NULL);
        return;
      }

      // no need to synchronize globalStringMap - it's only updated before the first record is written to the log
      Integer idx = globalStringMap.get(s);
      if (idx == null) {
        // write a normal string
        writeStr(s);
      } else {
        // write the extern string
        writeTag(EXTERN_STRING, idx);
      }
    }

    @Override
    public String readExternString(DataInputInputStream fis) throws IOException {
      int idx = readSize(fis);
      if (idx != 0) {  // idx != 0 is the index of the extern string
        // no need to synchronize globalStringList - it's only updated before the first record is written to the log
        return globalStringList.get(idx - 1);
      } else {  // idx == 0 means it has a string value
        // this shouldn't happen with this codec subclass
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Corrupt transaction log");
      }
    }
  }
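  // How the extern-string dictionary above works, schematically: field names passed to
  // addGlobalStrings() are serialized once in the header's "strings" list, and later
  // records reference them by 1-based index via an EXTERN_STRING tag, e.g.
  //   header:  {SOLR_TLOG=1, strings=["id", "title", ...]}
  //   record:  a doc whose "id" field name is encoded as EXTERN_STRING(1)
  // so repeated field names aren't re-written for every document.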
  TransactionLog(File tlogFile, Collection<String> globalStrings) {
    this(tlogFile, globalStrings, false);
  }

  TransactionLog(File tlogFile, Collection<String> globalStrings, boolean openExisting) {
    boolean success = false;
    try {
      if (debug) {
        log.debug("New TransactionLog file=" + tlogFile + ", exists=" + tlogFile.exists()
            + ", size=" + tlogFile.length() + ", openExisting=" + openExisting);
      }

      // Parse tlog id from the filename
      String filename = tlogFile.getName();
      id = Long.parseLong(filename.substring(filename.indexOf('.') + 1, filename.indexOf('.') + 20));

      this.tlogFile = tlogFile;
      raf = new RandomAccessFile(this.tlogFile, "rw");
      long start = raf.length();
      channel = raf.getChannel();
      os = Channels.newOutputStream(channel);
      fos = new FastOutputStream(os, new byte[65536], 0);
      // fos = FastOutputStream.wrap(os);

      if (openExisting) {
        if (start > 0) {
          readHeader(null);
          raf.seek(start);
          assert channel.position() == start;
          fos.setWritten(start);  // reflect that we aren't starting at the beginning
          assert fos.size() == channel.size();
        } else {
          addGlobalStrings(globalStrings);
        }
      } else {
        if (start > 0) {
          log.warn("New transaction log already exists: " + tlogFile + " size=" + raf.length());
          return;
        }
        addGlobalStrings(globalStrings);
      }

      success = true;

      assert ObjectReleaseTracker.track(this);

    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    } finally {
      if (!success && raf != null) {
        try {
          raf.close();
        } catch (Exception e) {
          log.error("Error closing tlog file (after error opening)", e);
        }
      }
    }
  }

  // for subclasses
  protected TransactionLog() {}

  /** Returns the number of records in the log (currently includes the header and an optional commit).
   * Note: currently returns 0 for reopened existing log files.
   */
  public int numRecords() {
    synchronized (this) {
      return this.numRecords;
    }
  }

  public boolean endsWithCommit() throws IOException {
    long size;
    synchronized (this) {
      fos.flush();
      size = fos.size();
    }

    // the end of the file should have the end message (added during a commit) plus a 4 byte size
    byte[] buf = new byte[ END_MESSAGE.length() ];
    long pos = size - END_MESSAGE.length() - 4;
    if (pos < 0) return false;
    ChannelFastInputStream is = new ChannelFastInputStream(channel, pos);
    is.read(buf);
    for (int i = 0; i < buf.length; i++) {
      if (buf[i] != END_MESSAGE.charAt(i)) return false;
    }
    return true;
  }

  /** takes a snapshot of the current position and number of records
   * for later possible rollback, and returns the position */
  public long snapshot() {
    synchronized (this) {
      snapshot_size = fos.size();
      snapshot_numRecords = numRecords;
      return snapshot_size;
    }
  }

  // This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup.
  // This should only be used to roll back buffered updates, not actually applied updates.
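  // A rough usage sketch (hypothetical caller), per the comment above:
  //   long pos = tlog.snapshot();   // remember position + record count
  //   ... write buffered updates ...
  //   tlog.rollback(pos);           // on failure, truncate back to the snapshot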
  public void rollback(long pos) throws IOException {
    synchronized (this) {
      assert snapshot_size == pos;
      fos.flush();
      raf.setLength(pos);
      fos.setWritten(pos);
      assert fos.size() == pos;
      numRecords = snapshot_numRecords;
    }
  }

  public long writeData(Object o) {
    LogCodec codec = new LogCodec(resolver);
    try {
      long pos = fos.size();  // if we had flushed, this should be equal to channel.position()
      codec.init(fos);
      codec.writeVal(o);
      return pos;
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  private void readHeader(FastInputStream fis) throws IOException {
    // read existing header
    fis = fis != null ? fis : new ChannelFastInputStream(channel, 0);
    LogCodec codec = new LogCodec(resolver);
    Map header = (Map)codec.unmarshal(fis);

    fis.readInt();  // skip size

    // needed to read other records
    synchronized (this) {
      globalStringList = (List<String>)header.get("strings");
      globalStringMap = new HashMap<>(globalStringList.size());
      for (int i = 0; i < globalStringList.size(); i++) {
        globalStringMap.put(globalStringList.get(i), i + 1);
      }
    }
  }

  protected void addGlobalStrings(Collection<String> strings) {
    if (strings == null) return;
    int origSize = globalStringMap.size();
    for (String s : strings) {
      Integer idx = null;
      if (origSize > 0) {
        idx = globalStringMap.get(s);
      }
      if (idx != null) continue;  // already in list
      globalStringList.add(s);
      globalStringMap.put(s, globalStringList.size());
    }
    assert globalStringMap.size() == globalStringList.size();
  }

  Collection<String> getGlobalStrings() {
    synchronized (this) {
      return new ArrayList<>(globalStringList);
    }
  }

  protected void writeLogHeader(LogCodec codec) throws IOException {
    long pos = fos.size();
    assert pos == 0;

    Map<String,Object> header = new LinkedHashMap<>();
    header.put("SOLR_TLOG", 1);  // a magic string + version number
    header.put("strings", globalStringList);
    codec.marshal(header, fos);

    endRecord(pos);
  }

  protected void endRecord(long startRecordPosition) throws IOException {
    fos.writeInt((int)(fos.size() - startRecordPosition));
    numRecords++;
  }

  private void checkWriteHeader(LogCodec codec, SolrInputDocument optional) throws IOException {
    // Unsynchronized access.  We can get away with an unsynchronized access here
    // since we will never get a false non-zero when the position is in fact 0.
    // rollback() is the only function that can reset to zero, and it blocks updates.
    if (fos.size() != 0) return;

    synchronized (this) {
      if (fos.size() != 0) return;  // check again while synchronized
      if (optional != null) {
        addGlobalStrings(optional.getFieldNames());
      }
      writeLogHeader(codec);
    }
  }

  int lastAddSize;

  /**
   * Writes an add update command to the transaction log.  This is not applicable for
   * in-place updates; use {@link #write(AddUpdateCommand, long, int)}.
   * (The previous pointer (applicable for in-place updates) is set to -1 while writing
   * the command to the transaction log.)
   * @param cmd The add update command to be written
   * @param flags Options for writing the command to the transaction log
   * @return Returns the position pointer of the written update command
   *
   * @see #write(AddUpdateCommand, long, int)
   */
  public long write(AddUpdateCommand cmd, int flags) {
    return write(cmd, -1, flags);
  }

  /**
   * Writes an add update command to the transaction log.  This handles both regular
   * adds and in-place updates; when the command is not an in-place update, pass -1
   * as the prevPointer.
   *
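   * The on-disk record layout written by this method is, schematically:
   *   in-place update: [ARR(5), UPDATE_INPLACE|flags, version, prevPointer, prevVersion, doc]
   *   regular add:     [ARR(3), ADD|flags, version, doc]
   *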
   * @param cmd The add update command to be written
   * @param prevPointer The pointer in the transaction log which this update depends
   *                    on (applicable for in-place updates)
   * @param flags Options for writing the command to the transaction log
   * @return Returns the position pointer of the written update command
   */
  public long write(AddUpdateCommand cmd, long prevPointer, int flags) {
    assert (-1 <= prevPointer && (cmd.isInPlaceUpdate() || (-1 == prevPointer)));

    LogCodec codec = new LogCodec(resolver);
    SolrInputDocument sdoc = cmd.getSolrInputDocument();

    try {
      checkWriteHeader(codec, sdoc);

      // adaptive buffer sizing
      int bufSize = lastAddSize;  // unsynchronized access of lastAddSize should be fine
      bufSize = Math.min(1024*1024, bufSize + (bufSize >> 3) + 256);

      MemOutputStream out = new MemOutputStream(new byte[bufSize]);
      codec.init(out);
      if (cmd.isInPlaceUpdate()) {
        codec.writeTag(JavaBinCodec.ARR, 5);
        codec.writeInt(UpdateLog.UPDATE_INPLACE | flags);  // should just take one byte
        codec.writeLong(cmd.getVersion());
        codec.writeLong(prevPointer);
        codec.writeLong(cmd.prevVersion);
        codec.writeSolrInputDocument(cmd.getSolrInputDocument());
      } else {
        codec.writeTag(JavaBinCodec.ARR, 3);
        codec.writeInt(UpdateLog.ADD | flags);  // should just take one byte
        codec.writeLong(cmd.getVersion());
        codec.writeSolrInputDocument(cmd.getSolrInputDocument());
      }
      lastAddSize = (int)out.size();

      synchronized (this) {
        long pos = fos.size();  // if we had flushed, this should be equal to channel.position()
        assert pos != 0;

        /***
        System.out.println("###writing at " + pos + " fos.size()=" + fos.size() + " raf.length()=" + raf.length());
        if (pos != fos.size()) {
          throw new RuntimeException("ERROR" + "###writing at " + pos + " fos.size()=" + fos.size() + " raf.length()=" + raf.length());
        }
        ***/

        out.writeAll(fos);
        endRecord(pos);
        // fos.flushBuffer();  // flush later
        return pos;
      }

    } catch (IOException e) {
      // TODO: reset our file pointer back to "pos", the start of this record.
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error logging add", e);
    }
  }
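  // Note on the pattern used by write/writeDelete/writeDeleteByQuery: each record is
  // serialized into an in-memory MemOutputStream outside the synchronized block, so
  // only the final copy into fos (plus the position/record-count bookkeeping) happens
  // while holding the lock.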
  public long writeDelete(DeleteUpdateCommand cmd, int flags) {
    LogCodec codec = new LogCodec(resolver);

    try {
      checkWriteHeader(codec, null);

      BytesRef br = cmd.getIndexedId();

      MemOutputStream out = new MemOutputStream(new byte[20 + br.length]);
      codec.init(out);
      codec.writeTag(JavaBinCodec.ARR, 3);
      codec.writeInt(UpdateLog.DELETE | flags);  // should just take one byte
      codec.writeLong(cmd.getVersion());
      codec.writeByteArray(br.bytes, br.offset, br.length);

      synchronized (this) {
        long pos = fos.size();  // if we had flushed, this should be equal to channel.position()
        assert pos != 0;
        out.writeAll(fos);
        endRecord(pos);
        // fos.flushBuffer();  // flush later
        return pos;
      }

    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  public long writeDeleteByQuery(DeleteUpdateCommand cmd, int flags) {
    LogCodec codec = new LogCodec(resolver);
    try {
      checkWriteHeader(codec, null);

      MemOutputStream out = new MemOutputStream(new byte[20 + cmd.query.length()]);
      codec.init(out);
      codec.writeTag(JavaBinCodec.ARR, 3);
      codec.writeInt(UpdateLog.DELETE_BY_QUERY | flags);  // should just take one byte
      codec.writeLong(cmd.getVersion());
      codec.writeStr(cmd.query);

      synchronized (this) {
        long pos = fos.size();  // if we had flushed, this should be equal to channel.position()
        out.writeAll(fos);
        endRecord(pos);
        // fos.flushBuffer();  // flush later
        return pos;
      }

    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  public long writeCommit(CommitUpdateCommand cmd, int flags) {
    LogCodec codec = new LogCodec(resolver);
    synchronized (this) {
      try {
        long pos = fos.size();  // if we had flushed, this should be equal to channel.position()

        if (pos == 0) {
          writeLogHeader(codec);
          pos = fos.size();
        }
        codec.init(fos);
        codec.writeTag(JavaBinCodec.ARR, 3);
        codec.writeInt(UpdateLog.COMMIT | flags);  // should just take one byte
        codec.writeLong(cmd.getVersion());
        codec.writeStr(END_MESSAGE);  // ensure these bytes are (almost) last in the file

        endRecord(pos);

        fos.flush();  // flush since this will be the last record in a log file
        assert fos.size() == channel.size();

        return pos;
      } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
  }
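  // A rough usage sketch (hypothetical caller): real-time get keeps the position
  // returned by write()/writeDelete() and later reads the record back, e.g.
  //   long pos = tlog.write(cmd, flags);
  //   ...
  //   List entry = (List) tlog.lookup(pos);  // [op|flags, version, ...]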
  /* This method is thread safe */
  public Object lookup(long pos) {
    // A negative position can result from a log replay (which does not re-log, but does
    // update the version map).  This is OK since the node won't be ACTIVE when this happens.
    if (pos < 0) return null;

    try {
      // make sure any unflushed buffer has been flushed
      synchronized (this) {
        // TODO: optimize this by keeping track of what we have flushed up to
        fos.flushBuffer();
        /***
        System.out.println("###flushBuffer to " + fos.size() + " raf.length()=" + raf.length() + " pos="+pos);
        if (fos.size() != raf.length() || pos >= fos.size() ) {
          throw new RuntimeException("ERROR" + "###flushBuffer to " + fos.size() + " raf.length()=" + raf.length() + " pos="+pos);
        }
        ***/
      }

      ChannelFastInputStream fis = new ChannelFastInputStream(channel, pos);
      LogCodec codec = new LogCodec(resolver);
      return codec.readVal(fis);
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  public void incref() {
    int result = refcount.incrementAndGet();
    if (result <= 1) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "incref on a closed log: " + this);
    }
  }

  public boolean try_incref() {
    return refcount.incrementAndGet() > 1;
  }

  public void decref() {
    if (refcount.decrementAndGet() == 0) {
      close();
    }
  }

  /** returns the current position in the log file */
  public long position() {
    synchronized (this) {
      return fos.size();
    }
  }

  /** Move to a read-only state, closing and releasing resources while keeping the log available for reads */
  public void closeOutput() {
  }

  public void finish(UpdateLog.SyncLevel syncLevel) {
    if (syncLevel == UpdateLog.SyncLevel.NONE) return;

    try {
      synchronized (this) {
        fos.flushBuffer();
      }

      if (syncLevel == UpdateLog.SyncLevel.FSYNC) {
        // Since fsync is outside of synchronized block, we can end up with a partial
        // last record on power failure (which is OK, and does not represent an error...
        // we just need to be aware of it when reading).
        raf.getFD().sync();
      }

    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  public void close() {
    try {
      if (debug) {
        log.debug("Closing tlog " + this);
      }

      synchronized (this) {
        fos.flush();
        fos.close();
      }

      if (deleteOnClose) {
        try {
          Files.deleteIfExists(tlogFile.toPath());
        } catch (IOException e) {
          // TODO: should this class care if a file couldn't be deleted?
          // this just emulates previous behavior, where only SecurityException would be handled.
        }
      }
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    } finally {
      assert ObjectReleaseTracker.release(this);
    }
  }

  public void forceClose() {
    if (refcount.get() > 0) {
      log.error("Error: Forcing close of " + this);
      refcount.set(0);
      close();
    }
  }

  @Override
  public String toString() {
    return "tlog{file=" + tlogFile.toString() + " refcount=" + refcount.get() + "}";
  }

  public long getLogSize() {
    if (tlogFile != null) {
      return tlogFile.length();
    }
    return 0;
  }

  /** Returns a reader that can be used while a log is still in use.
   * Currently only *one* LogReader may be outstanding, and that log may only
   * be used from a single thread.
   */
  public LogReader getReader(long startingPos) {
    return new LogReader(startingPos);
  }

  public LogReader getSortedReader(long startingPos) {
    return new SortedLogReader(startingPos);
  }

  /** Returns a single threaded reverse reader */
  public ReverseReader getReverseReader() throws IOException {
    return new FSReverseReader();
  }

  public class LogReader {
    protected ChannelFastInputStream fis;
    private LogCodec codec = new LogCodec(resolver);

    public LogReader(long startingPos) {
      incref();
      fis = new ChannelFastInputStream(channel, startingPos);
    }

    // for classes that extend
    protected LogReader() {}
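    // A rough usage sketch (hypothetical caller), e.g. for replaying a log:
    //   TransactionLog.LogReader reader = tlog.getReader(0);
    //   try {
    //     Object rec;
    //     while ((rec = reader.next()) != null) { /* apply the record */ }
    //   } finally {
    //     reader.close();  // releases the reference taken by the constructor
    //   }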
    /** Returns the next object from the log, or null if none available.
     *
     * @return The log record, or null if EOF
     * @throws IOException If there is a low-level I/O error.
     */
    public Object next() throws IOException, InterruptedException {
      long pos = fis.position();

      synchronized (TransactionLog.this) {
        if (trace) {
          log.trace("Reading log record.  pos=" + pos + " currentSize=" + fos.size());
        }

        if (pos >= fos.size()) {
          return null;
        }

        fos.flushBuffer();
      }

      if (pos == 0) {
        readHeader(fis);

        // shouldn't currently happen - header and first record are currently written at the same time
        synchronized (TransactionLog.this) {
          if (fis.position() >= fos.size()) {
            return null;
          }
          pos = fis.position();
        }
      }

      Object o = codec.readVal(fis);

      // skip over record size
      int size = fis.readInt();
      assert size == fis.position() - pos - 4;

      return o;
    }

    public void close() {
      decref();
    }

    @Override
    public String toString() {
      synchronized (TransactionLog.this) {
        return "LogReader{" + "file=" + tlogFile + ", position=" + fis.position() + ", end=" + fos.size() + "}";
      }
    }

    // returns best effort current position
    // for info purposes
    public long currentPos() {
      return fis.position();
    }

    // returns best effort current size
    // for info purposes
    public long currentSize() throws IOException {
      return channel.size();
    }
  }

  public class SortedLogReader extends LogReader {
    private long startingPos;
    private boolean inOrder = true;
    private TreeMap<Long, Long> versionToPos;
    Iterator<Long> iterator;

    public SortedLogReader(long startingPos) {
      super(startingPos);
      this.startingPos = startingPos;
    }

    @Override
    public Object next() throws IOException, InterruptedException {
      if (versionToPos == null) {
        versionToPos = new TreeMap<>();
        Object o;
        long pos = startingPos;

        long lastVersion = Long.MIN_VALUE;
        while ( (o = super.next()) != null) {
          List entry = (List) o;
          long version = (Long) entry.get(UpdateLog.VERSION_IDX);
          version = Math.abs(version);
          versionToPos.put(version, pos);
          pos = currentPos();

          if (version < lastVersion) inOrder = false;
          lastVersion = version;
        }
        fis.seek(startingPos);
      }

      if (inOrder) {
        return super.next();
      } else {
        if (iterator == null) iterator = versionToPos.values().iterator();
        if (!iterator.hasNext()) return null;
        long pos = iterator.next();
        if (pos != currentPos()) {
          fis.seek(pos);
        }
        return super.next();
      }
    }
  }

  public abstract class ReverseReader {

    /** Returns the next object from the log, or null if none available.
     *
     * @return The log record, or null if EOF
     * @throws IOException If there is a low-level I/O error.
     */
    public abstract Object next() throws IOException;

    /* returns the position in the log file of the last record returned by next() */
    public abstract long position();

    public abstract void close();

    @Override
    public abstract String toString();
  }
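  // FSReverseReader below walks the log backwards using the 4-byte length suffix that
  // endRecord() writes after every record: read the int at (end - 4) to find the length
  // of the last record, seek to that record's start, and repeat.  Schematically:
  //   ... [record N-1][size N-1] [record N][size N] <EOF>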
  public class FSReverseReader extends ReverseReader {
    ChannelFastInputStream fis;
    private LogCodec codec = new LogCodec(resolver) {
      @Override
      public SolrInputDocument readSolrInputDocument(DataInputInputStream dis) {
        // Given that the SolrInputDocument is last in an add record, it's OK to just skip
        // reading it completely.
        return null;
      }
    };

    int nextLength;  // length of the next record (the next one closer to the start of the log file)
    long prevPos;    // where we started reading from last time (so prevPos - nextLength == start of next record)

    public FSReverseReader() throws IOException {
      incref();

      long sz;
      synchronized (TransactionLog.this) {
        fos.flushBuffer();
        sz = fos.size();
        assert sz == channel.size();
      }

      fis = new ChannelFastInputStream(channel, 0);
      if (sz >= 4) {
        // readHeader(fis);  // should not be needed
        prevPos = sz - 4;
        fis.seek(prevPos);
        nextLength = fis.readInt();
      }
    }

    /** Returns the next object from the log, or null if none available.
     *
     * @return The log record, or null if EOF
     * @throws IOException If there is a low-level I/O error.
     */
    public Object next() throws IOException {
      if (prevPos <= 0) return null;

      long endOfThisRecord = prevPos;

      int thisLength = nextLength;

      long recordStart = prevPos - thisLength;  // back up to the beginning of the next record
      prevPos = recordStart - 4;  // back up 4 more to read the length of the next record

      if (prevPos <= 0) return null;  // this record is the header

      long bufferPos = fis.getBufferPos();
      if (prevPos >= bufferPos) {
        // nothing to do... we're within the current buffer
      } else {
        // Position buffer so that this record is at the end.
        // For small records, this will cause subsequent calls to next() to be within the buffer.
        long seekPos = endOfThisRecord - fis.getBufferSize();
        seekPos = Math.min(seekPos, prevPos);  // seek to the start of the record if it's larger than the block size
        seekPos = Math.max(seekPos, 0);
        fis.seek(seekPos);
        fis.peek();  // cause buffer to be filled
      }

      fis.seek(prevPos);
      nextLength = fis.readInt();  // this is the length of the *next* record (i.e. closer to the beginning)

      // TODO: optionally skip document data
      Object o = codec.readVal(fis);

      // assert fis.position() == prevPos + 4 + thisLength;  // this is only true if we read all the data (and we currently skip reading the SolrInputDocument)

      return o;
    }

    /* returns the position in the log file of the last record returned by next() */
    public long position() {
      return prevPos + 4;  // skip the length
    }

    public void close() {
      decref();
    }

    @Override
    public String toString() {
      synchronized (TransactionLog.this) {
        return "FSReverseReader{" + "file=" + tlogFile + ", position=" + fis.position() + ", end=" + fos.size() + "}";
      }
    }
  }
}

class ChannelFastInputStream extends FastInputStream {
  private FileChannel ch;

  public ChannelFastInputStream(FileChannel ch, long chPosition) {
    // super(null, new byte[10],0,0);  // a small buffer size for testing purposes
    super(null);
    this.ch = ch;
    super.readFromStream = chPosition;
  }

  @Override
  public int readWrappedStream(byte[] target, int offset, int len) throws IOException {
    ByteBuffer bb = ByteBuffer.wrap(target, offset, len);
    int ret = ch.read(bb, readFromStream);
    return ret;
  }

  public void seek(long position) throws IOException {
    if (position <= readFromStream && position >= getBufferPos()) {
      // seek within buffer
      pos = (int)(position - getBufferPos());
    } else {
      // long currSize = ch.size();   // not needed - underlying read should handle (unless read never done)
      // if (position > currSize) throw new EOFException("Read past EOF: seeking to " + position + " on file of size " + currSize + " file=" + ch);
      readFromStream = position;
      end = pos = 0;
    }
    assert position() == position;
  }

  /** where is the start of the buffer relative to the whole file */
  public long getBufferPos() {
    return readFromStream - end;
  }

  public int getBufferSize() {
    return buf.length;
  }
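  // Note: readWrappedStream uses the positional FileChannel.read(ByteBuffer, long)
  // variant, which does not advance the channel's own position; that is what lets
  // multiple readers share the channel with the writer without disturbing each other.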
  @Override
  public void close() throws IOException {
    ch.close();
  }

  @Override
  public String toString() {
    return "readFromStream=" + readFromStream + " pos=" + pos + " end=" + end
        + " bufferPos=" + getBufferPos() + " position=" + position();
  }
}