package com.orientechnologies.orient.core.storage.impl.local.paginated.wal;
import com.orientechnologies.common.directmemory.OByteBufferPool;
import com.orientechnologies.common.exception.OException;
import com.orientechnologies.common.io.OFileUtils;
import com.orientechnologies.common.log.OLogManager;
import com.orientechnologies.common.serialization.types.OByteSerializer;
import com.orientechnologies.common.serialization.types.OIntegerSerializer;
import com.orientechnologies.common.serialization.types.OLongSerializer;
import com.orientechnologies.common.util.OPair;
import com.orientechnologies.orient.core.config.OGlobalConfiguration;
import com.orientechnologies.orient.core.exception.OStorageException;
import com.orientechnologies.orient.core.storage.impl.local.statistic.OPerformanceStatisticManager;
import com.orientechnologies.orient.core.storage.impl.local.statistic.OSessionStoragePerformanceStatistic;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.lang.ref.WeakReference;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.CRC32;
final class OLogSegment implements Comparable<OLogSegment> {
/**
* Write ahead log this segment belongs to. The flush task reports the last flushed LSN to it and asks it
* to check free space after every flush attempt.
*/
private ODiskWriteAheadLog writeAheadLog;
/**
* File which contains the WAL segment data.
* It is <code>null</code> by default and is opened on request.
* <p>
* When the file is opened, a timer is scheduled which closes it again if it is not accessed
* during {@link #fileTTL} seconds. While {@link #preventAutoClose} is set the timer keeps running
* but does not close the file.
* <p>
* This field is not supposed to be accessed directly, please use {@link #getRndFile()} instead.
*
* @see #closer
* @see #fileTTL
*/
private RandomAccessFile rndFile;
/**
* Lock which protects {@link #rndFile} access. Any time you call {@link #getRndFile()} you should also
* hold this lock.
*/
private final Lock fileLock = new ReentrantLock();
/**
* Flag which indicates whether the auto close timer is started.
* This flag is used to guarantee that one and only one instance of the auto close timer is active at any moment.
*
* @see #rndFile
*/
private final AtomicBoolean autoCloseInProgress = new AtomicBoolean();
/**
* Flag which, when set, prevents the auto close timer from closing the file. The timer itself
* is not stopped.
*
* @see #rndFile
*/
private volatile boolean preventAutoClose = false;
/**
* Location of the segment on disk; the segment order is parsed from its name.
*/
private final File file;
/**
* Flag which tells the auto close timer to close the file on its next run.
* It is set to <code>true</code> when the timer is scheduled and on every timer run which does not close the file,
* and it is reset to <code>false</code> by {@link #getRndFile()} each time an already opened file is accessed.
* So if the timer still finds it set, the file was not accessed since the previous timer run.
*/
private volatile boolean closeNextTime;
/**
* If {@link #rndFile} is not accessed during this interval (in seconds) it will be closed by the timer.
*
* @see #rndFile
*/
private final int fileTTL;
/**
* Scheduler which is used to run the timer that closes the file if it is not accessed during
* {@link #fileTTL} seconds.
*
* @see #rndFile
*/
private final ScheduledExecutorService closer;
/**
* Order (sequence number) of this segment, parsed from the file name. Used as the segment part of every LSN
* created by this class and as the key for comparison, equality and hashing.
*/
private final long order;
/**
* Maximum amount of not yet written WAL pages; once it is exceeded {@link #logRecord(byte[])} performs
* a synchronous flush.
*/
private final int maxPagesCacheSize;
private final OPerformanceStatisticManager performanceStatisticManager;
/**
* Lock which protects modifications and replacement of {@link #logCache}.
*/
protected final Lock cacheLock = new ReentrantLock();
/**
* Records which were logged but are not written to the file yet. The flush task replaces this list
* with a new empty one and writes the content of the old one.
*/
private volatile List<OLogRecord> logCache = new ArrayList<OLogRecord>();
/**
* Executor which runs the periodic and the explicitly requested flush tasks.
*/
private final ScheduledExecutorService commitExecutor;
/**
* Position inside of the segment right after the last logged record (including records which are still in cache).
*/
private volatile long filledUpTo;
/**
* Position inside of the segment up to which data were written to the file by the flush task.
*/
private volatile long writtenUpTo;
/**
* Set once {@link #close(boolean)} has completed, subsequent calls of it do nothing.
*/
private boolean closed;
/**
* LSN returned by {@link #end()}: start of the last logged record or, right after {@link #init(ByteBuffer)},
* the position <code>filledUpTo - 1</code>.
*/
private OLogSequenceNumber last = null;
/**
* Set when a record is logged and cleared by the flush task, which does nothing while the flag is not set.
*/
private volatile boolean flushNewData = true;
/**
* Weakly referenced pair of LSN and content of the record which was returned by the last
* {@link #readRecord(OLogSequenceNumber, ByteBuffer)} call, so repeated reads of the same LSN may skip file access.
*/
private WeakReference<OPair<OLogSequenceNumber, byte[]>> lastReadRecord = new WeakReference<OPair<OLogSequenceNumber, byte[]>>(
null);
/**
 * Task which writes the records accumulated in {@link #logCache} into the pages of the segment file.
 * <p>
 * It is executed periodically by {@link #commitExecutor} (see {@link #startFlush()}) and on demand by
 * {@link #flush()}. Errors of the flush itself are logged and not propagated; free space check of the
 * write ahead log is triggered after every run.
 */
private final class FlushTask implements Runnable {
  /**
   * Page sized direct buffer (native byte order) in which the page that is currently filled is assembled.
   */
  private ThreadLocal<ByteBuffer> pageBuffer = new ThreadLocal<ByteBuffer>() {
    @Override
    protected ByteBuffer initialValue() {
      return ByteBuffer.allocateDirect(OWALPage.PAGE_SIZE).order(ByteOrder.nativeOrder());
    }
  };

  private FlushTask() {
  }

  @Override
  public void run() {
    try {
      try {
        commitLog();
      } catch (Throwable e) {
        // flush is executed in background, so the error is only reported
        OLogManager.instance().error(this, "Error during WAL background flush", e);
      }
    } finally {
      writeAheadLog.checkFreeSpace();
    }
  }

  /**
   * Takes all cached records, splits them into page sized chunks, writes every filled page to the file
   * and reports the LSN of the last written record to the write ahead log.
   *
   * @throws IOException if segment file cannot be read or written
   */
  private void commitLog() throws IOException {
    if (!flushNewData)
      return;

    final OSessionStoragePerformanceStatistic statistic = performanceStatisticManager.getSessionPerformanceStatistic();
    if (statistic != null)
      statistic.startWALFlushTimer();

    try {
      flushNewData = false;

      List<OLogRecord> toFlush;
      // lock is acquired before "try" so "finally" never unlocks a lock which is not held
      cacheLock.lock();
      try {
        if (logCache.isEmpty())
          return;

        // take ownership of the accumulated records and leave an empty cache for writers
        toFlush = logCache;
        logCache = new ArrayList<OLogRecord>();
      } finally {
        cacheLock.unlock();
      }

      if (toFlush.isEmpty())
        return;

      ByteBuffer pageContent = pageBuffer.get();
      pageContent.position(0);

      OLogRecord first = toFlush.get(0);
      int curIndex = (int) (first.writeFrom / OWALPage.PAGE_SIZE);

      fileLock.lock();
      try {
        final RandomAccessFile rndFile = getRndFile();
        long pagesCount = rndFile.length() / OWALPage.PAGE_SIZE;

        // page which receives the first record already exists in the file: load it so its content is preserved
        if (pagesCount > curIndex) {
          final FileChannel channel = rndFile.getChannel();
          // offset is calculated in long, int multiplication overflows for offsets bigger than 2 GB
          channel.read(pageContent, (long) curIndex * OWALPage.PAGE_SIZE);
        }
      } finally {
        fileLock.unlock();
      }

      OLogSequenceNumber lsn = null;
      int pageIndex = 0;
      int pos;
      // true while the page buffer contains data which are not written to the file yet
      boolean lastToFlush = false;
      long lastPos = 0;

      for (OLogRecord log : toFlush) {
        lsn = new OLogSequenceNumber(order, log.writeFrom);
        pos = (int) (log.writeFrom % OWALPage.PAGE_SIZE);
        pageIndex = (int) (log.writeFrom / OWALPage.PAGE_SIZE);

        int written = 0;
        while (written < log.record.length) {
          lastToFlush = true;

          int pageFreeSpace = OWALPage.calculateRecordSize(OWALPage.PAGE_SIZE - pos);
          int contentLength = Math.min(pageFreeSpace, (log.record.length - written));
          int fromRecord = written;
          written += contentLength;

          pos = writeContentInPage(pageContent, pos, log.record, written == log.record.length, fromRecord, contentLength);

          // the rest of the page cannot hold even the smallest record: write the page and continue with the next one
          if (OWALPage.PAGE_SIZE - pos < OWALPage.MIN_RECORD_SIZE) {
            fileLock.lock();
            try {
              final RandomAccessFile rndFile = getRndFile();
              final FileChannel channel = rndFile.getChannel();
              channel.position((long) pageIndex * OWALPage.PAGE_SIZE);
              flushPage(pageContent, channel);
            } finally {
              fileLock.unlock();
            }

            writtenUpTo = (pageIndex + 1L) * OWALPage.PAGE_SIZE - 1;
            lastToFlush = false;

            pageIndex++;
            pos = OWALPage.RECORDS_OFFSET;
          }
        }

        lastPos = log.writeTo;
      }

      // last page is filled only partially, write it too
      if (lastToFlush) {
        fileLock.lock();
        try {
          RandomAccessFile rndFile = getRndFile();
          final FileChannel channel = rndFile.getChannel();
          channel.position((long) pageIndex * OWALPage.PAGE_SIZE);
          flushPage(pageContent, channel);
        } finally {
          fileLock.unlock();
        }

        writtenUpTo = lastPos;
      }

      if (OGlobalConfiguration.WAL_SYNC_ON_PAGE_FLUSH.getValueAsBoolean()) {
        fileLock.lock();
        try {
          final RandomAccessFile rndFile = getRndFile();
          rndFile.getFD().sync();
        } finally {
          fileLock.unlock();
        }
      }

      writeAheadLog.setFlushedLsn(lsn);
    } finally {
      if (statistic != null)
        statistic.stopWALFlushTimer();
    }
  }
}
/**
 * Puts one chunk of a record into the page buffer and returns the position right after it.
 * <p>
 * The chunk is written as two flag bytes (the first one is 1 when this is not the last chunk of the record,
 * the second one is 1 when it is the last chunk), the chunk length as int and the chunk bytes.
 * Free space value of the page is updated to reflect the new position.
 *
 * @param pageContent   buffer of the page to be filled
 * @param posInPage     position in the page where the chunk starts
 * @param log           content of the whole record
 * @param isLast        whether this chunk is the last portion of the record
 * @param fromRecord    offset inside of the record where the chunk starts
 * @param contentLength amount of record bytes to write in this page
 *
 * @return the new page cursor position after this write.
 */
private int writeContentInPage(ByteBuffer pageContent, int posInPage, byte[] log, boolean isLast, int fromRecord,
    int contentLength) {
  final byte continuesFlag = (byte) (isLast ? 0 : 1);
  final byte lastChunkFlag = (byte) (isLast ? 1 : 0);
  final int lengthOffset = posInPage + 2;

  pageContent.put(posInPage, continuesFlag);
  pageContent.put(posInPage + 1, lastChunkFlag);
  pageContent.putInt(lengthOffset, contentLength);

  // chunk bytes follow the length, they are copied by relative put
  pageContent.position(lengthOffset + OIntegerSerializer.INT_SIZE);
  pageContent.put(log, fromRecord, contentLength);

  final int nextPos = posInPage + OWALPage.calculateSerializedSize(contentLength);
  pageContent.putInt(OWALPage.FREE_SPACE_OFFSET, OWALPage.PAGE_SIZE - nextPos);

  return nextPos;
}
/**
 * Finalizes the page (magic number and CRC32) and writes the whole page to the channel at its current position.
 * <p>
 * CRC32 is calculated over <code>PAGE_SIZE - INT_SIZE</code> bytes starting at the magic number offset and
 * is stored as int at offset 0 of the page.
 *
 * @param content page buffer to write
 * @param channel channel of the segment file, already positioned at the beginning of the page
 *
 * @throws IOException if the page cannot be written
 */
private void flushPage(ByteBuffer content, FileChannel channel) throws IOException {
  content.putLong(OWALPage.MAGIC_NUMBER_OFFSET, OWALPage.MAGIC_NUMBER);

  final byte[] data = new byte[OWALPage.PAGE_SIZE - OIntegerSerializer.INT_SIZE];
  content.position(OWALPage.MAGIC_NUMBER_OFFSET);
  content.get(data, 0, data.length);

  final CRC32 crc32 = new CRC32();
  crc32.update(data);
  content.putInt(0, (int) crc32.getValue());

  content.position(0);
  // a single write call is allowed to write only part of the buffer, so repeat till the whole page is written
  while (content.hasRemaining()) {
    channel.write(content);
  }
}
/**
 * Creates a segment for the given file. The file itself is not opened here, it is opened on first request.
 *
 * @param writeAheadLog               write ahead log which owns the segment
 * @param file                        file of the segment, its name has to contain the order of the segment
 * @param fileTTL                     interval in seconds after which a not accessed file is closed
 * @param maxPagesCacheSize           maximum amount of not written pages before synchronous flush is performed
 * @param performanceStatisticManager source of the performance statistic used to measure flush time
 * @param closer                      scheduler of the file auto close timer
 * @param commitExecutor              executor of the flush tasks
 */
OLogSegment(ODiskWriteAheadLog writeAheadLog, File file, int fileTTL, int maxPagesCacheSize,
    OPerformanceStatisticManager performanceStatisticManager, ScheduledExecutorService closer,
    ScheduledExecutorService commitExecutor) throws IOException {
  this.writeAheadLog = writeAheadLog;
  this.performanceStatisticManager = performanceStatisticManager;

  this.file = file;
  this.fileTTL = fileTTL;
  this.maxPagesCacheSize = maxPagesCacheSize;

  this.closer = closer;
  this.commitExecutor = commitExecutor;

  this.order = extractOrder(file.getName());
  this.closed = false;
}
/**
 * Schedules periodical background flush of the segment if commit delay of the write ahead log is positive.
 * Does nothing otherwise.
 */
public void startFlush() {
  if (!(writeAheadLog.getCommitDelay() > 0)) {
    return;
  }

  final Runnable periodicFlush = new FlushTask();
  commitExecutor
      .scheduleAtFixedRate(periodicFlush, writeAheadLog.getCommitDelay(), writeAheadLog.getCommitDelay(), TimeUnit.MILLISECONDS);

  // segment is active (all content is written in this segment), its file should not be closed after TTL
  preventAutoClose = true;
}
/**
 * Stops background flush of the segment and allows the auto close timer to close the segment file again.
 *
 * @param flush if <code>true</code> cached records are flushed before the flush executor is shut down
 *
 * @throws OStorageException if the flush executor does not terminate during the configured shutdown timeout
 */
public void stopFlush(boolean flush) {
  if (flush)
    flush();

  if (!commitExecutor.isShutdown()) {
    commitExecutor.shutdown();
    try {
      if (!commitExecutor.awaitTermination(OGlobalConfiguration.WAL_SHUTDOWN_TIMEOUT.getValueAsInteger(), TimeUnit.MILLISECONDS))
        throw new OStorageException("WAL flush task for '" + getPath() + "' segment cannot be stopped");
    } catch (InterruptedException e) {
      // restore the interruption status so callers are able to notice that the thread was interrupted
      Thread.currentThread().interrupt();
      OLogManager.instance().error(this, "Cannot shutdown background WAL commit thread", e);
    }
  }

  // segment is not active any more, file auto close is allowed again
  preventAutoClose = false;
}
/**
 * Returns the opened file of this WAL segment, opening it in "rw" mode and scheduling the auto close timer
 * if it is not opened yet. Access to an already opened file postpones its auto close.
 * Call of this method should always be protected by {@link #fileLock}.
 *
 * @return Active instance of file which is associated with given WAL segment
 */
private RandomAccessFile getRndFile() throws IOException {
  if (rndFile != null) {
    // file was accessed, so the timer should not close it on its next run
    closeNextTime = false;
    return rndFile;
  }

  rndFile = new RandomAccessFile(file, "rw");
  scheduleFileAutoClose();

  return rndFile;
}
/**
 * Starts the timer task which closes the file if it is not accessed during {@link #fileTTL} seconds.
 * Nothing is done if such a timer is already active. Once the timer has closed the file it stops itself
 * and is started again by the next {@link #getRndFile()} call which opens the file.
 */
private void scheduleFileAutoClose() {
  if (autoCloseInProgress.get()) {
    return;
  }

  if (!autoCloseInProgress.compareAndSet(false, true)) {
    // another thread has started the timer in between
    return;
  }

  closeNextTime = true;

  final FileCloser closeTask = new FileCloser();
  closeTask.self = closer.scheduleWithFixedDelay(closeTask, fileTTL, fileTTL, TimeUnit.SECONDS);
}
/**
 * @return order of this segment, the value which was parsed from the name of the segment file
 */
public long getOrder() {
  return this.order;
}
/**
 * Verifies the segment file, restores the filled position from the last page of the file and sets
 * the end LSN of the segment to the position right before it.
 *
 * @param buffer buffer which is used to read the last page of the file
 */
public void init(ByteBuffer buffer) throws IOException {
  selfCheck();
  initPageCache(buffer);

  final long lastPosition = filledUpTo - 1;
  last = new OLogSequenceNumber(order, lastPosition);
}
/**
 * Segments are ordered by their {@link #order}.
 */
@Override
public int compareTo(OLogSegment other) {
  if (order == other.order) {
    return 0;
  }

  return order > other.order ? 1 : -1;
}
/**
 * Two segments are equal if they are instances of the same class and have the same {@link #order}.
 */
@Override
public boolean equals(Object o) {
  if (o == this) {
    return true;
  }

  if (o == null) {
    return false;
  }

  if (o.getClass() != getClass()) {
    return false;
  }

  final OLogSegment other = (OLogSegment) o;
  return other.order == order;
}
/**
 * Hash code is built from {@link #order} only: upper and lower halves of the value are XORed.
 */
@Override
public int hashCode() {
  final long folded = order ^ (order >>> 32);
  return (int) folded;
}
/**
 * @return position inside of the segment right after the last logged record
 */
public long filledUpTo() throws IOException {
  return this.filledUpTo;
}
/**
 * Returns the LSN of the first record of the segment.
 *
 * @return LSN which points to the records offset of the first page, or <code>null</code> if there are
 * neither cached records nor data in the segment file
 */
public OLogSequenceNumber begin() throws IOException {
  boolean hasRecords = !logCache.isEmpty();

  if (!hasRecords) {
    // nothing in cache, check whether something was written to the file
    fileLock.lock();
    try {
      hasRecords = getRndFile().length() > 0;
    } finally {
      fileLock.unlock();
    }
  }

  if (hasRecords) {
    return new OLogSequenceNumber(order, OWALPage.RECORDS_OFFSET);
  }

  return null;
}
/**
 * @return the current end LSN of the segment, as set by {@link #init(ByteBuffer)} and by every
 * {@link #logRecord(byte[])} call; <code>null</code> if neither of them was called yet
 */
public OLogSequenceNumber end() {
  return this.last;
}
/**
 * Closes the segment and deletes its file. If the file cannot be deleted at once, deletion is repeated
 * several times before giving up.
 *
 * @param flush if <code>true</code> cached records are flushed before the segment is closed
 *
 * @throws IOException if the file is still not deleted after the retry limit is exceeded
 */
public void delete(boolean flush) throws IOException {
  close(flush);

  boolean deleted = OFileUtils.delete(file);
  int retryCount = 0;

  while (!deleted) {
    deleted = OFileUtils.delete(file);
    retryCount++;

    // failure is reported only if the last attempt did not succeed either
    if (!deleted && retryCount > 10)
      throw new IOException("Cannot delete file. Retry limit exceeded. (" + retryCount + ")");
  }
}
/**
 * @return absolute path of the segment file
 */
public String getPath() {
  final String absolutePath = file.getAbsolutePath();
  return absolutePath;
}
/**
 * Content of a logged record together with the range of segment positions it occupies.
 */
public static class OLogRecord {
  /** Position inside of the segment where the record starts. */
  public final long writeFrom;
  /** Position inside of the segment where the record ends. */
  public final long writeTo;
  /** Binary content of the record, the passed array is stored as is. */
  public final byte[] record;

  public OLogRecord(final byte[] record, final long writeFrom, final long writeTo) {
    this.writeFrom = writeFrom;
    this.writeTo = writeTo;
    this.record = record;
  }
}
/**
 * Calculates which range of segment positions the record occupies if it is appended at the given position
 * and wraps the record together with this range.
 *
 * @param starting position in the segment where the previous record ends
 * @param record   binary content of the record
 *
 * @return record with calculated start and end positions
 */
public static OLogRecord generateLogRecord(final long starting, final byte[] record) {
  final boolean startsAtPageBoundary = starting % OWALPage.PAGE_SIZE == 0;

  final int freePageSpace = OWALPage.PAGE_SIZE - (int) Math.max(starting % OWALPage.PAGE_SIZE, OWALPage.RECORDS_OFFSET);
  final int fitsInPage = OWALPage.calculateRecordSize(freePageSpace);

  long remaining = record.length;

  if (fitsInPage >= remaining) {
    // the whole record fits in the current page
    final long serializedSize = OWALPage.calculateSerializedSize((int) remaining);
    final long writeFrom = startsAtPageBoundary ? starting + OWALPage.RECORDS_OFFSET : starting;

    return new OLogRecord(record, writeFrom, writeFrom + serializedSize);
  }

  long writeFrom;
  long occupied;

  if (fitsInPage > 0) {
    // space left in the current page, take it
    remaining -= fitsInPage;
    occupied = freePageSpace;
    writeFrom = startsAtPageBoundary ? starting + OWALPage.RECORDS_OFFSET : starting;
  } else {
    // no space left, start from a new page
    writeFrom = starting + freePageSpace + OWALPage.RECORDS_OFFSET;
    occupied = -OWALPage.RECORDS_OFFSET;
  }

  // add all the pages which are filled completely
  final int fullPageCapacity = OWALPage.calculateRecordSize(OWALPage.MAX_ENTRY_SIZE);
  occupied += remaining / fullPageCapacity * OWALPage.PAGE_SIZE;

  // add the spare bytes which go to the last page
  final int tail = (int) remaining % fullPageCapacity;
  if (tail > 0) {
    occupied += OWALPage.RECORDS_OFFSET + OWALPage.calculateSerializedSize(tail);
  }

  return new OLogRecord(record, writeFrom, writeFrom + occupied);
}
/**
 * Appends the record to the segment. The record is put in the cache and is written to the file later by
 * the flush task; if the amount of not written pages exceeds {@link #maxPagesCacheSize} a synchronous flush
 * is performed before return.
 *
 * @param record binary content of the record
 *
 * @return LSN of the appended record
 */
public OLogSequenceNumber logRecord(byte[] record) throws IOException {
  flushNewData = true;

  OLogRecord rec = generateLogRecord(filledUpTo, record);
  filledUpTo = rec.writeTo;
  last = new OLogSequenceNumber(order, rec.writeFrom);

  // lock is acquired before "try" so "finally" never unlocks a lock which is not held
  cacheLock.lock();
  try {
    logCache.add(rec);
  } finally {
    cacheLock.unlock();
  }

  long pagesInCache = (filledUpTo - writtenUpTo) / OWALPage.PAGE_SIZE;
  if (pagesInCache > maxPagesCacheSize) {
    OLogManager.instance()
        .info(this, "Max cache limit is reached (%d vs. %d), sync flush is performed", maxPagesCacheSize, pagesInCache);

    writeAheadLog.incrementCacheOverflowCount();
    flush();
  }

  return last;
}
/**
* Reads content of the record which starts at the given LSN.
* <p>
* If the same LSN was requested by the previous call and the cached result is still available it is returned
* without file access. Otherwise cached records are flushed first (if any) and the record is assembled from
* the chunks stored in one or several subsequent pages of the segment file. The result is remembered for
* the next call.
*
* @param lsn        LSN of the record, its segment has to be equal to the order of this segment
* @param byteBuffer buffer which is used to load pages of the file, its previous content is overwritten
*
* @return content of the record or <code>null</code> if position of the LSN is not less than filled position of
* the segment
*
* @throws OWALPageBrokenException if a page fails the integrity check or chunks of the record are inconsistent
*                                 with the amount of pages in the segment
*/
@SuppressFBWarnings(value = "PZLA_PREFER_ZERO_LENGTH_ARRAYS")
public byte[] readRecord(OLogSequenceNumber lsn, ByteBuffer byteBuffer) throws IOException {
// fast path: the same record was read by the previous call
final OPair<OLogSequenceNumber, byte[]> lastRecord = lastReadRecord.get();
if (lastRecord != null && lastRecord.getKey().equals(lsn))
return lastRecord.getValue();
assert lsn.getSegment() == order;
if (lsn.getPosition() >= filledUpTo)
return null;
// records are read from the file only, so everything which is cached has to be written first
if (!logCache.isEmpty())
flush();
long pageIndex = lsn.getPosition() / OWALPage.PAGE_SIZE;
byte[] record = null;
int pageOffset = (int) (lsn.getPosition() % OWALPage.PAGE_SIZE);
// amount of pages which are needed to hold filledUpTo bytes (rounded up)
long pageCount = (filledUpTo + OWALPage.PAGE_SIZE - 1) / OWALPage.PAGE_SIZE;
while (pageIndex < pageCount) {
fileLock.lock();
try {
final RandomAccessFile rndFile = getRndFile();
final FileChannel channel = rndFile.getChannel();
byteBuffer.position(0);
channel.read(byteBuffer, pageIndex * OWALPage.PAGE_SIZE);
} finally {
fileLock.unlock();
}
if (!checkPageIntegrity(byteBuffer))
throw new OWALPageBrokenException("WAL page with index " + pageIndex + " is broken");
OWALPage page = new OWALPage(byteBuffer, false);
byte[] content = page.getRecord(pageOffset);
if (record == null)
record = content;
else {
// append chunk of the current page to the chunks of previous pages
byte[] oldRecord = record;
record = new byte[record.length + content.length];
System.arraycopy(oldRecord, 0, record, 0, oldRecord.length);
System.arraycopy(content, 0, record, oldRecord.length, record.length - oldRecord.length);
}
if (page.mergeWithNextPage(pageOffset)) {
// record is continued at the beginning of the records area of the next page
pageOffset = OWALPage.RECORDS_OFFSET;
pageIndex++;
if (pageIndex >= pageCount)
throw new OWALPageBrokenException("WAL page with index " + pageIndex + " is broken");
} else {
// record is complete; a page which is not the last one is not expected to have space for one more record
if (page.getFreeSpace() >= OWALPage.MIN_RECORD_SIZE && pageIndex < pageCount - 1)
throw new OWALPageBrokenException("WAL page with index " + pageIndex + " is broken");
break;
}
}
lastReadRecord = new WeakReference<OPair<OLogSequenceNumber, byte[]>>(new OPair<OLogSequenceNumber, byte[]>(lsn, record));
return record;
}
/**
* Calculates LSN of the record which follows the record with the given LSN.
* <p>
* The record is read to find out its length, then the position is advanced over all chunks of the record
* taking page boundaries into account.
*
* @param lsn    LSN of the current record
* @param buffer buffer which is used to load pages of the file while the current record is read
*
* @return LSN of the next record or <code>null</code> if the current record cannot be read or the calculated
* position is not less than filled position of the segment
*/
public OLogSequenceNumber getNextLSN(OLogSequenceNumber lsn, ByteBuffer buffer) throws IOException {
final byte[] record = readRecord(lsn, buffer);
if (record == null)
return null;
long pos = lsn.getPosition();
long pageIndex = pos / OWALPage.PAGE_SIZE;
int pageOffset = (int) (pos - pageIndex * OWALPage.PAGE_SIZE);
// amount of record bytes which are not accounted in pos yet
int restOfRecord = record.length;
while (restOfRecord > 0) {
int entrySize = OWALPage.calculateSerializedSize(restOfRecord);
if (entrySize + pageOffset < OWALPage.PAGE_SIZE) {
// the rest of the record ends inside of the current page
if (entrySize + pageOffset <= OWALPage.PAGE_SIZE - OWALPage.MIN_RECORD_SIZE)
// enough space is left for one more record, it starts right after the current one
pos += entrySize;
else
// the tail of the page is too small for a record, next record starts in the next page
pos += OWALPage.PAGE_SIZE - pageOffset + OWALPage.RECORDS_OFFSET;
break;
} else if (entrySize + pageOffset == OWALPage.PAGE_SIZE) {
// the rest of the record ends exactly at the end of the page
pos += entrySize + OWALPage.RECORDS_OFFSET;
break;
} else {
// only part of the record fits in the current page, continue with the next page
long chunkSize = OWALPage.calculateRecordSize(OWALPage.PAGE_SIZE - pageOffset);
restOfRecord -= chunkSize;
pos += OWALPage.PAGE_SIZE - pageOffset + OWALPage.RECORDS_OFFSET;
pageOffset = OWALPage.RECORDS_OFFSET;
}
}
if (pos >= filledUpTo)
return null;
return new OLogSequenceNumber(order, pos);
}
/**
 * Closes the segment: stops background flush, shuts down the file auto close scheduler and closes
 * the segment file if it is opened. Repeated calls do nothing.
 *
 * @param flush if <code>true</code> cached records are flushed before the segment is closed
 *
 * @throws IOException       if the segment file cannot be closed
 * @throws OStorageException if flush or auto close tasks do not terminate during the configured shutdown timeout
 */
public void close(boolean flush) throws IOException {
  if (!closed) {
    lastReadRecord.clear();

    stopFlush(flush);

    if (!closer.isShutdown()) {
      closer.shutdown();
      try {
        if (!closer.awaitTermination(OGlobalConfiguration.WAL_SHUTDOWN_TIMEOUT.getValueAsInteger(), TimeUnit.MILLISECONDS))
          throw new OStorageException("WAL file auto close task '" + getPath() + "' cannot be stopped");
      } catch (InterruptedException e) {
        // restore the interruption status so callers are able to notice that the thread was interrupted
        Thread.currentThread().interrupt();
        OLogManager.instance().error(this, "Shutdown of file auto close thread was interrupted", e);
      }
    }

    fileLock.lock();
    try {
      if (rndFile != null) {
        rndFile.close();
        rndFile = null;
      }
    } finally {
      fileLock.unlock();
    }

    closed = true;
  }
}
/**
 * Returns the LSN which points to the last filled position of the segment.
 *
 * @return LSN with position <code>filledUpTo - 1</code> or <code>null</code> if the segment file does not
 * contain even a single complete page
 */
public OLogSequenceNumber readFlushedLSN() throws IOException {
  final boolean hasCompletePage;

  fileLock.lock();
  try {
    hasCompletePage = getRndFile().length() / OWALPage.PAGE_SIZE != 0;
  } finally {
    fileLock.unlock();
  }

  if (!hasCompletePage) {
    return null;
  }

  return new OLogSequenceNumber(order, filledUpTo - 1);
}
/**
 * Writes all cached records to the segment file and waits till it is done.
 * <p>
 * The flush is executed by the flush executor if it is still running, otherwise it is executed
 * in the calling thread.
 *
 * @throws OStorageException if the calling thread is interrupted while it waits for the flush or
 *                           if execution of the flush task fails
 */
public void flush() {
  if (!commitExecutor.isShutdown()) {
    try {
      commitExecutor.submit(new FlushTask()).get();
    } catch (InterruptedException e) {
      // Thread.interrupted() would clear the flag, the interruption status has to be restored instead
      Thread.currentThread().interrupt();
      throw OException.wrapException(new OStorageException("Thread was interrupted during flush"), e);
    } catch (ExecutionException e) {
      throw OException.wrapException(new OStorageException("Error during WAL segment '" + getPath() + "' flush"), e);
    }
  } else {
    new FlushTask().run();
  }
}
/**
 * Restores {@link #filledUpTo} from the last page of the segment file.
 * <p>
 * If the last page passes the integrity check the filled position is taken from its free space value,
 * otherwise the position is set to the records offset of the page which follows the last one.
 * Nothing is changed if the file does not contain a complete page.
 *
 * @param buffer buffer which is used to load the last page, its previous content is overwritten
 */
private void initPageCache(ByteBuffer buffer) throws IOException {
  fileLock.lock();
  try {
    final RandomAccessFile segmentFile = getRndFile();
    final long pageCount = segmentFile.length() / OWALPage.PAGE_SIZE;

    if (pageCount > 0) {
      final long lastPageStart = (pageCount - 1) * OWALPage.PAGE_SIZE;

      buffer.position(0);
      segmentFile.getChannel().read(buffer, lastPageStart);

      if (!checkPageIntegrity(buffer)) {
        // last page is broken, new records are appended to a new page
        filledUpTo = pageCount * OWALPage.PAGE_SIZE + OWALPage.RECORDS_OFFSET;
      } else {
        final int usedInLastPage = OWALPage.PAGE_SIZE - buffer.getInt(OWALPage.FREE_SPACE_OFFSET);
        filledUpTo = lastPageStart + usedInLastPage;
      }
    }
  } finally {
    fileLock.unlock();
  }
}
/**
 * Pattern of the WAL segment file name: anything, a dot, decimal order of the segment and ".wal" extension.
 */
private static final Pattern SEGMENT_NAME_PATTERN = Pattern.compile("^.*\\.(\\d+)\\.wal$");

/**
 * Parses the order of the segment from the name of its file.
 *
 * @param name name of the segment file, for example <code>database.12.wal</code>
 *
 * @return order of the segment
 *
 * @throws IllegalStateException if the name does not match the WAL segment name pattern or the order does not
 *                               fit in <code>long</code>
 */
private long extractOrder(String name) {
  final Matcher matcher = SEGMENT_NAME_PATTERN.matcher(name);

  // explicit check instead of assert, so the failure is the same whether assertions are enabled or not
  if (!matcher.find())
    throw new IllegalStateException("Name of WAL segment file '" + name + "' does not contain segment order");

  final String order = matcher.group(1);
  try {
    return Long.parseLong(order);
  } catch (NumberFormatException e) {
    // happens only if the amount of digits exceeds the range of long
    throw new IllegalStateException("Order of WAL segment file '" + name + "' is out of range", e);
  }
}
/**
 * Checks that the page contains the WAL magic number and that the CRC32 stored at offset 0 matches
 * the CRC32 of the rest of the page. Position of the buffer is changed by this method.
 *
 * @param content buffer which holds the whole page
 *
 * @return <code>true</code> if the page is consistent
 */
private boolean checkPageIntegrity(ByteBuffer content) {
  if (content.getLong(OWALPage.MAGIC_NUMBER_OFFSET) != OWALPage.MAGIC_NUMBER) {
    return false;
  }

  final byte[] payload = new byte[OWALPage.PAGE_SIZE - OIntegerSerializer.INT_SIZE];
  content.position(OWALPage.MAGIC_NUMBER_OFFSET);
  content.get(payload, 0, payload.length);

  final CRC32 checksum = new CRC32();
  checksum.update(payload);

  final int calculated = (int) checksum.getValue();
  final int stored = content.getInt(0);

  return calculated == stored;
}
/**
 * Verifies that the segment file consists of complete pages only; a partially written last page is cut off.
 *
 * @throws IllegalStateException if there are cached records, which means the segment is already in use
 */
private void selfCheck() throws IOException {
  if (!logCache.isEmpty()) {
    throw new IllegalStateException("WAL cache is not empty, we cannot verify WAL after it was started to be used");
  }

  fileLock.lock();
  try {
    final RandomAccessFile segmentFile = getRndFile();
    final long fileLength = segmentFile.length();
    final long completePages = fileLength / OWALPage.PAGE_SIZE;

    final boolean hasPartialPage = fileLength % OWALPage.PAGE_SIZE > 0;
    if (hasPartialPage) {
      OLogManager.instance().error(this, "Last WAL page was written partially, auto fix");
      segmentFile.setLength(OWALPage.PAGE_SIZE * completePages);
    }
  } finally {
    fileLock.unlock();
  }
}
/**
 * @return position inside of the segment right after the last logged record
 */
public long getFilledUpTo() {
  return this.filledUpTo;
}
/**
 * Timer task which closes the segment file if it was not accessed between two subsequent runs of the task.
 * After the file is closed the task cancels itself.
 */
class FileCloser implements Runnable {
  /** Set once the task has closed the file, further runs only cancel the task. */
  private boolean stopped = false;
  /** Handle of this task inside of the scheduler, assigned right after the task is scheduled. */
  private volatile ScheduledFuture<?> self = null;

  @Override
  public void run() {
    if (stopped) {
      // this task is finished we should stop its execution
      cancelSelf();
      return;
    }

    if (preventAutoClose) {
      return;
    }

    fileLock.lock();
    try {
      if (!closeNextTime) {
        // file was accessed since the previous run, give it one more interval
        closeNextTime = true;
        return;
      }

      closeFile();

      autoCloseInProgress.set(false);
      stopped = true;
      cancelSelf();
    } finally {
      fileLock.unlock();
    }
  }

  /** Closes the segment file if it is opened, failure to close is only logged. */
  private void closeFile() {
    try {
      if (rndFile != null) {
        rndFile.close();
        rndFile = null;
      }
    } catch (IOException e) {
      OLogManager.instance().error(this, "Can not auto close file in WAL", e);
    }
  }

  /** Cancels further executions of this task if its handle is already known. */
  private void cancelSelf() {
    final ScheduledFuture<?> handle = self;
    if (handle != null) {
      handle.cancel(false);
    }
  }
}
}