SegmentLocalFSDirectory.java example

Explorer
pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.segment.store;

import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl;
import com.linkedin.pinot.core.segment.memory.PinotDataBuffer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


class SegmentLocalFSDirectory extends SegmentDirectory {
  private static Logger LOGGER = LoggerFactory.getLogger(SegmentLocalFSDirectory.class);

  // matches most systems
  private static final int PAGE_SIZE_BYTES = 4096;
  // Prefetch limit...arbitrary but related to common server memory and data size profiles
  private static final long MAX_MMAP_PREFETCH_PAGES = 100 * 1024 * 1024 * 1024L / PAGE_SIZE_BYTES;
  private static final double PREFETCH_SLOWDOWN_PCT = 0.67;
  private static AtomicLong prefetchedPages = new AtomicLong(0);

  private final File segmentDirectory;
  SegmentLock segmentLock;
  private SegmentMetadataImpl segmentMetadata;
  private ReadMode readMode;

  private ColumnIndexDirectory columnIndexDirectory;

  SegmentLocalFSDirectory(String directoryPath, SegmentMetadataImpl metadata, ReadMode readMode) {
    this(new File(directoryPath), metadata, readMode);
  }

  SegmentLocalFSDirectory (File directory, ReadMode readMode)
      throws IOException, ConfigurationException {
    this(directory, loadSegmentMetadata(directory), readMode);
  }

  SegmentLocalFSDirectory(File directoryFile, SegmentMetadataImpl metadata, ReadMode readMode) {

    Preconditions.checkNotNull(directoryFile);
    Preconditions.checkNotNull(metadata);

    segmentDirectory = getSegmentPath(directoryFile, metadata.getSegmentVersion());
    Preconditions.checkState(segmentDirectory.exists(), "Segment directory: " + directoryFile + " must exist");

    segmentLock = new SegmentLock();
    this.segmentMetadata = metadata;
    this.readMode = readMode;
    try {
      load();
    } catch (IOException | ConfigurationException e) {
      LOGGER.error("Failed to load segment, error: ", e);
      throw new RuntimeException(e);
    }
  }

  private File getSegmentPath(File segmentDirectory, SegmentVersion segmentVersion) {
    if (segmentVersion == SegmentVersion.v1 || segmentVersion == SegmentVersion.v2) {
      return segmentDirectory;
    }

    if (segmentVersion == SegmentVersion.v3) {
      if (segmentDirectory.getAbsolutePath().endsWith(SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME)) {
        return segmentDirectory;
      }
      File v3SubDir = new File(segmentDirectory, SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME);
      if (v3SubDir.exists()) {
        return v3SubDir;
      }
      // return input path by default
      return segmentDirectory;
    }
    throw new IllegalArgumentException("Unknown segment version: " + segmentVersion);
  }

  public static SegmentMetadataImpl loadSegmentMetadata(File segmentDirectory)
      throws IOException, ConfigurationException {
    return new SegmentMetadataImpl(segmentDirectory);
  }

  @Override
  public Path getPath() {
    return segmentDirectory.toPath();
  }

  @Override
  public long getDiskSizeBytes() {
    // [PINOT-3479] For newly added refresh segments, the new segment will
    // replace the old segment on disk before the new segment is loaded.
    // That means, the new segment may be in the pre-processing state.
    // So, the segment format may not have been converted, and inverted indexes
    // or default columns will not exist.

    // check that v3 subdirectory exists since the format may not have been converted
    if (segmentDirectory.exists()) {
      try {
        return FileUtils.sizeOfDirectory(segmentDirectory.toPath().toFile());
      } catch (IllegalArgumentException e) {
        LOGGER.error("Failed to read disk size for direcotry: ", segmentDirectory.getAbsolutePath());
        return -1;
      }
    } else {
      if (! SegmentDirectoryPaths.isV3Directory(segmentDirectory)) {
        LOGGER.error("Segment directory: {} not found on disk and is not v3 format", segmentDirectory.getAbsolutePath());
        return -1;
      }
      File[] files = segmentDirectory.getParentFile().listFiles();
      if (files == null) {
        LOGGER.warn("Empty list of files for path: {}, segmentDirectory: {}", segmentDirectory.getParentFile(), segmentDirectory);
        return -1;
      }

      long size = 0L;
      for (File file : files) {
        if (file.isFile()) {
          size += file.length();
        }
      }
      return size;
    }
  }

  public Reader createReader()
      throws IOException {

    if (segmentLock.tryReadLock()) {
      loadData();
      return new Reader();
    }
    return null;
  }

  public Writer createWriter()
      throws IOException {

    if (segmentLock.tryWriteLock()) {
      loadData();
      return new Writer();
    }

    return null;
  }

  @Override
  public String toString() {
    return segmentDirectory.toString();
  }

  protected void load()
      throws IOException, ConfigurationException {
    // in future, we can extend this to support metadata loading as well
    loadData();
  }

  private synchronized void loadData()
      throws IOException {
    if (columnIndexDirectory != null) {
      return;
    }

    String version = segmentMetadata.getVersion();
    SegmentVersion segmentVersion = SegmentVersion.valueOf(version);

    switch (segmentVersion) {
      case v1:
      case v2:
        columnIndexDirectory = new FilePerIndexDirectory(segmentDirectory, segmentMetadata, readMode);
        break;
      case v3:
        try {
          columnIndexDirectory = new SingleFileIndexDirectory(segmentDirectory, segmentMetadata, readMode);
        } catch (ConfigurationException e) {
          LOGGER.error("Failed to create columnar index directory", e);
          throw new RuntimeException(e);
        }
        break;
    }
  }

  @Override
  public void close()
      throws Exception {
    segmentLock.close();
    synchronized (this) {
      if (columnIndexDirectory != null) {
        columnIndexDirectory.close();
        columnIndexDirectory = null;
      }
    }
  }

  protected File starTreeIndexFile() {
    // this is not version dependent for now
    return new File(segmentDirectory, V1Constants.STAR_TREE_INDEX_FILE);
  }

  private PinotDataBuffer getIndexForColumn(String column, ColumnIndexType type)
      throws IOException {
    PinotDataBuffer buffer;
    switch (type) {
      case DICTIONARY:
        buffer = columnIndexDirectory.getDictionaryBufferFor(column);
        break;
      case FORWARD_INDEX:
        buffer = columnIndexDirectory.getForwardIndexBufferFor(column);
        break;
      case INVERTED_INDEX:
        buffer = columnIndexDirectory.getInvertedIndexBufferFor(column);
        break;
      default:
        throw new RuntimeException("Unknown index type: " + type.name());
    }
    if (readMode == ReadMode.mmap) {
      prefetchMmapData(buffer);
    }
    return buffer;
  }

  private void prefetchMmapData(PinotDataBuffer buffer) {
    // mmap mode causes high number of major page faults after server restart.
    // This impacts latency especially for prod "online" use cases that require low latency.
    // This function proactively loads pages in memory to lower the variance in
    // latencies after server startup.

    // This has to handle two different data size profiles
    // 1. Servers with data size close to main memory size
    // 2. Servers with very large data sizes (terabytes)
    // To prevent it from loading terabytes of data on startup, we put a limit
    // on the number of pages this will prefetch (OS will do something more on top of this)
    // The logic here is as follows:
    // Server doesn't know total data size it is expected to serve. So this will
    // load all data till 2/3rd (PREFETCH_SLOWDOWN_PCT) of the configured limit. After that it will only
    // read the header page. We read headers because that has more frequently accessed
    // information which will have bigger impact on the latency. This can go over the limit
    // because it doesn't stop at any point. But that's not an issue considering this is
    // an optimization.

    // Prefetch limit and slowdown percentage are arbitrary
    if (prefetchedPages.get() >= MAX_MMAP_PREFETCH_PAGES) {
      return;
    }

    final long prefetchSlowdownPageLimit = (long) (PREFETCH_SLOWDOWN_PCT * MAX_MMAP_PREFETCH_PAGES);
    if (prefetchedPages.get() >= prefetchSlowdownPageLimit) {
      if (0 < buffer.size()) {
        buffer.getByte(0);
        prefetchedPages.incrementAndGet();
      }
    } else {
      // pos needs to be long because buffer.size() is 32 bit but
      // adding 4k can make it go over int size
      for (long pos = 0; pos < buffer.size() && prefetchedPages.get() < prefetchSlowdownPageLimit; pos += PAGE_SIZE_BYTES) {
        buffer.getByte((int)pos);
        prefetchedPages.incrementAndGet();
      }
    }
  }

  private boolean hasIndexFor(String column, ColumnIndexType type) {
    return columnIndexDirectory.hasIndexFor(column, type);
  }

  private InputStream getStarTreeStream() {
    File starTreeFile = starTreeIndexFile();
    Preconditions.checkState(starTreeFile.exists(), "Star tree file for segment: {} does not exist");
    Preconditions.checkState(starTreeFile.isFile(), "Star tree file: {} for segment: {} is not a regular file");

    try {
      return new FileInputStream(starTreeFile);
    } catch (FileNotFoundException e) {
      // we should not reach here
      LOGGER.error("Star tree file for segment: {} is not found", segmentDirectory, e);
      throw new IllegalStateException("Star tree file for segment: " + segmentDirectory +
          " is not found", e);
    }
  }

  public boolean hasStarTree() {
    return starTreeIndexFile().exists();
  }

  /***************************  SegmentDirectory Reader *********************/
  public class Reader extends SegmentDirectory.Reader {

    @Override
    public PinotDataBuffer getIndexFor(String column, ColumnIndexType type)
        throws IOException {
      return getIndexForColumn(column, type);
    }

    @Override
    public InputStream getStarTreeStream() {
      return SegmentLocalFSDirectory.this.getStarTreeStream();
    }

    @Override
    public File getStarTreeFile() {
      return SegmentLocalFSDirectory.this.starTreeIndexFile();
    }

    @Override
    public boolean hasStarTree() {
      return SegmentLocalFSDirectory.this.hasStarTree();
    }

    @Override
    public boolean hasIndexFor(String column, ColumnIndexType type) {
      return columnIndexDirectory.hasIndexFor(column, type);
    }

    @Override
    public void close() {
      // do nothing here
      segmentLock.unlock();
    }

    @Override
    public String toString() {
      return segmentDirectory.toString();
    }
  }

  /***************************  SegmentDirectory Writer *********************/
  // TODO: thread-safety. Single writer may be shared
  // by multiple threads. This is not our typical use-case
  // but it's nice to have interface guarantee that.
  public class Writer extends SegmentDirectory.Writer {

    public Writer() {
    }

    @Override
    public PinotDataBuffer newIndexFor(String columnName, ColumnIndexType indexType, int sizeBytes)
        throws IOException {
      return getNewIndexBuffer(new IndexKey(columnName, indexType), sizeBytes);
    }

    @Override
    public OutputStream starTreeOutputStream() {
      // this checks about file's existence and if it's a regular file
      try {
        return new FileOutputStream(starTreeIndexFile());
      } catch (FileNotFoundException e) {
        LOGGER.error("Failed to open star tree output stream for segment: {}", segmentDirectory, e);
        throw new RuntimeException("Failed to open star tree output stream for segment: " + segmentDirectory, e);
      }
    }

    @Override
    public boolean isIndexRemovalSupported() {
      return columnIndexDirectory.isIndexRemovalSupported();
    }

    @Override
    public InputStream getStarTreeStream() {
      return SegmentLocalFSDirectory.this.getStarTreeStream();
    }

    @Override
    public File getStarTreeFile() {
      return SegmentLocalFSDirectory.this.starTreeIndexFile();
    }

    @Override
    public boolean hasStarTree() {
      return SegmentLocalFSDirectory.this.hasStarTree();
    }

    @Override
    public void removeIndex(String columnName, ColumnIndexType indexType) {
      columnIndexDirectory.removeIndex(columnName, indexType);
    }

    @Override
    public void removeStarTree() {
      starTreeIndexFile().delete();
    }

    private PinotDataBuffer getNewIndexBuffer(IndexKey key, long sizeBytes)
        throws IOException {
      ColumnIndexType indexType = key.type;
      switch (indexType) {
        case DICTIONARY:
          return columnIndexDirectory.newDictionaryBuffer(key.name, (int) sizeBytes);

        case FORWARD_INDEX:
          return columnIndexDirectory.newForwardIndexBuffer(key.name, (int) sizeBytes);
        case INVERTED_INDEX:
          return columnIndexDirectory.newInvertedIndexBuffer(key.name, ((int) sizeBytes));
        default:
          throw new RuntimeException("Unknown index type: " + indexType.name() +
              " for directory: " + segmentDirectory);
      }
    }

    @Override
    public void abortAndClose()
        throws Exception {
      abort();
      close();
    }

    @Override
    void save()
        throws IOException {
    }

    void abort() {

    }

    @Override
    public String toString() {
      return segmentDirectory.toString();
    }

    public void close() {
      segmentLock.unlock();
      if (columnIndexDirectory != null) {
        columnIndexDirectory.close();
      }
      columnIndexDirectory = null;
    }

    @Override
    public PinotDataBuffer getIndexFor(String column, ColumnIndexType type)
        throws IOException {
      return getIndexForColumn(column, type);
    }

    @Override
    public boolean hasIndexFor(String column, ColumnIndexType type) {
      return columnIndexDirectory.hasIndexFor(column, type);
    }
  }

  /*
   * This is NOT a re-entrant lock. ReentrantReadWriteLock
   * allows the thread hold write lock to create readers.
   * We want to prevent that.
   */
  class SegmentLock implements AutoCloseable {
    int readers = 0;
    int writers = 0;

    synchronized boolean tryReadLock() {
      if (writers > 0) {
        return false;
      }
      ++readers;
      return true;
    }

    synchronized boolean tryWriteLock() {
      if (readers > 0 || writers > 0) {
        return false;
      }
      ++writers;
      return true;
    }

    synchronized void unlock() {
      if (writers > 0) {
        --writers;
      } else if (readers > 0) {
        --readers;
      }
    }

    public void close() {
      unlock();
    }
  }
}