/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.segment.store; import com.google.common.base.Preconditions; import com.linkedin.pinot.common.segment.ReadMode; import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion; import com.linkedin.pinot.core.segment.creator.impl.V1Constants; import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl; import com.linkedin.pinot.core.segment.memory.PinotDataBuffer; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Path; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; class SegmentLocalFSDirectory extends SegmentDirectory { private static Logger LOGGER = LoggerFactory.getLogger(SegmentLocalFSDirectory.class); // matches most systems private static final int PAGE_SIZE_BYTES = 4096; // Prefetch limit...arbitrary but related to common server memory and data size profiles private static final long MAX_MMAP_PREFETCH_PAGES = 100 * 1024 * 1024 * 1024L / PAGE_SIZE_BYTES; private static final double PREFETCH_SLOWDOWN_PCT = 0.67; private static AtomicLong prefetchedPages = new AtomicLong(0); private final File segmentDirectory; SegmentLock segmentLock; private SegmentMetadataImpl segmentMetadata; private ReadMode readMode; private ColumnIndexDirectory columnIndexDirectory; SegmentLocalFSDirectory(String directoryPath, SegmentMetadataImpl metadata, ReadMode readMode) { this(new File(directoryPath), metadata, readMode); } SegmentLocalFSDirectory (File directory, ReadMode readMode) throws IOException, ConfigurationException { this(directory, loadSegmentMetadata(directory), readMode); } SegmentLocalFSDirectory(File directoryFile, SegmentMetadataImpl metadata, ReadMode readMode) { Preconditions.checkNotNull(directoryFile); Preconditions.checkNotNull(metadata); segmentDirectory = getSegmentPath(directoryFile, metadata.getSegmentVersion()); Preconditions.checkState(segmentDirectory.exists(), "Segment directory: " + directoryFile + " must exist"); segmentLock = new SegmentLock(); this.segmentMetadata = metadata; this.readMode = readMode; try { load(); } catch (IOException | ConfigurationException e) { LOGGER.error("Failed to load segment, error: ", e); throw new RuntimeException(e); } } private File getSegmentPath(File segmentDirectory, SegmentVersion segmentVersion) { if (segmentVersion == SegmentVersion.v1 || segmentVersion == SegmentVersion.v2) { return segmentDirectory; } if (segmentVersion == SegmentVersion.v3) { if (segmentDirectory.getAbsolutePath().endsWith(SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME)) { return segmentDirectory; } File v3SubDir = new File(segmentDirectory, SegmentDirectoryPaths.V3_SUBDIRECTORY_NAME); if (v3SubDir.exists()) { return v3SubDir; } // return input path by default return segmentDirectory; } throw new IllegalArgumentException("Unknown segment version: " + segmentVersion); } public static SegmentMetadataImpl loadSegmentMetadata(File segmentDirectory) throws IOException, ConfigurationException { return new SegmentMetadataImpl(segmentDirectory); } @Override public Path getPath() { return segmentDirectory.toPath(); } @Override public long getDiskSizeBytes() { // [PINOT-3479] For newly added refresh segments, the new segment will // replace the old segment on disk before the new segment is loaded. // That means, the new segment may be in the pre-processing state. // So, the segment format may not have been converted, and inverted indexes // or default columns will not exist. // check that v3 subdirectory exists since the format may not have been converted if (segmentDirectory.exists()) { try { return FileUtils.sizeOfDirectory(segmentDirectory.toPath().toFile()); } catch (IllegalArgumentException e) { LOGGER.error("Failed to read disk size for direcotry: ", segmentDirectory.getAbsolutePath()); return -1; } } else { if (! SegmentDirectoryPaths.isV3Directory(segmentDirectory)) { LOGGER.error("Segment directory: {} not found on disk and is not v3 format", segmentDirectory.getAbsolutePath()); return -1; } File[] files = segmentDirectory.getParentFile().listFiles(); if (files == null) { LOGGER.warn("Empty list of files for path: {}, segmentDirectory: {}", segmentDirectory.getParentFile(), segmentDirectory); return -1; } long size = 0L; for (File file : files) { if (file.isFile()) { size += file.length(); } } return size; } } public Reader createReader() throws IOException { if (segmentLock.tryReadLock()) { loadData(); return new Reader(); } return null; } public Writer createWriter() throws IOException { if (segmentLock.tryWriteLock()) { loadData(); return new Writer(); } return null; } @Override public String toString() { return segmentDirectory.toString(); } protected void load() throws IOException, ConfigurationException { // in future, we can extend this to support metadata loading as well loadData(); } private synchronized void loadData() throws IOException { if (columnIndexDirectory != null) { return; } String version = segmentMetadata.getVersion(); SegmentVersion segmentVersion = SegmentVersion.valueOf(version); switch (segmentVersion) { case v1: case v2: columnIndexDirectory = new FilePerIndexDirectory(segmentDirectory, segmentMetadata, readMode); break; case v3: try { columnIndexDirectory = new SingleFileIndexDirectory(segmentDirectory, segmentMetadata, readMode); } catch (ConfigurationException e) { LOGGER.error("Failed to create columnar index directory", e); throw new RuntimeException(e); } break; } } @Override public void close() throws Exception { segmentLock.close(); synchronized (this) { if (columnIndexDirectory != null) { columnIndexDirectory.close(); columnIndexDirectory = null; } } } protected File starTreeIndexFile() { // this is not version dependent for now return new File(segmentDirectory, V1Constants.STAR_TREE_INDEX_FILE); } private PinotDataBuffer getIndexForColumn(String column, ColumnIndexType type) throws IOException { PinotDataBuffer buffer; switch (type) { case DICTIONARY: buffer = columnIndexDirectory.getDictionaryBufferFor(column); break; case FORWARD_INDEX: buffer = columnIndexDirectory.getForwardIndexBufferFor(column); break; case INVERTED_INDEX: buffer = columnIndexDirectory.getInvertedIndexBufferFor(column); break; default: throw new RuntimeException("Unknown index type: " + type.name()); } if (readMode == ReadMode.mmap) { prefetchMmapData(buffer); } return buffer; } private void prefetchMmapData(PinotDataBuffer buffer) { // mmap mode causes high number of major page faults after server restart. // This impacts latency especially for prod "online" use cases that require low latency. // This function proactively loads pages in memory to lower the variance in // latencies after server startup. // This has to handle two different data size profiles // 1. Servers with data size close to main memory size // 2. Servers with very large data sizes (terabytes) // To prevent it from loading terabytes of data on startup, we put a limit // on the number of pages this will prefetch (OS will do something more on top of this) // The logic here is as follows: // Server doesn't know total data size it is expected to serve. So this will // load all data till 2/3rd (PREFETCH_SLOWDOWN_PCT) of the configured limit. After that it will only // read the header page. We read headers because that has more frequently accessed // information which will have bigger impact on the latency. This can go over the limit // because it doesn't stop at any point. But that's not an issue considering this is // an optimization. // Prefetch limit and slowdown percentage are arbitrary if (prefetchedPages.get() >= MAX_MMAP_PREFETCH_PAGES) { return; } final long prefetchSlowdownPageLimit = (long) (PREFETCH_SLOWDOWN_PCT * MAX_MMAP_PREFETCH_PAGES); if (prefetchedPages.get() >= prefetchSlowdownPageLimit) { if (0 < buffer.size()) { buffer.getByte(0); prefetchedPages.incrementAndGet(); } } else { // pos needs to be long because buffer.size() is 32 bit but // adding 4k can make it go over int size for (long pos = 0; pos < buffer.size() && prefetchedPages.get() < prefetchSlowdownPageLimit; pos += PAGE_SIZE_BYTES) { buffer.getByte((int)pos); prefetchedPages.incrementAndGet(); } } } private boolean hasIndexFor(String column, ColumnIndexType type) { return columnIndexDirectory.hasIndexFor(column, type); } private InputStream getStarTreeStream() { File starTreeFile = starTreeIndexFile(); Preconditions.checkState(starTreeFile.exists(), "Star tree file for segment: {} does not exist"); Preconditions.checkState(starTreeFile.isFile(), "Star tree file: {} for segment: {} is not a regular file"); try { return new FileInputStream(starTreeFile); } catch (FileNotFoundException e) { // we should not reach here LOGGER.error("Star tree file for segment: {} is not found", segmentDirectory, e); throw new IllegalStateException("Star tree file for segment: " + segmentDirectory + " is not found", e); } } public boolean hasStarTree() { return starTreeIndexFile().exists(); } /*************************** SegmentDirectory Reader *********************/ public class Reader extends SegmentDirectory.Reader { @Override public PinotDataBuffer getIndexFor(String column, ColumnIndexType type) throws IOException { return getIndexForColumn(column, type); } @Override public InputStream getStarTreeStream() { return SegmentLocalFSDirectory.this.getStarTreeStream(); } @Override public File getStarTreeFile() { return SegmentLocalFSDirectory.this.starTreeIndexFile(); } @Override public boolean hasStarTree() { return SegmentLocalFSDirectory.this.hasStarTree(); } @Override public boolean hasIndexFor(String column, ColumnIndexType type) { return columnIndexDirectory.hasIndexFor(column, type); } @Override public void close() { // do nothing here segmentLock.unlock(); } @Override public String toString() { return segmentDirectory.toString(); } } /*************************** SegmentDirectory Writer *********************/ // TODO: thread-safety. Single writer may be shared // by multiple threads. This is not our typical use-case // but it's nice to have interface guarantee that. public class Writer extends SegmentDirectory.Writer { public Writer() { } @Override public PinotDataBuffer newIndexFor(String columnName, ColumnIndexType indexType, int sizeBytes) throws IOException { return getNewIndexBuffer(new IndexKey(columnName, indexType), sizeBytes); } @Override public OutputStream starTreeOutputStream() { // this checks about file's existence and if it's a regular file try { return new FileOutputStream(starTreeIndexFile()); } catch (FileNotFoundException e) { LOGGER.error("Failed to open star tree output stream for segment: {}", segmentDirectory, e); throw new RuntimeException("Failed to open star tree output stream for segment: " + segmentDirectory, e); } } @Override public boolean isIndexRemovalSupported() { return columnIndexDirectory.isIndexRemovalSupported(); } @Override public InputStream getStarTreeStream() { return SegmentLocalFSDirectory.this.getStarTreeStream(); } @Override public File getStarTreeFile() { return SegmentLocalFSDirectory.this.starTreeIndexFile(); } @Override public boolean hasStarTree() { return SegmentLocalFSDirectory.this.hasStarTree(); } @Override public void removeIndex(String columnName, ColumnIndexType indexType) { columnIndexDirectory.removeIndex(columnName, indexType); } @Override public void removeStarTree() { starTreeIndexFile().delete(); } private PinotDataBuffer getNewIndexBuffer(IndexKey key, long sizeBytes) throws IOException { ColumnIndexType indexType = key.type; switch (indexType) { case DICTIONARY: return columnIndexDirectory.newDictionaryBuffer(key.name, (int) sizeBytes); case FORWARD_INDEX: return columnIndexDirectory.newForwardIndexBuffer(key.name, (int) sizeBytes); case INVERTED_INDEX: return columnIndexDirectory.newInvertedIndexBuffer(key.name, ((int) sizeBytes)); default: throw new RuntimeException("Unknown index type: " + indexType.name() + " for directory: " + segmentDirectory); } } @Override public void abortAndClose() throws Exception { abort(); close(); } @Override void save() throws IOException { } void abort() { } @Override public String toString() { return segmentDirectory.toString(); } public void close() { segmentLock.unlock(); if (columnIndexDirectory != null) { columnIndexDirectory.close(); } columnIndexDirectory = null; } @Override public PinotDataBuffer getIndexFor(String column, ColumnIndexType type) throws IOException { return getIndexForColumn(column, type); } @Override public boolean hasIndexFor(String column, ColumnIndexType type) { return columnIndexDirectory.hasIndexFor(column, type); } } /* * This is NOT a re-entrant lock. ReentrantReadWriteLock * allows the thread hold write lock to create readers. * We want to prevent that. */ class SegmentLock implements AutoCloseable { int readers = 0; int writers = 0; synchronized boolean tryReadLock() { if (writers > 0) { return false; } ++readers; return true; } synchronized boolean tryWriteLock() { if (readers > 0 || writers > 0) { return false; } ++writers; return true; } synchronized void unlock() { if (writers > 0) { --writers; } else if (readers > 0) { --readers; } } public void close() { unlock(); } } }