/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.segment.store; import com.google.common.base.Preconditions; import com.linkedin.pinot.common.segment.ReadMode; import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl; import com.linkedin.pinot.core.segment.memory.PinotDataBuffer; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.configuration.PropertiesConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; // There are a couple of un-addressed issues right now // // thread-safety : methods in this class are not thread safe. External synchronization // is required. This will be addressed soon // // ACID: Various failures can lead to inconsistency. We will rely on retrieving segments // in case of failures. Some parts of this will improve in future but there will be // no complete ACID guarantee // // TODO/Missing features: // newBuffer : opening new buffer maps a new buffer separately. User can avoid // it by making all the write calls followed by reads. // Remove index: Ability to remove an index (particularly inverted index) // Abort writes: There is no way to abort discard changes // class SingleFileIndexDirectory extends ColumnIndexDirectory { private static Logger LOGGER = LoggerFactory.getLogger(SingleFileIndexDirectory.class); private static final String DEFAULT_INDEX_FILE_NAME = "columns.psf"; private static final String INDEX_MAP_FILE = "index_map"; private static final long MAGIC_MARKER = 0xdeadbeefdeafbeadL; private static final int MAGIC_MARKER_SIZE_BYTES = 8; private static final String MAP_KEY_SEPARATOR = "."; private static final String MAP_KEY_NAME_START_OFFSET = "startOffset"; private static final String MAP_KEY_NAME_SIZE = "size"; // Max size of buffer we want to allocate // ByteBuffer limits the size to 2GB - (some platform dependent size) // This breaks the abstraction with PinotDataBuffer....a workaround for // now till PinotDataBuffer can support large buffers again private static final int MAX_ALLOCATION_SIZE = 2000 * 1024 * 1024; private File indexFile; private Map<IndexKey, IndexEntry> columnEntries; private List<PinotDataBuffer> allocBuffers; public SingleFileIndexDirectory(File segmentDirectory, SegmentMetadataImpl metadata, ReadMode readMode) throws IOException, ConfigurationException { super(segmentDirectory, metadata, readMode); indexFile = new File(segmentDirectory, DEFAULT_INDEX_FILE_NAME); if (! indexFile.exists()) { indexFile.createNewFile(); } columnEntries = new HashMap<>(metadata.getAllColumns().size()); allocBuffers = new ArrayList<>(); load() ; } @Override public PinotDataBuffer getDictionaryBufferFor(String column) throws IOException { return checkAndGetIndexBuffer(column, ColumnIndexType.DICTIONARY); } @Override public PinotDataBuffer getForwardIndexBufferFor(String column) throws IOException { return checkAndGetIndexBuffer(column, ColumnIndexType.FORWARD_INDEX); } @Override public PinotDataBuffer getInvertedIndexBufferFor(String column) throws IOException { return checkAndGetIndexBuffer(column, ColumnIndexType.INVERTED_INDEX); } @Override public boolean hasIndexFor(String column, ColumnIndexType type) { IndexKey key = new IndexKey(column, type); return columnEntries.containsKey(key); } @Override public PinotDataBuffer newDictionaryBuffer(String column, int sizeBytes) throws IOException { return allocNewBufferInternal(column, ColumnIndexType.DICTIONARY, sizeBytes, "dictionary.create"); } @Override public PinotDataBuffer newForwardIndexBuffer(String column, int sizeBytes) throws IOException { return allocNewBufferInternal(column, ColumnIndexType.FORWARD_INDEX, sizeBytes, "forward_index.create"); } @Override public PinotDataBuffer newInvertedIndexBuffer(String column, int sizeBytes) throws IOException { return allocNewBufferInternal(column, ColumnIndexType.INVERTED_INDEX, sizeBytes, "inverted_index.create"); } private PinotDataBuffer checkAndGetIndexBuffer(String column, ColumnIndexType type) { IndexKey key = new IndexKey(column, type); IndexEntry entry = columnEntries.get(key); if (entry == null || entry.buffer == null) { throw new RuntimeException("Could not find index for column: " + column + ", type: " + type + ", segment: " + segmentDirectory.toString()); } return entry.buffer; } // This is using extra resources right now which can be changed. private PinotDataBuffer allocNewBufferInternal(String column, ColumnIndexType indexType, int size, String context) throws IOException { IndexKey key = new IndexKey(column, indexType); checkKeyNotPresent(key); String allocContext = allocationContext(key) + context; IndexEntry entry = new IndexEntry(key); entry.startOffset = indexFile.length(); entry.size = size + MAGIC_MARKER_SIZE_BYTES; // read-mode is always mmap so that buffer changes are synced // to the file PinotDataBuffer appendBuffer = PinotDataBuffer.fromFile(indexFile, entry.startOffset, entry.size, ReadMode.mmap, FileChannel.MapMode.READ_WRITE, allocContext); LOGGER.debug("Allotted buffer for key: {}, startOffset: {}, size: {}", key, entry.startOffset, entry.size); appendBuffer.putLong(0, MAGIC_MARKER); allocBuffers.add(appendBuffer); entry.buffer = appendBuffer.view(0 + MAGIC_MARKER_SIZE_BYTES, entry.size); columnEntries.put(key, entry); persistIndexMap(entry); return entry.buffer.duplicate(); } private void checkKeyNotPresent(IndexKey key) { if (columnEntries.containsKey(key)) { throw new RuntimeException("Attempt to re-create an existing index for key: " + key.toString() + ", for segmentDirectory: " + segmentDirectory.getAbsolutePath()); } } private void validateMagicMarker(PinotDataBuffer buffer, int startOffset) { long actualMarkerValue = buffer.getLong(startOffset); if (actualMarkerValue != MAGIC_MARKER) { LOGGER.error("Missing magic marker in index file: {} at position: {}", indexFile, startOffset); throw new RuntimeException("Inconsistent data read. Index data file " + indexFile.toString() + " is possibly corrupted"); } } private void load() throws IOException, ConfigurationException { loadMap(); mapBufferEntries(); } private void loadMap() throws ConfigurationException { File mapFile = new File(segmentDirectory, INDEX_MAP_FILE); PropertiesConfiguration mapConfig = new PropertiesConfiguration(mapFile); Iterator keys = mapConfig.getKeys(); while (keys.hasNext()) { String key = (String) keys.next(); // column names can have '.' in it hence scan from backwards // parsing names like "column.name.dictionary.startOffset" // or, "column.name.dictionary.endOffset" where column.name is the key int lastSeparatorPos = key.lastIndexOf(MAP_KEY_SEPARATOR); Preconditions.checkState(lastSeparatorPos != -1, "Key separator not found: " + key + ", segment: " + segmentDirectory); String propertyName = key.substring(lastSeparatorPos + 1); int indexSeparatorPos = key.lastIndexOf(MAP_KEY_SEPARATOR, lastSeparatorPos-1); Preconditions.checkState(indexSeparatorPos != -1, "Index separator not found: " + key + " , segment: " + segmentDirectory); String indexName = key.substring(indexSeparatorPos + 1, lastSeparatorPos); String columnName = key.substring(0, indexSeparatorPos); IndexKey indexKey = new IndexKey(columnName, ColumnIndexType.getValue(indexName)); IndexEntry entry = columnEntries.get(indexKey); if (entry == null) { entry = new IndexEntry(indexKey); columnEntries.put(indexKey, entry); } if (propertyName.equals(MAP_KEY_NAME_START_OFFSET)) { entry.startOffset = mapConfig.getLong(key); } else if (propertyName.equals(MAP_KEY_NAME_SIZE)) { entry.size = mapConfig.getLong(key); } else { throw new ConfigurationException("Invalid map file key: " + key + ", segmentDirectory: " + segmentDirectory.toString()); } } // validation for (Map.Entry<IndexKey, IndexEntry> colIndexEntry : columnEntries.entrySet()) { IndexEntry entry = colIndexEntry.getValue(); if (entry.size < 0 || entry.startOffset < 0) { throw new ConfigurationException("Invalid map entry for key: " + colIndexEntry.getKey().toString() + ", segment: " + segmentDirectory.toString()); } } } private void mapBufferEntries() throws IOException { SortedMap<Long, IndexEntry> indexStartMap = new TreeMap<>(); for (Map.Entry<IndexKey, IndexEntry> columnEntry : columnEntries.entrySet()) { long startOffset = columnEntry.getValue().startOffset; indexStartMap.put(startOffset, columnEntry.getValue()); } long runningSize = 0; List<Long> offsetAccum = new ArrayList<>(); for (Map.Entry<Long, IndexEntry> offsetEntry : indexStartMap.entrySet()) { IndexEntry entry = offsetEntry.getValue(); runningSize += entry.size; if ( runningSize >= MAX_ALLOCATION_SIZE) { mapAndSliceFile(indexStartMap, offsetAccum, offsetEntry.getKey()); runningSize = entry.size; offsetAccum.clear(); } offsetAccum.add(offsetEntry.getKey()); } if (offsetAccum.size() > 0) { mapAndSliceFile(indexStartMap, offsetAccum, offsetAccum.get(0) + runningSize); } } private void mapAndSliceFile(SortedMap<Long, IndexEntry> startOffsets, List<Long> offsetAccum, long endOffset) throws IOException { Preconditions.checkNotNull(startOffsets); Preconditions.checkNotNull(offsetAccum); Preconditions.checkArgument(offsetAccum.size() >= 1); long fromFilePos = offsetAccum.get(0); long toFilePos = endOffset - fromFilePos; String context = allocationContext(indexFile, "single_file_index.rw." + "." + String.valueOf(fromFilePos) + "." + String.valueOf(toFilePos)); PinotDataBuffer buffer = PinotDataBuffer.fromFile(indexFile, fromFilePos, toFilePos, readMode, FileChannel.MapMode.READ_WRITE, context); allocBuffers.add(buffer); int prevSlicePoint = 0; for (Long fileOffset : offsetAccum) { IndexEntry entry = startOffsets.get(fileOffset); int endSlicePoint = prevSlicePoint + (int) entry.size; validateMagicMarker(buffer, prevSlicePoint); PinotDataBuffer viewBuffer = buffer.view(prevSlicePoint + MAGIC_MARKER_SIZE_BYTES, endSlicePoint); entry.buffer = viewBuffer; prevSlicePoint = endSlicePoint; } } private void persistIndexMap(IndexEntry entry) throws IOException { File mapFile = new File(segmentDirectory, INDEX_MAP_FILE); try (PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(mapFile, true)))) { String startKey = getKey(entry.key.name, entry.key.type.getIndexName(), true); StringBuilder sb = new StringBuilder(); sb.append(startKey).append(" = ").append(entry.startOffset); writer.println(sb.toString()); String endKey = getKey(entry.key.name, entry.key.type.getIndexName(), false); sb = new StringBuilder(); sb.append(endKey).append(" = ").append(entry.size); writer.println(sb.toString()); } } private String getKey(String column, String indexName, boolean isStartOffset) { return column + MAP_KEY_SEPARATOR + indexName + MAP_KEY_SEPARATOR + (isStartOffset ? "startOffset" : "size"); } private String allocationContext(IndexKey key) { return this.getClass().getSimpleName() + key.toString(); } @Override public void close() { for (PinotDataBuffer buf : allocBuffers) { buf.close(); } columnEntries.clear(); allocBuffers.clear(); } @Override public void removeIndex(String columnName, ColumnIndexType indexType) { throw new UnsupportedOperationException("Index removal is not supported for single file index format. Requested colum: " + columnName + " indexType: " + indexType); } @Override public boolean isIndexRemovalSupported() { return false; } @Override public String toString(){ return segmentDirectory.toString() + "/" + indexFile.toString(); } }