/** * diqube: Distributed Query Base. * * Copyright (C) 2015 Bastian Gloeckle * * This file is part of diqube. * * diqube is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.diqube.loader.columnshard; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.NavigableMap; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; import org.diqube.data.column.ColumnPage; import org.diqube.data.column.ColumnPageFactory; import org.diqube.data.column.ColumnShard; import org.diqube.data.column.ColumnShardFactory; import org.diqube.data.column.StandardColumnShard; import org.diqube.data.types.dbl.dict.DoubleDictionary; import org.diqube.data.types.lng.dict.LongDictionary; import org.diqube.data.types.str.dict.StringDictionary; import org.diqube.loader.LoaderColumnInfo; import org.diqube.loader.compression.CompressedDoubleDictionaryBuilder; import org.diqube.loader.compression.CompressedLongDictionaryBuilder; import org.diqube.loader.compression.CompressedStringDictionaryBuilder; import org.diqube.util.Pair; /** * A {@link SparseColumnShardBuilder} builds a {@link StandardColumnShard} of which it is expected to have only a sparse * set of values for the rows. * * TODO #12 check if separate implementation of Columnshard should be used for this. * * @author Bastian Gloeckle */ public class SparseColumnShardBuilder<T> { private ConcurrentHashMap<Long, T> rowIdToValues = new ConcurrentHashMap<>(); private ColumnShardFactory columnShardFactory; private ColumnPageFactory columnPageFactory; private String name; private long numberOfRows; public SparseColumnShardBuilder(ColumnShardFactory columnShardFactory, ColumnPageFactory columnPageFactory, String name) { this.columnShardFactory = columnShardFactory; this.columnPageFactory = columnPageFactory; this.name = name; } public SparseColumnShardBuilder<T> withValues(Map<Long, T> values) { rowIdToValues.putAll(values); return this; } public SparseColumnShardBuilder<T> withNumberOfRows(long numberOfRows) { this.numberOfRows = numberOfRows; return this; } public ColumnShard build() { NavigableMap<Long, Long> navigableRowIdToValueIds = new TreeMap<Long, Long>(); T sampleValue = rowIdToValues.values().iterator().next(); Class<?> columnValueClass = sampleValue.getClass(); ColumnShard res = null; NavigableMap<Long, ColumnPage> pages = new TreeMap<>(); Map<Long, Long> idChangeMap = null; Long defaultValueId = null; if (columnValueClass.equals(Long.class)) { NavigableMap<Long, Long> entityMap = new TreeMap<>(); long tmpValueId = 0; for (Entry<Long, T> valueEntry : rowIdToValues.entrySet()) { if (!entityMap.containsKey(valueEntry.getValue())) entityMap.put((Long) valueEntry.getValue(), tmpValueId++); navigableRowIdToValueIds.put(valueEntry.getKey(), entityMap.get(valueEntry.getValue())); } if (!entityMap.containsKey(LoaderColumnInfo.DEFAULT_LONG)) entityMap.put(LoaderColumnInfo.DEFAULT_LONG, tmpValueId++); defaultValueId = entityMap.get(LoaderColumnInfo.DEFAULT_LONG); CompressedLongDictionaryBuilder builder = new CompressedLongDictionaryBuilder(); builder.withDictionaryName(name).fromEntityMap(entityMap); Pair<LongDictionary<?>, Map<Long, Long>> builderRes = builder.build(); LongDictionary<?> columnShardDictionary = builderRes.getLeft(); idChangeMap = builderRes.getRight(); res = columnShardFactory.createStandardLongColumnShard(name, pages, columnShardDictionary); } else if (columnValueClass.equals(String.class)) { // TODO optimize c&p NavigableMap<String, Long> entityMap = new TreeMap<>(); long tmpValueId = 0; for (Entry<Long, T> valueEntry : rowIdToValues.entrySet()) { if (!entityMap.containsKey(valueEntry.getValue())) entityMap.put((String) valueEntry.getValue(), tmpValueId++); navigableRowIdToValueIds.put(valueEntry.getKey(), entityMap.get(valueEntry.getValue())); } if (!entityMap.containsKey(LoaderColumnInfo.DEFAULT_STRING)) entityMap.put(LoaderColumnInfo.DEFAULT_STRING, tmpValueId++); defaultValueId = entityMap.get(LoaderColumnInfo.DEFAULT_STRING); CompressedStringDictionaryBuilder builder = new CompressedStringDictionaryBuilder(); builder.fromEntityMap(entityMap); Pair<StringDictionary<?>, Map<Long, Long>> builderRes = builder.build(); StringDictionary<?> columnShardDictionary = builderRes.getLeft(); idChangeMap = builderRes.getRight(); res = columnShardFactory.createStandardStringColumnShard(name, pages, columnShardDictionary); } else if (columnValueClass.equals(Double.class)) { // TODO optimize c&p NavigableMap<Double, Long> entityMap = new TreeMap<>(); long tmpValueId = 0; for (Entry<Long, T> valueEntry : rowIdToValues.entrySet()) { if (!entityMap.containsKey(valueEntry.getValue())) entityMap.put((Double) valueEntry.getValue(), tmpValueId++); navigableRowIdToValueIds.put(valueEntry.getKey(), entityMap.get(valueEntry.getValue())); } if (!entityMap.containsKey(LoaderColumnInfo.DEFAULT_DOUBLE)) entityMap.put(LoaderColumnInfo.DEFAULT_DOUBLE, tmpValueId++); defaultValueId = entityMap.get(LoaderColumnInfo.DEFAULT_DOUBLE); CompressedDoubleDictionaryBuilder builder = new CompressedDoubleDictionaryBuilder(); builder.fromEntityMap(entityMap); Pair<DoubleDictionary<?>, Map<Long, Long>> builderRes = builder.build(); DoubleDictionary<?> columnShardDictionary = builderRes.getLeft(); idChangeMap = builderRes.getRight(); res = columnShardFactory.createStandardDoubleColumnShard(name, pages, columnShardDictionary); } else throw new RuntimeException("Cannot build sparse column of type " + columnValueClass); // should not happen long maxRowId = navigableRowIdToValueIds.lastKey(); numberOfRows = Long.max(numberOfRows, maxRowId + 1); int numberOfPages = (int) (numberOfRows / ColumnShardBuilder.PROPOSAL_ROWS); if (numberOfRows % ColumnShardBuilder.PROPOSAL_ROWS != 0) numberOfPages++; Iterator<Entry<Long, Long>> navigableRowIdToValuesIterator = navigableRowIdToValueIds.entrySet().iterator(); Entry<Long, Long> nextRowIdToValue = null; for (int pageNo = 0; pageNo < numberOfPages; pageNo++) { NavigableMap<Long, Long> valueToId = new TreeMap<>(); int valueLength = ColumnShardBuilder.PROPOSAL_ROWS; if (pageNo == numberOfPages - 1 && numberOfRows % ColumnShardBuilder.PROPOSAL_ROWS != 0) valueLength = (int) (numberOfRows % ColumnShardBuilder.PROPOSAL_ROWS); long[] pageValue = new long[valueLength]; long nextPageValueId = 0; for (int i = 0; i < valueLength; i++) { if (nextRowIdToValue == null && navigableRowIdToValuesIterator.hasNext()) nextRowIdToValue = navigableRowIdToValuesIterator.next(); long rowId = ((long) pageNo) * ColumnShardBuilder.PROPOSAL_ROWS + i; long valueId; if (nextRowIdToValue != null && nextRowIdToValue.getKey() == rowId) { valueId = nextRowIdToValue.getValue(); nextRowIdToValue = null; } else { valueId = defaultValueId; } // Adjust ID that was stored in columnDict, if it has been adjusted when building the column dictionary above. if (idChangeMap != null && idChangeMap.containsKey(valueId)) valueId = idChangeMap.get(valueId); // give this value a new ID which is valid for this column page if (!valueToId.containsKey(valueId)) { valueToId.put(valueId, nextPageValueId++); } // remember the new ID as value pageValue[i] = valueToId.get(valueId); } ColumnPageBuilder pageBuilder = new ColumnPageBuilder(columnPageFactory); // TODO add firstRowInShard - not needed that much currently, as SparseColumnShardBuilder is used on query master // only. long firstRowId = ((long) pageNo) * ColumnShardBuilder.PROPOSAL_ROWS; pageBuilder.withFirstRowId(firstRowId).withValueMap(valueToId).withValues(pageValue) .withColumnPageName(name + "#" + firstRowId); ColumnPage newPage = pageBuilder.build(); pages.put(firstRowId, newPage); } return res; } }