/**
* diqube: Distributed Query Base.
*
* Copyright (C) 2015 Bastian Gloeckle
*
* This file is part of diqube.
*
* diqube is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.diqube.executionenv.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.inject.Inject;
import org.diqube.context.AutoInstatiate;
import org.diqube.executionenv.ExecutionEnvironment;
import org.diqube.executionenv.querystats.QueryableLongColumnShard;
import org.diqube.executionenv.resolver.QueryableLongColumnShardResolver;
import org.diqube.name.RepeatedColumnNameGenerator;
import com.google.common.collect.Iterables;
/**
* Utility class that provides the resolution of column name patterns (= column names that contain "[*]" etc, see
* {@link RepeatedColumnNameGenerator}).
*
* @author Bastian Gloeckle
*/
@AutoInstatiate
public class ColumnPatternUtil {
@Inject
private RepeatedColumnNameGenerator repeatedColNames;
/**
 * Resolves a single column name pattern by wrapping it into a one-element list and delegating to
 * {@link #findColNamesForColNamePattern(QueryableLongColumnShardResolver, List)}.
 *
 * @param lengthColResolver
 *          The {@link QueryableLongColumnShardResolver} that should be used to find "length" columns of the
 *          repeated fields.
 * @param pattern
 *          The column name pattern.
 * @return A {@link ColumnPatternContainer} that can provide the column names.
 * @throws PatternException
 *           (unchecked) if the pattern does not contain any "[*]"; raised by the list-based overload.
 * @throws LengthColumnMissingException
 *           in case a "length" column for one of the fields that are marked with "[*]" in the input pattern is
 *           not available in the provided lengthColResolver.
 */
public ColumnPatternContainer findColNamesForColNamePattern(QueryableLongColumnShardResolver lengthColResolver,
    String pattern) throws LengthColumnMissingException {
  List<String> singlePattern = Arrays.asList(pattern);
  return findColNamesForColNamePattern(lengthColResolver, singlePattern);
}
/**
 * Replaces all the [*] strings in the patterns with actual column indices.
 *
 * The patterns have to follow the same "path" for this to work. This means that the repetitions need to be on the
 * same fields, whereas not all fields need to repeat over all those fields. There will be one result for each
 * resulting column name of the most-repeated pattern, where the column names of the less-repeated patterns will be
 * the column names of the "parent" field of the most-repeated pattern. Examples for results (assuming length = 2 on
 * all repeated fields):
 *
 * <pre>
 * a[*].b.c[*], a[*].b.d
 *
 * -&gt;
 *
 * a[0].b.c[0], a[0].b.d
 * a[0].b.c[1], a[0].b.d
 * a[1].b.c[0], a[1].b.d
 * a[1].b.c[1], a[1].b.d
 * </pre>
 *
 * <pre>
 * a[*].b.c[*], a[*].b.c[*]
 *
 * -&gt;
 *
 * a[0].b.c[0], a[0].b.c[0]
 * a[0].b.c[1], a[0].b.c[1]
 * a[1].b.c[0], a[1].b.c[0]
 * a[1].b.c[1], a[1].b.c[1]
 * </pre>
 *
 * <pre>
 * a[*].b.c[*], a.x
 *
 * -&gt;
 *
 * a[0].b.c[0], a.x
 * a[0].b.c[1], a.x
 * a[1].b.c[0], a.x
 * a[1].b.c[1], a.x
 * </pre>
 *
 * @param lengthColResolver
 *          The {@link QueryableLongColumnShardResolver} that should be used to find "length" columns of the
 *          repeated fields.
 * @param patterns
 *          The patterns that should be resolved, adhering to the fact that they follow the same "path" (see
 *          above).
 * @return A {@link ColumnPatternContainer} that can be used to fetch the colnames.
 * @throws PatternException
 *           in case the patterns do not repeat on the same "path", or in case none of the patterns contains a
 *           [*] at all.
 * @throws LengthColumnMissingException
 *           In case a "length" column for one of the fields that are marked with "[*]" in the input patterns is
 *           not available in the provided lengthColResolver.
 */
public ColumnPatternContainer findColNamesForColNamePattern(QueryableLongColumnShardResolver lengthColResolver,
    List<String> patterns) throws PatternException, LengthColumnMissingException {
  String allEntries = repeatedColNames.allEntriesIdentifyingSubstr();

  if (patterns.size() > 1)
    validatePatternsRepeatOnSamePath(patterns, allEntries);

  if (patterns.stream().noneMatch(p -> p.contains(allEntries)))
    throw new PatternException("No [*] in any pattern");

  String allEntriesRegex = Pattern.quote(allEntries);
  List<List<String>> baseNames = new ArrayList<>(patterns.size());
  for (String pattern : patterns) {
    // Split with limit -1 so that trailing empty strings are kept: a pattern ending in [*] yields a trailing ""
    // baseName (the last baseName is the one that is not repeated), and consecutive trailing markers are not
    // collapsed into one, which the default split (dropping all trailing empty strings) would do.
    baseNames.add(new ArrayList<>(Arrays.asList(pattern.split(allEntriesRegex, -1))));
  }

  return new ColumnPatternContainer(lengthColResolver, baseNames);
}

/**
 * Validates that all patterns which contain a repetition marker "repeat" along the same path.
 *
 * For example the following is invalid:
 *
 * <pre>
 * a.b[*].c.d[*].e
 * a.x.c[*].d
 * </pre>
 *
 * This is invalid, because we need to find one "path" through the repeated fields for the most-times repeated
 * pattern, and all other patterns have to resolve to values along that "path".
 *
 * @param patterns
 *          The patterns to validate.
 * @param allEntries
 *          The substring that marks "all entries" of a repeated field in a pattern.
 * @throws PatternException
 *           if two patterns repeat on different paths.
 */
private void validatePatternsRepeatOnSamePath(List<String> patterns, String allEntries) throws PatternException {
  // Find the "last repeated field" for each pattern (= everything in front of the last marker). Patterns that are
  // not repeated at all are of no interest here. The result is ordered longest-first, so the most specific field
  // is at index 0.
  List<String> lastRepeatedFields = patterns.stream() //
      .filter(p -> p.contains(allEntries)) //
      .map(p -> p.substring(0, p.lastIndexOf(allEntries))) //
      .sorted((s1, s2) -> Integer.compare(s2.length(), s1.length())) //
      .collect(Collectors.toList());

  if (lastRepeatedFields.size() <= 1)
    return;

  // The longest (= most specific) field has to "startWith" each of the other last repeated fields.
  String longestRepeatedField = lastRepeatedFields.get(0);
  // TODO support case with fixed index: a.b[*].c[*].d, a.b[0].c[*].d
  Optional<String> badMatched =
      lastRepeatedFields.stream().filter(s -> !longestRepeatedField.startsWith(s)).findAny();
  if (badMatched.isPresent())
    throw new PatternException("Column pattern set invalid, as the patterns repeat on different paths: "
        + longestRepeatedField + " vs. " + badMatched.get());
}
/**
 * For tests: replaces the {@link RepeatedColumnNameGenerator} of this instance (the field is otherwise populated
 * through {@code @Inject}).
 *
 * @param repeatedColNames
 *          The generator to use from now on.
 */
void setRepeatedColNames(RepeatedColumnNameGenerator repeatedColNames) {
this.repeatedColNames = repeatedColNames;
}
/**
 * Exception showing that the provided pattern(s) are invalid.
 *
 * This is an unchecked exception. Within this file it is thrown when the patterns repeat on different "paths" or
 * when none of the patterns contains a [*].
 */
public static class PatternException extends RuntimeException {
private static final long serialVersionUID = 1L;
/**
 * @param msg
 *          Description of what is wrong with the pattern(s).
 */
public PatternException(String msg) {
super(msg);
}
}
/**
 * Exception showing that a length column is missing for a field that has a [*].
 *
 * This is an unchecked exception. Within this file it is thrown when the length column resolver returns
 * <code>null</code> for the name of a required "length" column.
 */
public static class LengthColumnMissingException extends RuntimeException {
private static final long serialVersionUID = 1L;
/**
 * @param msg
 *          Description of which column is missing.
 */
public LengthColumnMissingException(String msg) {
super(msg);
}
}
/**
* Contains the actual column names of resolved patterns.
*/
public class ColumnPatternContainer {
private static final long MAX_LEN = Long.MIN_VALUE;
/**
* Map from indices (for each occurrence of [*] one) to a list of {@link ConcatStringProvider}s, for each pattern
* one.
*/
private Map<List<Long>, List<ConcatStringProvider>> stringProviders = new HashMap<>();
/**
* The input patterns, split up at [*]. The resulting column names will have indices after each basename, except for
* the last one (which will not be "repeated"). Note that each pattern may have a different number of baseNames, but
* all patterns are along the same "path".
*/
private List<List<String>> baseNames;
/** index in {@link #baseNames} where the list of basenames is the longest. */
private int longestPatternBaseNameIndex;
/** number of [*] that need to be inserted. */
private int numberOfStars;
private QueryableLongColumnShardResolver lengthColResolver;
/**
*
* @param baseNames
* The patterns. Each pattern needs to be split up into a List<String> by splitting the string at [*]. This
* {@link ColumnPatternContainer} will then fill in indices "between" two of these baseNames. Note that all
* baseNames need to be along the same "path".
*/
private ColumnPatternContainer(QueryableLongColumnShardResolver lengthColResolver, List<List<String>> baseNames)
throws LengthColumnMissingException {
this.lengthColResolver = lengthColResolver;
this.baseNames = baseNames;
numberOfStars = -1;
for (int i = 0; i < baseNames.size(); i++)
if (baseNames.get(i).size() > numberOfStars) {
numberOfStars = baseNames.get(i).size() - 1; // -1 -> last baseName will not get an [*] appended!
longestPatternBaseNameIndex = i;
}
List<ConcatStringProvider> parentProviders = new ArrayList<>(baseNames.size());
for (int i = 0; i < baseNames.size(); i++)
parentProviders.add(null);
createStringProviders(baseNames, 0, new ArrayList<>(numberOfStars), stringProviders, parentProviders);
}
/**
 * Resolves the patterns for one row: the repetition indices are filled in according to the lengths that the
 * "length" columns hold for the given rowId.
 *
 * Each list in the result contains one column name per input pattern; the set contains one such list for each
 * index combination.
 *
 * @param rowId
 *          The row whose lengths should be used. Note that the value of {@link #MAX_LEN} is treated as "use the
 *          maximum lengths" instead of a specific row.
 * @return The resolved column names, grouped per index combination.
 */
public Set<List<String>> getColumnPatterns(long rowId) {
  Set<List<String>> resolvedNames = new HashSet<>();
  List<Long> noIndicesChosenYet = new ArrayList<>();
  getColumnPatternsRecursive(rowId, 0, noIndicesChosenYet, resolvedNames);
  return resolvedNames;
}
/**
 * Variant of {@link #getColumnPatterns(long)} for containers that were created from exactly one pattern: each
 * one-element list is unwrapped and the contained column names are collected into a flat set.
 *
 * @param rowId
 *          The row whose lengths should be used.
 * @return The resolved column names of the single pattern.
 */
public Set<String> getColumnPatternsSinglePattern(long rowId) {
  Set<List<String>> perIndexCombination = getColumnPatterns(rowId);
  return perIndexCombination.stream() //
      .map(names -> Iterables.getOnlyElement(names)) //
      .collect(Collectors.toSet());
}
/**
 * Resolves the patterns using the maximum length of each repeated field instead of the lengths of a specific row.
 * This then is the union of the result of {@link #getColumnPatterns(long)} for all rowIds (in respect to the
 * current {@link ExecutionEnvironment}, of course).
 *
 * @return For each index combination a list holding one column name per input pattern.
 */
public Set<List<String>> getMaximumColumnPatterns() {
  Set<List<String>> resolvedNames = new HashSet<>();
  List<Long> noIndicesChosenYet = new ArrayList<>();
  // MAX_LEN as "rowId" makes the recursion use the maximum lengths.
  getColumnPatternsRecursive(MAX_LEN, 0, noIndicesChosenYet, resolvedNames);
  return resolvedNames;
}
/**
 * Variant of {@link #getMaximumColumnPatterns()} for containers that were created from exactly one pattern: each
 * one-element list is unwrapped and the contained column names are collected into a flat set.
 *
 * @return The resolved column names of the single pattern, for the maximum lengths.
 */
public Set<String> getMaximumColumnPatternsSinglePattern() {
  Set<List<String>> perIndexCombination = getMaximumColumnPatterns();
  return perIndexCombination.stream() //
      .map(names -> Iterables.getOnlyElement(names)) //
      .collect(Collectors.toSet());
}
/**
 * Looks up the "length" column of the repeated field that follows the given parent indices.
 *
 * The field name is assembled from the baseNames of the longest pattern: each of the first
 * <code>indices.size()</code> baseNames gets its index applied, then the next baseName is appended as-is.
 *
 * @param indices
 *          The indices chosen for the parent repetitions, may be empty.
 * @return The length column, never <code>null</code>.
 * @throws LengthColumnMissingException
 *           if the resolver does not know the length column.
 */
private QueryableLongColumnShard getLengthColumn(List<Long> indices) throws LengthColumnMissingException {
  List<String> longestBaseNames = baseNames.get(longestPatternBaseNameIndex);
  int parentCount = indices.size();

  StringBuilder fieldName = new StringBuilder();
  for (int pos = 0; pos < parentCount; pos++)
    fieldName.append(repeatedColNames.repeatedAtIndex(longestBaseNames.get(pos), indices.get(pos)));
  fieldName.append(longestBaseNames.get(parentCount));

  String lenColName = repeatedColNames.repeatedLength(fieldName.toString());
  QueryableLongColumnShard lengthCol = lengthColResolver.getLongColumnShard(lenColName);
  if (lengthCol != null)
    return lengthCol;
  throw new LengthColumnMissingException("Missing column " + lenColName);
}
/**
 * Recursively collects all column names that are valid for the given row.
 *
 * @param rowId
 *          The row to receive the lengths from. If {@link #MAX_LEN}, then not the lengths of a specific row will
 *          be used, but the maximum lengths.
 * @param replaceIdx
 *          Index of the [*] whose value should be found in this recursive call. Provide 0 initially.
 * @param rowIndices
 *          List of indices the parent incarnations chose currently, provide empty list initially. The list is
 *          modified during the call, but restored to its incoming content before returning.
 * @param res
 *          The result. A Set of list where each list is one index possibility for all patterns.
 * @throws LengthColumnMissingException
 *           if a needed "length" column cannot be resolved.
 */
private void getColumnPatternsRecursive(long rowId, int replaceIdx, List<Long> rowIndices, Set<List<String>> res)
    throws LengthColumnMissingException {
  if (replaceIdx == numberOfStars) {
    // All [*] have an index assigned: materialize the names of the pre-built providers of this combination.
    List<String> columnNames = new ArrayList<>();
    for (ConcatStringProvider provider : stringProviders.get(rowIndices))
      columnNames.add(provider.create());
    res.add(columnNames);
    return;
  }

  QueryableLongColumnShard lengthCol = getLengthColumn(rowIndices);
  long length;
  if (rowId == MAX_LEN) {
    // no specific row: use the value behind the highest ID of the dictionary of the length column.
    length = lengthCol.getColumnShardDictionary().decompressValue(lengthCol.getColumnShardDictionary().getMaxId());
  } else {
    long lengthValueId = lengthCol.resolveColumnValueIdForRow(rowId);
    length = lengthCol.getColumnShardDictionary().decompressValue(lengthValueId);
  }

  // Reserve the slot of this recursion level, try each index in it and drop the slot again afterwards.
  rowIndices.add(0L);
  for (long candidateIdx = 0; candidateIdx < length; candidateIdx++) {
    rowIndices.set(replaceIdx, candidateIdx);
    getColumnPatternsRecursive(rowId, replaceIdx + 1, rowIndices, res);
  }
  rowIndices.remove(rowIndices.size() - 1);
}
/**
 * Recursively creates the {@link ConcatStringProvider}s for all index combinations, up to the maximum length of
 * each repeated field.
 *
 * @param baseNames
 *          List of parts of patterns. Each part is a "baseName" - you can get them by basically splitting the
 *          pattern at '[*]'. For each Pattern, there is a list of base names here.
 * @param fillIdx
 *          The index of the [*] that should be filled in this recursive call. 0 for a start.
 * @param indices
 *          List of indices the parent incarnations of this method created currently, an empty list for a start.
 *          The list is modified during the call, but restored to its incoming content before returning.
 * @param res
 *          Add results here: For each index combination there is a list of {@link ConcatStringProvider}s (for
 *          each pattern one).
 * @param parentStringProviders
 *          List of the parent String providers of parent incarnations of this recursive method - indexed by the
 *          baseName index (just like the outer list of param baseNames). For a start, use a list that contains
 *          (length(baseNames) "null" values).
 * @throws LengthColumnMissingException
 *           if a needed "length" column cannot be resolved.
 */
private void createStringProviders(List<List<String>> baseNames, int fillIdx, List<Long> indices,
    Map<List<Long>, List<ConcatStringProvider>> res, List<ConcatStringProvider> parentStringProviders)
    throws LengthColumnMissingException {
  if (fillIdx == numberOfStars) {
    // All [*] are filled. Append the last part of each pattern, which does not have a [*] appended.
    List<ConcatStringProvider> leafProviders = new ArrayList<>();
    int patternIdx = 0;
    for (ConcatStringProvider parentProvider : parentStringProviders) {
      String lastBaseName = Iterables.getLast(baseNames.get(patternIdx));
      leafProviders.add(new ConcatStringProvider(parentProvider, lastBaseName, null));
      patternIdx++;
    }
    // store a copy of the index list as key, because "indices" is changed further by the callers.
    res.put(new ArrayList<>(indices), leafProviders);
    return;
  }

  QueryableLongColumnShard lengthCol = getLengthColumn(indices);
  long maxLength =
      lengthCol.getColumnShardDictionary().decompressValue(lengthCol.getColumnShardDictionary().getMaxId());

  indices.add(0L);
  int ownSlot = indices.size() - 1;
  for (long curIdx = 0; curIdx < maxLength; curIdx++) {
    indices.set(ownSlot, curIdx);

    // Patterns which have no [*] at this depth simply keep the provider of their parent.
    List<ConcatStringProvider> childParents = new ArrayList<>(parentStringProviders);
    for (int patternIdx = 0; patternIdx < baseNames.size(); patternIdx++) {
      List<String> patternBaseNames = baseNames.get(patternIdx);
      if (patternBaseNames.size() - 1 <= fillIdx)
        // last baseName should not get repeated.
        continue;

      childParents.set(patternIdx, new ConcatStringProvider( //
          parentStringProviders.get(patternIdx), //
          patternBaseNames.get(fillIdx), //
          (int) curIdx));
    }

    createStringProviders(baseNames, fillIdx + 1, indices, res, childParents);
  }
  indices.remove(indices.size() - 1);
}
/**
 * Helper class: Hierarchical string creation with caching of the strings.
 *
 * The string of a provider is the string of its parent (if there is one), followed by either the baseName with the
 * repetition index applied (if an index was given) or the plain baseName.
 */
private class ConcatStringProvider {
  /** The full string of this provider, <code>null</code> as long as it was not computed. */
  private String cachedValue;

  private final ConcatStringProvider parent;

  private final String baseName;

  /** Repetition index to apply to {@link #baseName}, <code>null</code> if the baseName is appended as-is. */
  private final Integer repeatedIdx;

  private ConcatStringProvider(ConcatStringProvider parent, String baseName, Integer repeatedIdx) {
    this.parent = parent;
    this.baseName = baseName;
    this.repeatedIdx = repeatedIdx;
  }

  /**
   * @return The full string of this provider; computed on first call and cached afterwards.
   */
  public String create() {
    if (cachedValue == null) {
      StringBuilder target = new StringBuilder();
      createFill(target);
      cachedValue = target.toString();
    }
    return cachedValue;
  }

  /**
   * Appends the full string of this provider to the given builder and caches the builder content as value of this
   * provider, if there is no cached value yet.
   *
   * @param sb
   *          The builder to append to. It is expected to hold nothing but what the parents appended, because its
   *          whole content is cached as the value of this provider.
   */
  protected void createFill(StringBuilder sb) {
    if (cachedValue == null) {
      if (parent != null)
        parent.createFill(sb);

      if (repeatedIdx == null)
        sb.append(baseName);
      else
        sb.append(repeatedColNames.repeatedAtIndex(baseName, repeatedIdx));

      cachedValue = sb.toString();
    } else
      sb.append(cachedValue);
  }
}
}
}