/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.web;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.indeed.util.core.io.Closeables2;
import com.indeed.imhotep.DatasetInfo;
import com.indeed.imhotep.client.ImhotepClient;
import com.indeed.imhotep.metadata.DatasetMetadata;
import com.indeed.imhotep.metadata.FieldMetadata;
import com.indeed.imhotep.metadata.FieldType;
import com.indeed.imhotep.metadata.MetricMetadata;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.springframework.beans.factory.xml.XmlBeanFactory;
import org.springframework.core.io.FileSystemResource;
import org.springframework.scheduling.annotation.Scheduled;
import javax.annotation.Nonnull;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Pattern;
/**
* @author vladimir
*/
public class ImhotepMetadataCache {
private static final Logger log = Logger.getLogger(ImhotepMetadataCache.class);
// Current metadata snapshot, keyed by dataset name. updateDatasets() (annotated @Scheduled) replaces
// the whole map reference, so the field is volatile to publish each new snapshot to reader threads,
// the same way the whitelist field below is handled.
private volatile LinkedHashMap<String, DatasetMetadata> datasets = Maps.newLinkedHashMap();
// TODO: integrate into the metadata above?
private volatile Map<String, Set<String>> datasetToKeywordAnaylzerWhitelist = Maps.newHashMap();
private final ImhotepClient imhotepClient;
// Directory scanned for keywordAnalyzerWhitelist.json and the per-dataset metadata sub-directories.
private final String ramsesMetadataPath;
// Compiled regexes; a field whose full name matches any of them is dropped from the metadata.
private final List<Pattern> disabledFields = Lists.newArrayList();
/**
 * Creates the cache and compiles the disabled-field patterns.
 *
 * @param client Imhotep client queried for the dataset list and field names
 * @param ramsesMetadataPath directory that holds the metadata files read by this class
 * @param disabledFields comma-separated regular expressions; may be null. Each entry is trimmed
 *                       and compiled; entries that fail to compile are logged and ignored.
 */
public ImhotepMetadataCache(ImhotepClient client, String ramsesMetadataPath, String disabledFields) {
    imhotepClient = client;
    this.ramsesMetadataPath = ramsesMetadataPath;
    if (disabledFields == null) {
        return;
    }
    for (String field : disabledFields.split(",")) {
        try {
            this.disabledFields.add(Pattern.compile(field.trim()));
        } catch (IllegalArgumentException e) {
            // PatternSyntaxException is an IllegalArgumentException; pass it to the logger so the
            // reason for the failure is not lost
            log.warn("Failed to compile regex pattern for disabled field: " + field, e);
        }
    }
}
// runs every 60s; the actual shards in ImhotepClient are reloaded every 60s as well
/**
 * Rebuilds the dataset metadata from Imhotep and the metadata files, then publishes it.
 * When Imhotep reports no datasets the previously published metadata is left untouched.
 */
@Scheduled(fixedRate = 60000)
public void updateDatasets() {
    final Map<String, DatasetInfo> infoByDataset = imhotepClient.getDatasetToShardList();
    final List<String> sortedNames = new ArrayList<String>(infoByDataset.keySet());
    Collections.sort(sortedNames);
    if (sortedNames.isEmpty()) {
        // nothing came back, so keep serving what we already have
        log.warn("Imhotep returns no datasets");
        return;
    }

    // Step 1: one empty DatasetMetadata per dataset, inserted in sorted name order
    final LinkedHashMap<String, DatasetMetadata> rebuilt = Maps.newLinkedHashMap();
    for (String name : sortedNames) {
        rebuilt.put(name, new DatasetMetadata(name));
    }

    // Step 2: register the fields Imhotep knows about, minus the disabled ones
    for (DatasetInfo info : infoByDataset.values()) {
        final List<String> intFieldNames = Lists.newArrayList(info.getIntFields());
        final List<String> stringFieldNames = Lists.newArrayList(info.getStringFields());
        removeDisabledFields(intFieldNames);
        removeDisabledFields(stringFieldNames);
        Collections.sort(intFieldNames);
        Collections.sort(stringFieldNames);

        final DatasetMetadata metadata = rebuilt.get(info.getDataset());
        final LinkedHashMap<String, FieldMetadata> fields = metadata.getFields();
        for (String name : intFieldNames) {
            fields.put(name, new FieldMetadata(name, FieldType.Integer));
        }
        // string fields go in second, so a name present in both lists ends up typed as String
        for (String name : stringFieldNames) {
            fields.put(name, new FieldMetadata(name, FieldType.String));
        }
    }

    // Step 3: overlay what the metadata files provide
    loadMetadataFromFiles(rebuilt);

    // Step 4: standard aliases, then let every dataset finalize itself
    for (final DatasetMetadata metadata : rebuilt.values()) {
        addStandardAliases(metadata);
        metadata.finishLoading();
    }

    // the new snapshot is complete; publish it
    datasets = rebuilt;
}
/**
 * Removes, in place, every field name that fully matches at least one disabled-field pattern.
 *
 * @param fields mutable list of field names; its iterator must support remove()
 */
private void removeDisabledFields(List<String> fields) {
    final Iterator<String> iterator = fields.iterator();
    while (iterator.hasNext()) {
        final String field = iterator.next();
        for (Pattern regex : disabledFields) {
            if (regex.matcher(field).matches()) {
                iterator.remove();
                // Stop at the first match: calling remove() again for the same element
                // (when another pattern also matches) throws IllegalStateException.
                break;
            }
        }
    }
}
/**
 * Returns the currently published dataset metadata, keyed by dataset name.
 * The map handed back is the cache's own instance, not a copy.
 */
public LinkedHashMap<String, DatasetMetadata> getDatasets() {
    final LinkedHashMap<String, DatasetMetadata> published = this.datasets;
    return published;
}
/**
 * Looks up the metadata of one dataset.
 *
 * @param dataset dataset name
 * @return the cached metadata, or a new empty DatasetMetadata when the dataset is unknown
 */
@Nonnull
public DatasetMetadata getDataset(String dataset) {
    // Single lookup: a containsKey() followed by get() reads the field twice, and the map can be
    // replaced by updateDatasets() in between, which would let null escape from a @Nonnull method.
    final DatasetMetadata cached = datasets.get(dataset);
    if (cached == null) {
        return new DatasetMetadata(dataset); // empty
    }
    return cached;
}
/**
 * Returns the keyword-analyzer whitelist of a dataset.
 *
 * @param dataset dataset name
 * @return an unmodifiable view of the whitelisted names, or an empty set when the dataset has no entry
 */
public Set<String> getKeywordAnalyzerWhitelist(String dataset) {
    // Single lookup: the volatile map can be swapped between a containsKey() and a get(), in which
    // case the second read would return null and unmodifiableSet(null) would throw.
    final Set<String> whitelist = datasetToKeywordAnaylzerWhitelist.get(dataset);
    if (whitelist == null) {
        return Collections.emptySet();
    }
    return Collections.unmodifiableSet(whitelist);
}
/**
 * Reloads keywordAnalyzerWhitelist.json from the metadata directory and publishes the result.
 * When the file is absent, or reading/parsing fails, the previously published whitelist stays
 * in place; failures are logged at warn level.
 */
@Scheduled(fixedRate = 60000)
private void updateKeywordAnalyzerWhitelist() {
    FileInputStream is = null;
    try {
        final File whitelistFile = new File(ramsesMetadataPath, "keywordAnalyzerWhitelist.json");
        if (!whitelistFile.exists()) {
            return;
        }
        is = new FileInputStream(whitelistFile);
        final ObjectMapper mapper = new ObjectMapper();
        final Map<String, List<String>> tmpMap = mapper.readValue(is, new TypeReference<Map<String,List<String>>>(){});
        final Map<String, Set<String>> newKeywordAnaylzerWhitelist = Maps.newHashMap();
        for (final Map.Entry<String, List<String>> entry : tmpMap.entrySet()) {
            final Set<String> whitelistedFields = Sets.newHashSet(entry.getValue());
            newKeywordAnaylzerWhitelist.put(entry.getKey(), whitelistedFields);
        }
        // publish only after the whole file was converted successfully
        datasetToKeywordAnaylzerWhitelist = newKeywordAnaylzerWhitelist;
    } catch (Exception e) {
        log.warn("Failed to process keywordAnalyzerWhitelist.json", e);
    } finally {
        // Close explicitly instead of depending on the JSON parser's auto-close setting;
        // closing a stream that is already closed has no effect.
        if (is != null) {
            Closeables2.closeQuietly(is, log);
        }
    }
}
/**
 * Walks the sub-directories of the metadata directory and, for each one named after a known
 * dataset, loads its dimensions and suggestions into that dataset's metadata.
 *
 * @param newDatasetToAliases metadata under construction, keyed by dataset name
 * @return false when the metadata directory is missing or cannot be listed, true otherwise
 */
private boolean loadMetadataFromFiles(LinkedHashMap<String, DatasetMetadata> newDatasetToAliases) {
    final File metadataRoot = new File(ramsesMetadataPath);
    final boolean usableDirectory = metadataRoot.exists() && metadataRoot.isDirectory();
    if (!usableDirectory) {
        log.error("Directory not found at " + ramsesMetadataPath);
        return false;
    }

    final File[] children = metadataRoot.listFiles();
    if (children == null) {
        log.error("Failed to stat directory at " + ramsesMetadataPath);
        return false;
    }

    for (File child : children) {
        if (child.isDirectory()) {
            final String datasetName = child.getName();
            final DatasetMetadata target = newDatasetToAliases.get(datasetName);
            if (target != null) {
                loadDimensions(child, target);
                loadSuggestions(child, target);
            } else {
                log.trace("Found dimensions data for unknown dataset: " + datasetName);
            }
        }
    }
    return true;
}
/**
 * Adds the aliases applicable to all indexes: the "counts" metric, the "time" metric/field for
 * Ramses datasets, and the "dayofweek" / "timeofday" metrics derived from the time field.
 */
private void addStandardAliases(DatasetMetadata datasetMetadata) {
    final String countsName = "counts";
    MetricMetadata counts = datasetMetadata.getMetric(countsName);
    if (counts == null) {
        counts = new MetricMetadata(countsName);
        datasetMetadata.getMetrics().put(countsName, counts);
    }
    // for Ramses datasets counts must stay pushable so that scaling can be applied,
    // hence the expression is only set for the other datasets
    if (!datasetMetadata.isRamsesDataset()) {
        counts.setExpression("count()");
    }
    counts.setDescription("Count of all documents");

    final String timeField = datasetMetadata.getTimeFieldName();

    // Ramses indexes need the time field added by hand // TODO: why is it not returned by Imhotep?
    if (datasetMetadata.isRamsesDataset()) {
        final String timeName = "time";
        final String description = "Unix timestamp (seconds since epoch)";

        MetricMetadata timeAsMetric = datasetMetadata.getMetric(timeName);
        if (timeAsMetric == null) {
            timeAsMetric = new MetricMetadata(timeName);
            datasetMetadata.getMetrics().put(timeName, timeAsMetric);
        }
        timeAsMetric.setDescription(description);
        timeAsMetric.setUnit("seconds");

        FieldMetadata timeAsField = datasetMetadata.getField(timeName);
        if (timeAsField == null) {
            timeAsField = new FieldMetadata(timeName, FieldType.String);
            datasetMetadata.getFields().put(timeName, timeAsField);
        }
        timeAsField.setDescription(description);
        // whatever type the field was registered with, it ends up as Integer
        timeAsField.setType(FieldType.Integer);
    }

    final String dayOfWeekExpression = "(((" + timeField + "-280800)%604800)\\86400)";
    final String timeOfDayExpression = "((" + timeField + "-21600)%86400)";
    tryAddMetricAlias("dayofweek", dayOfWeekExpression, "day of week (days since Sunday)", datasetMetadata);
    tryAddMetricAlias("timeofday", timeOfDayExpression, "time of day (seconds since midnight)", datasetMetadata);
}
// Names that may never be used as a metric alias. Declared final so the reference cannot be reassigned.
private static final Set<String> RESERVED_KEYWORDS = ImmutableSet.of("time", "bucket", "buckets", "lucene", "in");

/**
 * Registers {@code replacement} as the expression of metric {@code metricName}, creating the
 * metric when it does not exist yet.
 * The alias is skipped when the name is a reserved keyword, or when it would hide an existing
 * field and the replacement is not a floatscale expression.
 *
 * @param metricName name of the metric to alias
 * @param replacement expression the metric resolves to
 * @param description description to set on the metric; null leaves the current description as is
 * @param datasetMetadata dataset that receives the alias
 * @return true when the alias was added, false when it was skipped
 */
private boolean tryAddMetricAlias(String metricName, String replacement, String description, DatasetMetadata datasetMetadata) {
    // allow floatscale operation to replace the original field as floats are not usable as is
    final boolean hidesExistingField = datasetMetadata.hasField(metricName) && !replacement.startsWith("floatscale");
    // explicit grouping: (hides a field) OR (reserved keyword)
    if (hidesExistingField || RESERVED_KEYWORDS.contains(metricName)) {
        log.trace("Skipped adding alias due to conflict: " + datasetMetadata.getName() + "." + metricName + "->" + replacement);
        return false;
    }
    MetricMetadata metricMetadata = datasetMetadata.getMetric(metricName);
    if (metricMetadata == null) {
        metricMetadata = new MetricMetadata(metricName);
        datasetMetadata.getMetrics().put(metricName, metricMetadata);
    }
    metricMetadata.setExpression(replacement);
    if (description != null) {
        metricMetadata.setDescription(description);
    }
    return true;
}
/**
 * Loads the "suggestionMap" bean from suggestions.xml in the dataset directory, if the file
 * exists, and registers each entry as a description on the dataset metadata.
 * A file that cannot be loaded is logged and skipped so that one broken file does not abort
 * the metadata refresh for every other dataset.
 *
 * @param indexDir directory holding the dataset's metadata files
 * @param datasetMetadata metadata that receives the descriptions
 */
private void loadSuggestions(File indexDir, DatasetMetadata datasetMetadata) {
    final File suggestionsXml = new File(indexDir, "suggestions.xml");
    if (!suggestionsXml.exists()) {
        return;
    }
    final Map<String, String> suggestions;
    try {
        @SuppressWarnings("unchecked")
        final Map<String, String> loaded = (Map<String, String>) new XmlBeanFactory(new FileSystemResource(suggestionsXml)).getBean("suggestionMap");
        suggestions = loaded;
    } catch (RuntimeException e) {
        // covers bean-factory failures (unparsable XML, missing bean) and a bean that is not a Map
        log.warn("Suggestions file read failed for " + indexDir, e);
        return;
    }
    if (suggestions != null) {
        for (Map.Entry<String, String> suggestion : suggestions.entrySet()) {
            datasetMetadata.addFieldMetricDescription(suggestion.getKey(), suggestion.getValue(), null, false, true, false);
        }
    }
}
/**
 * Loads metrics descriptions and aliases for an index from a Ramses dimensions file.
 * <p>
 * Each usable line of dimensions.desc is a comma-separated record:
 * name, description, unit, type, then one operand ("lossless") or two operands
 * ("add", "subtract", "multiply", "divide"). Lines starting with "#" are skipped, except
 * lines starting with "#/" which are read with that prefix removed. A leading "!" on a name marks
 * it as hidden. Aliases are collected first and resolved once the whole file has been read, so
 * they can refer to each other regardless of their order in the file.
 *
 * @param indexDir directory holding the dataset's metadata files
 * @param datasetMetadata metadata that receives the descriptions and aliases
 */
private void loadDimensions(File indexDir, DatasetMetadata datasetMetadata) {
    final File dimensionsFile = new File(indexDir, "dimensions.desc");
    if (!dimensionsFile.exists()) {
        return;
    }
    BufferedReader reader = null;
    try {
        final Map<String, Alias> fieldToAlias = Maps.newHashMap();
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(dimensionsFile)));
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (line.startsWith("#")) {
                if (line.startsWith("#/")) {
                    // dimension only for IQL but not ramses hack
                    line = line.substring(2);
                } else {
                    continue;
                }
            }
            String[] split = line.split(",");
            if (split.length < 5) {
                continue; // invalid field entry?
            }
            String name = split[0].trim();
            String desc = split[1].trim();
            String unit = split[2].trim();
            final String dimType = split[3].trim();
            if (Strings.isNullOrEmpty(unit) || "null".equals(unit)) {
                unit = null;
            }
            if (Strings.isNullOrEmpty(desc) || "null".equals(desc)) {
                desc = null;
            }
            boolean isHidden = name.startsWith("!");
            if (isHidden) {
                name = name.substring(1);
            }
            if (name.equals("time")) {
                continue; // time is a reserved field/keyword
            }
            boolean metricHasField = false;
            Alias alias = null;
            if ("add".equals(dimType) || "subtract".equals(dimType) ||
                    "multiply".equals(dimType) || "divide".equals(dimType)) {
                // Binary operations need a second operand in column 6. Without this check a
                // 5-column line would throw ArrayIndexOutOfBoundsException, which is not caught
                // below and would propagate to the caller.
                if (split.length < 6) {
                    log.warn("Skipping " + dimType + " dimension with a missing operand in " + dimensionsFile + ": " + line);
                    continue;
                }
                String dim1 = split[4].trim();
                String dim2 = split[5].trim();
                if (dim1.startsWith("!")) {
                    dim1 = dim1.substring(1);
                }
                if (dim2.startsWith("!")) {
                    dim2 = dim2.substring(1);
                }
                final String op;
                if ("add".equals(dimType)) {
                    op = "+";
                } else if ("subtract".equals(dimType)) {
                    op = "-";
                } else if ("divide".equals(dimType)) {
                    op = "\\";
                } else {
                    op = "*";
                }
                alias = new CompositeOp(op, dim1, dim2, isHidden);
            } else if ("lossless".equals(dimType)) {
                String realField = split[4].trim();
                if (!name.equals(realField)) {
                    if (realField.startsWith("floatscale")) {
                        // rewrite the first space as "(", "*" and "+" as "," and append ")"
                        realField = realField.replace(' ', '(').replace('*', ',').replace('+', ',') + ')';
                    }
                    alias = new SimpleField(realField, isHidden);
                } else {
                    metricHasField = true;
                }
            }
            if (!(isHidden && alias != null)) { // if it's an aliased hidden metric, it's intermediary and we can skip it
                datasetMetadata.addFieldMetricDescription(name, desc, unit, isHidden, metricHasField, true);
            }
            if (alias != null) {
                fieldToAlias.put(name, alias);
            }
        }
        // now that we have all the aliases loaded we can resolve them
        for (Map.Entry<String, Alias> entry : fieldToAlias.entrySet()) {
            final Alias alias = entry.getValue();
            if (alias.hidden) {
                continue; // this is just an intermediate metric
            }
            final String metricName = entry.getKey();
            final String resolvedAlias = alias.resolve(fieldToAlias);
            if (resolvedAlias == null) {
                log.warn("Found a metric alias with a circular dependency which is illegal: " + datasetMetadata.getName() + "." + metricName);
                continue;
            }
            // tryAddMetricAlias logs its own message when it skips, so only report real additions
            if (tryAddMetricAlias(metricName, resolvedAlias, null, datasetMetadata)) {
                log.trace("Aliasing: " + datasetMetadata.getName() + "." + metricName + "->" + resolvedAlias);
            }
        }
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so a single handler covers both
        log.warn("Dimensions file read failed for " + indexDir, e);
    } finally {
        if (reader != null) {
            Closeables2.closeQuietly(reader, log);
        }
    }
}
/**
 * A metric alias read from a dimensions file, resolvable to an expression string.
 */
private static abstract class Alias {
    /** True when the aliased name was marked hidden (intermediate metric). */
    boolean hidden;

    protected Alias(boolean hidden) {
        super();
        this.hidden = hidden;
    }

    /**
     * Resolves this alias to its expression.
     *
     * @param fieldToAlias all known aliases by name, for resolving references to other aliases
     * @return the resolved expression; implementations may return null when resolution fails
     */
    abstract String resolve(Map<String,Alias> fieldToAlias);
}
/**
 * Alias that stands for a single field name or ready-made expression.
 */
private static class SimpleField extends Alias {
    /** The text this alias resolves to. */
    String fieldName;

    private SimpleField(String fieldName, boolean hidden){
        super(hidden);
        this.fieldName = fieldName;
    }

    /** Returns the stored name as is; the alias map is not consulted. */
    @Override
    public String resolve(Map<String, Alias> fieldToAlias) {
        final String resolved = this.fieldName;
        return resolved;
    }
}
/**
 * Alias for a binary operation over two operands, each of which may itself be an alias.
 */
private static class CompositeOp extends Alias {
    String operator;
    String dim1;
    String dim2;
    // set while resolve() is running on this object, so that re-entering it reveals a cycle
    boolean isSeen;

    private CompositeOp(String operator, String dim1, String dim2, boolean hidden) {
        super(hidden);
        this.operator = operator;
        this.dim1 = dim1;
        this.dim2 = dim2;
    }

    /**
     * Resolves both operands and joins them as "(left operator right)".
     *
     * @param fieldToAlias all known aliases by name
     * @return the parenthesized expression, or null when this alias is part of a reference cycle
     */
    @Override
    public String resolve(Map<String, Alias> fieldToAlias) {
        if (isSeen) {
            return null; // already on the resolution path: stop the recursion
        }
        isSeen = true;
        final String left;
        final String right;
        try {
            // both operands are always resolved, left first
            left = resolveOperand(dim1, fieldToAlias);
            right = resolveOperand(dim2, fieldToAlias);
        } finally {
            isSeen = false;
        }
        if (left == null || right == null) {
            return null; // a loop was hit further down
        }
        return "(" + left + operator + right + ")";
    }

    // Resolves one operand: through its alias when one is registered, otherwise the name itself.
    private static String resolveOperand(String operand, Map<String, Alias> fieldToAlias) {
        final Alias nested = fieldToAlias.get(operand);
        return nested == null ? operand : nested.resolve(fieldToAlias);
    }
}
}