/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.web;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.indeed.util.io.Files;
import com.indeed.imhotep.TermCount;
import com.indeed.imhotep.api.ImhotepSession;
import com.indeed.imhotep.client.ImhotepClient;
import com.indeed.imhotep.metadata.DatasetMetadata;
import com.indeed.imhotep.metadata.FieldMetadata;
import com.indeed.imhotep.metadata.FieldType;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.springframework.scheduling.annotation.Scheduled;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* @author vladimir
*/
public class TopTermsCache {
private static final Logger log = Logger.getLogger(TopTermsCache.class);
private static final int TERMS_TO_CACHE = 100;
private static final int DAYS_DELAY = 2;
private static final String CACHE_FILE_NAME = "toptermscache.bin";
private static final int CACHE_UPDATE_FREQUENCY = 24 * 60 * 60 * 1000; // 24 hours;
private final ImhotepClient client;
private final ImhotepMetadataCache imhotepMetadataCache;
private final String localCachePath;
private boolean initialized = false;
private final boolean devMode;
private volatile Map<String, Map<String, List<String>>> datasetToFieldToTerms = Maps.newHashMap();
public TopTermsCache(ImhotepClient client, ImhotepMetadataCache imhotepMetadataCache, String localCachePath, boolean devMode) {
this.client = client;
this.imhotepMetadataCache = imhotepMetadataCache;
this.localCachePath = localCachePath;
this.devMode = devMode;
}
@Scheduled(fixedRate = CACHE_UPDATE_FREQUENCY)
private void updateTopTerms() {
final File cacheFile = new File(localCachePath, CACHE_FILE_NAME);
final String cacheFilePath = cacheFile.getAbsolutePath();
if((!initialized || devMode) && cacheFile.exists()) {
try {
final TopTermsArtifact artifact = Files.readObjectFromFile(cacheFilePath, TopTermsArtifact.class, true);
if(artifact != null) {
final DateTime artifactExpirationTime = new DateTime(artifact.timestamp).plusMillis(CACHE_UPDATE_FREQUENCY);
if(DateTime.now().isBefore(artifactExpirationTime) || devMode) {
// persisted cache not expired. reuse
datasetToFieldToTerms = artifact.datasetToFieldToTerms;
initialized = true;
return;
}
}
} catch (Exception e) {
log.warn("Exception while reading " + CACHE_FILE_NAME, e);
}
}
// on further invocations, reload from imhotep
log.info("Starting TopTerms cache update. This may take a few minutes");
long started = System.currentTimeMillis();
datasetToFieldToTerms = updateTopTermsFromImhotep();
log.info("TopTerms cache update completed in " + (System.currentTimeMillis()-started)/1000 + "s");
initialized = true;
try {
Files.writeObjectToFileOrDie(new TopTermsArtifact(System.currentTimeMillis(), datasetToFieldToTerms), cacheFilePath);
} catch (IOException e) {
log.warn("Failed to serialize top terms cache to " + cacheFilePath);
}
}
private Map<String, Map<String, List<String>>> updateTopTermsFromImhotep() {
final Map<String, Map<String, List<String>>> newDatasetToFieldToTerms = Maps.newHashMap();
final DateTime startTime = DateTime.now().minusDays(DAYS_DELAY).withTimeAtStartOfDay().plusHours(12);
final DateTime endTime = startTime.plusHours(1);
for(final DatasetMetadata datasetMetadata : imhotepMetadataCache.getDatasets().values()) {
final String dataset = datasetMetadata.getName();
long started = System.currentTimeMillis();
final Map<String, List<String>> fieldToTerms = Maps.newHashMap();
final ImhotepSession imhotepSession;
try {
final ImhotepClient.SessionBuilder sessionBuilder = client.sessionBuilder(dataset, startTime, endTime).username("IQL: topterms");
if(sessionBuilder.getChosenShards().size() == 0) {
log.info("Index " + dataset + " has no shards for midday " + DAYS_DELAY + " days ago");
continue;
}
imhotepSession = sessionBuilder.build();
} catch (Exception e) {
log.warn("Failed to create a session for " + dataset + " " + startTime + " - " + endTime);
continue;
}
for(FieldMetadata fieldMetadata : datasetMetadata.getFields().values()) {
if(fieldMetadata.getType() != FieldType.String) {
continue; // we are trying to get some values for enum like string fields. can skip the random integer values
}
final String field = fieldMetadata.getName();
final List<TermCount> termCounts = imhotepSession.approximateTopTerms(field, false, TERMS_TO_CACHE);
if(termCounts.size() == 0) {
log.debug(dataset + "." + field + " has no terms");
}
final List<String> terms = Lists.newArrayList();
for(TermCount termCount : termCounts) {
terms.add(termCount.getTerm().getTermStringVal());
}
fieldToTerms.put(field, terms);
}
if(fieldToTerms.size() > 0) {
newDatasetToFieldToTerms.put(dataset, fieldToTerms);
}
long tookSeconds = (System.currentTimeMillis()-started)/1000;
if(tookSeconds > 1) {
log.debug("TopTerms for " + dataset + " loaded in " + tookSeconds + "s");
}
}
return newDatasetToFieldToTerms;
}
public List<String> getTopTerms(String dataset, String field) {
final Map<String, List<String>> fieldToTerms = datasetToFieldToTerms.get(dataset);
if(fieldToTerms == null) {
return Collections.emptyList();
}
final List<String> terms = fieldToTerms.get(field);
if(terms == null) {
return Collections.emptyList();
}
return Collections.unmodifiableList(terms);
}
}