/*
 * Copyright (C) 2014 Indeed Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.indeed.imhotep.iql;

import au.com.bytecode.opencsv.CSVWriter;
import com.google.common.base.Charsets;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.io.ByteStreams;
import com.google.common.util.concurrent.UncheckedTimeoutException;
import com.indeed.util.core.TreeTimer;
import com.indeed.imhotep.ShardInfo;
import com.indeed.imhotep.api.ImhotepOutOfMemoryException;
import com.indeed.imhotep.api.ImhotepSession;
import com.indeed.imhotep.client.ImhotepClient;
import com.indeed.imhotep.client.ShardIdWithVersion;
import com.indeed.imhotep.ez.EZImhotepSession;
import com.indeed.imhotep.ez.GroupKey;
import com.indeed.imhotep.ez.StatReference;
import com.indeed.imhotep.web.ImhotepMetadataCache;
import com.indeed.util.core.Pair;
import com.indeed.util.core.io.Closeables2;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.Period;
import org.joda.time.format.PeriodFormat;

import javax.annotation.Nonnull;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import static com.indeed.imhotep.ez.Stats.Stat;

/**
 * @author jplaisance
 */
public final class IQLQuery implements Closeable {
    private static final int IN_MEMORY_ROW_LIMIT = 50000;
    private static final Logger log = Logger.getLogger(IQLQuery.class);
    private static final Period executionTimeout = Period.minutes(6);
    public static final String TEMP_FILE_PREFIX = "iql_tmp";
    private static final String EVENT_SOURCE_END = "\n\n";

    private final List<Stat> stats;
    private final String dataset;
    private final DateTime start;
    private final DateTime end;
    private final List<Condition> conditions;
    private final List<Grouping> groupings;
    private final int rowLimit;
    private final ImhotepMetadataCache metadata;
    private final List<ShardIdWithVersion> shardVersionList;
    private final List<Interval> timeIntervalsMissingShards;
    private final ImhotepClient.SessionBuilder sessionBuilder;
    // session used for the current execution
    private EZImhotepSession session;

    public IQLQuery(ImhotepClient client, final List<Stat> stats, final String dataset, final DateTime start,
                    final DateTime end, final @Nonnull List<Condition> conditions,
                    final @Nonnull List<Grouping> groupings, final int rowLimit, final String username,
                    ImhotepMetadataCache metadata) {
        this(client, stats, dataset, start, end, conditions, groupings, rowLimit, username, metadata, -1, -1);
    }
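    /**
     * Primary constructor. Building the query eagerly asks the ImhotepClient to choose shards for
     * the [start, end) range, so getShardVersionList() and getTimeIntervalsMissingShards() are
     * usable before execute() is called. An illustrative call (the client, metadata cache and
     * "mydataset" dataset name here are hypothetical, supplied by the caller):
     * <pre>
     *   final IQLQuery query = new IQLQuery(client, stats, "mydataset", start, end,
     *           conditions, groupings, 1000, "someuser", metadataCache, -1, -1);
     * </pre>
     * The trailing -1, -1 mirror the temp file size limit defaults passed by the convenience
     * constructor above.
     */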
    public IQLQuery(ImhotepClient client, final List<Stat> stats, final String dataset, final DateTime start,
                    final DateTime end, final @Nonnull List<Condition> conditions,
                    final @Nonnull List<Grouping> groupings, final int rowLimit, final String username,
                    ImhotepMetadataCache metadata, final long imhotepLocalTempFileSizeLimit,
                    final long imhotepDaemonTempFileSizeLimit) {
        this.stats = stats;
        this.dataset = dataset;
        this.start = start;
        this.end = end;
        this.conditions = conditions;
        this.groupings = groupings;
        this.rowLimit = rowLimit;
        this.metadata = metadata;

        sessionBuilder = client.sessionBuilder(dataset, start, end)
                .localTempFileSizeLimit(imhotepLocalTempFileSizeLimit)
                .daemonTempFileSizeLimit(imhotepDaemonTempFileSizeLimit)
                .username(username);
        shardVersionList = sessionBuilder.getChosenShards();
        timeIntervalsMissingShards = sessionBuilder.getTimeIntervalsMissingShards();
    }

    /**
     * Not thread safe due to session reference caching for close().
     */
    public ExecutionResult execute(boolean progress, OutputStream outputStream, boolean getTotals) throws ImhotepOutOfMemoryException {
        // if outputStream passed, update on progress
        final PrintWriter out = progress ? new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(outputStream), Charsets.UTF_8)) : null;

        final TreeTimer timer = new TreeTimer();
        timer.push("Imhotep session creation");
        final ImhotepSession imhotepSession = sessionBuilder.build();
        session = new EZImhotepSession(imhotepSession);
        timer.pop();

        final long timeoutTS = System.currentTimeMillis() + executionTimeout.toStandardSeconds().getSeconds() * 1000;

        try {
            final int steps = conditions.size() + (groupings.size() == 0 ? 1 : groupings.size()) - 1;
            int count = 0;
            if (progress) {
                out.println(": Beginning IQL Query");
                out.println("event: totalsteps");
                out.print("data: " + steps + EVENT_SOURCE_END);
                out.print(": Starting time filter" + EVENT_SOURCE_END);
                out.flush();
            }

            timer.push("Time filter");
            timeFilter(session);
            timer.pop();

            if (progress) {
                out.print(": Time filtering finished" + EVENT_SOURCE_END);
                out.flush();
            }

            for (Condition condition : conditions) {
                checkTimeout(timeoutTS);
                timer.push("Filtering " + condition.getClass().getSimpleName());
                condition.filter(session);
                timer.pop();
                count = updateProgress(progress, out, count);
            }

            if (groupings.size() > 0) {
                List<StatReference> statRefs = null;
                double[] totals = new double[0];
                if (getTotals) {
                    timer.push("Pushing stats");
                    statRefs = pushStats(session);
                    timer.pop();
                    timer.push("Getting totals");
                    totals = getStats(statRefs);
                    timer.pop();
                }

                Map<Integer, GroupKey> groupKeys = EZImhotepSession.newGroupKeys();
                // do Imhotep regroup on all except the last grouping
                for (int i = 0; i < groupings.size() - 1; i++) {
                    checkTimeout(timeoutTS);
                    timer.push("Regroup " + (i + 1));
                    groupKeys = groupings.get(i).regroup(session, groupKeys);
                    timer.pop();
                    count = updateProgress(progress, out, count);
                }
                checkTimeout(timeoutTS);

                if (!getTotals) {
                    timer.push("Pushing stats");
                    statRefs = pushStats(session);
                    timer.pop();
                }

                // do FTGS on the last grouping
                timer.push("FTGS");
                final Iterator<GroupStats> groupStatsIterator =
                        groupings.get(groupings.size() - 1).getGroupStats(session, groupKeys, statRefs, timeoutTS);
                timer.pop();
                updateProgress(progress, out, count);
                return new ExecutionResult(groupStatsIterator, totals, timer.toString(), session.getTempFilesBytesWritten());
            } else {
                timer.push("Pushing stats");
                final List<StatReference> statRefs = pushStats(session);
                timer.pop();
                timer.push("Getting stats");
                final double[] stats = getStats(statRefs);
                timer.pop();
                count = updateProgress(progress, out, count);
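                // No groupings requested: the whole result is a single row of the overall
                // stat totals, keyed by the empty group key built below.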
                final List<GroupStats> result = Lists.newArrayList();
                result.add(new GroupStats(GroupKey.<Comparable>empty(), stats));
                return new ExecutionResult(result.iterator(), stats, timer.toString(), session.getTempFilesBytesWritten());
            }
        } catch (Throwable t) {
            log.error("Error while executing the query", t);
            throw Throwables.propagate(t);
        }
    }

    private int updateProgress(boolean progress, PrintWriter out, int count) {
        count++;
        if (progress) {
            out.println("event: chunkcomplete");
            out.print("data: " + count + EVENT_SOURCE_END);
            out.flush();
        }
        return count;
    }

    private double[] getStats(List<StatReference> statRefs) {
        final double[] stats = new double[statRefs.size()];
        for (int i = 0; i < statRefs.size(); i++) {
            final double[] groupStat = session.getGroupStats(statRefs.get(i));
            stats[i] = groupStat.length > 1 ? groupStat[1] : 0;
        }
        return stats;
    }

    public static class ExecutionResult {
        private final Iterator<GroupStats> rows;
        private final double[] totals;
        private final String timings;
        private final long imhotepTempFilesBytesWritten;

        public ExecutionResult(Iterator<GroupStats> rows, double[] totals, String timings, long imhotepTempFilesBytesWritten) {
            this.rows = rows;
            this.totals = totals;
            this.timings = timings;
            this.imhotepTempFilesBytesWritten = imhotepTempFilesBytesWritten;
        }

        public Iterator<GroupStats> getRows() {
            return rows;
        }

        public double[] getTotals() {
            return totals;
        }

        public String getTimings() {
            return timings;
        }

        public long getImhotepTempFilesBytesWritten() {
            return imhotepTempFilesBytesWritten;
        }
    }

    private void timeFilter(EZImhotepSession session) throws ImhotepOutOfMemoryException {
        final Pair<Long, Long> shardsMinMax = getShardsMinMax(shardVersionList);
        final long min = shardsMinMax.getFirst();
        final long max = shardsMinMax.getSecond();
        if (min < start.getMillis() || max > end.getMillis()) {
            new MetricCondition(EZImhotepSession.intField(getTimeField()),
                    (int) (start.getMillis() / 1000), (int) ((end.getMillis() - 1) / 1000), false).filter(session);
        }
    }
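    // Note that timeFilter() above works in epoch seconds: the metric condition takes int bounds,
    // so the millisecond range is narrowed to [start/1000, (end-1)/1000] inclusive, and the
    // filter is only applied when the chosen shards extend past the requested [start, end) range.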
    /**
     * Throws UncheckedTimeoutException if current time is past the provided timeout timestamp.
     * @param timeoutTS timestamp of when the query times out in milliseconds
     */
    public void checkTimeout(long timeoutTS) {
        if (System.currentTimeMillis() > timeoutTS) {
            throw new UncheckedTimeoutException("The query took longer than the allowed timeout of "
                    + executionTimeout.toString(PeriodFormat.getDefault()));
        }
    }

    /**
     * Returns minimum and maximum milliseconds covered by the list of shards
     */
    private static Pair<Long, Long> getShardsMinMax(final List<ShardIdWithVersion> shardVersionList) {
        long min = Long.MAX_VALUE;
        long max = Long.MIN_VALUE;
        for (ShardIdWithVersion shard : shardVersionList) {
            final ShardInfo.DateTimeRange interval = shard.getRange();
            if (interval.start.getMillis() < min) {
                min = interval.start.getMillis();
            }
            if (interval.end.getMillis() > max) {
                max = interval.end.getMillis();
            }
        }
        return Pair.of(min, max);
    }

    @Nonnull
    private String getTimeField() {
        return metadata.getDataset(dataset).getTimeFieldName();
    }

    private List<StatReference> pushStats(EZImhotepSession session) throws ImhotepOutOfMemoryException {
        final List<StatReference> statRefs = Lists.newArrayList();
        for (Stat stat : stats) {
            final StatReference statReference = session.pushStatGeneric(stat);
            statRefs.add(statReference);
        }
        return statRefs;
    }

    private boolean requiresSorting() {
        // TODO: enable sorting
//        if(groupings.size() > 0) {
//            final Grouping lastGrouping = groupings.get(groupings.size() - 1);
//            if(lastGrouping instanceof FieldGrouping && ((FieldGrouping)lastGrouping).isNoExplode()) {
//                return true; // currently we only have to sort when using non-exploded field grouping as the last grouping
//            }
//        }
        return false;
    }

    public static class WriteResults {
        public final int rowsWritten;
        public final File unsortedFile;
        public final Iterator<GroupStats> resultCacheIterator;
        public final long timeTaken;

        public WriteResults(int rowsWritten, File unsortedFile, Iterator<GroupStats> resultCacheIterator, long timeTaken) {
            this.rowsWritten = rowsWritten;
            this.unsortedFile = unsortedFile;
            this.resultCacheIterator = resultCacheIterator;
            this.timeTaken = timeTaken;
        }

        public boolean didOverflowToDisk() {
            return unsortedFile != null;
        }
    }
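    // outputResults() below takes one of three paths: stream rows straight to the client when
    // caching is disabled and no sort is needed, buffer small result sets (up to
    // IN_MEMORY_ROW_LIMIT rows) in memory, or spill to a temp file on disk, optionally running
    // it through GNU sort before sending it out.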
    @Nonnull
    public WriteResults outputResults(final Iterator<GroupStats> rows, OutputStream httpOutStream, final boolean csv,
                                      final boolean progress, final int rowLimit, int groupingColumns,
                                      int selectColumns, boolean cacheDisabled) {
        final long timeStarted = System.currentTimeMillis();
        final boolean requiresSorting = requiresSorting();
        if (cacheDisabled && !requiresSorting) {
            // just stream the rows out. don't have to worry about keeping a copy at all
            final int rowsWritten = writeRowsToStream(rows, httpOutStream, csv, rowLimit, progress);
            return new WriteResults(rowsWritten, null, null, System.currentTimeMillis() - timeStarted);
        }

        List<GroupStats> resultsCache = Lists.newArrayList();
        int rowsLoaded = 0;
        boolean cacheOverflow = false;
        while (rows.hasNext()) {
            resultsCache.add(rows.next());
            if ((++rowsLoaded) >= IN_MEMORY_ROW_LIMIT) {
                cacheOverflow = true;
                break;
            }
        }
        // TODO: figure out the size of the resulting data for reporting or limiting?

        if (!cacheOverflow) {
            // results fit in memory. stream them out
            // TODO: in memory sort if necessary? or just always defer to gnu sort?
            final int rowsWritten = writeRowsToStream(resultsCache.iterator(), httpOutStream, csv, rowLimit, progress);
            return new WriteResults(rowsWritten, null, resultsCache.iterator(), System.currentTimeMillis() - timeStarted);
        } else {
            // have to work with the files on the hard drive to avoid OOM
            try {
                final File unsortedFile = File.createTempFile(TEMP_FILE_PREFIX, null);
                final FileOutputStream fileOutputStream = new FileOutputStream(unsortedFile);
                final long started = System.currentTimeMillis();
                int rowsWritten = 0;
                // flush cache
                rowsWritten += writeRowsToStream(resultsCache.iterator(), fileOutputStream, csv, Integer.MAX_VALUE, false);
                //noinspection UnusedAssignment
                resultsCache = null; // let it be GC'd
                // save the remaining rows to disk
                rowsWritten += writeRowsToStream(rows, fileOutputStream, csv, Integer.MAX_VALUE, false);
                fileOutputStream.close();
                log.trace("Stored on disk to " + unsortedFile.getPath() + " in " + (System.currentTimeMillis() - started) + "ms");

                final File sortedFile;
                if (requiresSorting) {
                    // do on disk sort with gnu sort
                    sortedFile = sortFile(unsortedFile, groupingColumns, selectColumns);
                } else {
                    sortedFile = unsortedFile;
                }

                // send the results out to the client
                copyStream(new FileInputStream(sortedFile), httpOutStream, rowLimit, progress);
                return new WriteResults(rowsWritten, unsortedFile, null, System.currentTimeMillis() - timeStarted);
            } catch (IOException e) {
                throw Throwables.propagate(e);
            }
        }
    }

    /**
     * Sorts the given file by invoking the GNU 'sort' command and returns a reference to the sorted copy.
     * Expects inputFile to have ".tmp" in the name.
     */
    private File sortFile(File inputFile, int groupingColumns, int selectColumns) {
        try {
            final long started = System.currentTimeMillis();
            final File sortedFile = new File(inputFile.getPath().replace(".tmp", ".sorted.tmp"));
            final List<String> sortCmd = Lists.newArrayList("sort", "-o", sortedFile.getPath(), "-t", "\t");
            // TODO: custom sorting orders
            for (int i = 1; i <= groupingColumns; i++) {
                sortCmd.add("-k" + i + "," + i);
            }
            for (int i = groupingColumns + 1; i <= groupingColumns + selectColumns; i++) {
                sortCmd.add("-k" + i + "," + i + "n");
            }
            sortCmd.add(inputFile.getPath());
            log.trace(IQLQuery.join(sortCmd, " "));

            final Process sortProc = Runtime.getRuntime().exec(sortCmd.toArray(new String[sortCmd.size()]), null);
            final int exitCode = sortProc.waitFor();
            if (exitCode != 0) {
                // fail loudly instead of silently streaming a missing/empty sorted file
                throw new IOException("'sort' exited with code " + exitCode);
            }
            log.trace("Sorted to: " + sortedFile.getPath() + " in " + (System.currentTimeMillis() - started) + "ms");
            return sortedFile;
        } catch (IOException e) {
            throw Throwables.propagate(e);
        } catch (InterruptedException e) {
            throw Throwables.propagate(e);
        }
    }
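    // For example, with 2 grouping columns and 1 select column, sortFile() above builds a
    // command along these lines (the temp file paths are illustrative):
    //   sort -o /tmp/iql_tmp123.sorted.tmp -t '\t' -k1,1 -k2,2 -k3,3n /tmp/iql_tmp123.tmp
    // i.e. grouping columns compare lexicographically and stat columns numerically (the
    // trailing 'n').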
    private static String join(Collection<?> items, String delimiter) {
        final StringBuilder sb = new StringBuilder(items.size() * 7);
        for (final Iterator<?> it = items.iterator(); it.hasNext(); ) {
            sb.append(it.next());
            if (it.hasNext()) {
                sb.append(delimiter);
            }
        }
        return sb.toString();
    }

    /**
     * Copies everything from the input stream to the output stream, limiting to the requested number of lines if necessary.
     * The input stream is closed; the output stream is flushed but not closed when done.
     */
    public static int copyStream(InputStream inputStream, OutputStream outputStream, int lineLimit, boolean eventSource) {
        final String EVENT_NAME = "resultstream";
        try {
            if (!eventSource && (lineLimit == Integer.MAX_VALUE || lineLimit <= 0)) {
                // no need to count rows so copy streams completely
                // we can't do this if we need the eventSource data
                ByteStreams.copy(inputStream, outputStream);
                outputStream.flush();
                return 0; // unknown how many rows were copied as we haven't counted
            }

            // have to count the lines as we copy to enforce the limit
            final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, Charsets.UTF_8));
            final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8));

            if (eventSource) {
                writer.write("event: " + EVENT_NAME);
                writer.newLine();
            }

            String line = reader.readLine();
            int linesCopied = 0;
            while (line != null) {
                if (eventSource) {
                    line = "data: " + line;
                }
                writer.write(line);
                writer.newLine();
                if (++linesCopied >= lineLimit) {
                    break;
                }
                line = reader.readLine();
            }
            if (eventSource) {
                writer.write("\nevent: complete\ndata: :)\n\n");
            }
            Closeables2.closeQuietly(reader, log);
            writer.flush();
            return linesCopied;
        } catch (IOException e) {
            throw Throwables.propagate(e);
        } finally {
            Closeables2.closeQuietly(inputStream, log);
        }
    }

    public static int writeRowsToStream(final Iterator<GroupStats> rows, OutputStream os, final boolean csv,
                                        final int rowLimit, final boolean progress) {
        // TODO: how much precision do we want?
        final DecimalFormat format = new DecimalFormat("#.#######");
        final String tsvDelimiter = "\t";
        final PrintWriter out = new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(os), Charsets.UTF_8));
        final CSVWriter csvWriter;
        final List<String> csvFields;
        if (csv) {
            csvWriter = new CSVWriter(out);
            csvFields = Lists.newArrayList();
        } else {
            csvWriter = null;
            csvFields = null;
        }
        int rowsProcessed = 0;

        if (progress) {
            out.println("event: resultstream");
        }
        while (rows.hasNext()) {
            final GroupStats entry = rows.next();
            if (entry == null) {
                continue;
            }
            if (progress) {
                out.print("data: ");
            }
            if (!csv) { // TSV
                GroupKey current = entry.groupKey;
                while (!current.isEmpty()) {
                    out.print(current.head());
                    current = current.tail();
                    if (!current.isEmpty()) {
                        out.print(tsvDelimiter);
                    }
                }
                for (double l : entry.stats) {
                    out.print(tsvDelimiter);
                    out.print(Double.isNaN(l) ? "NaN" : format.format(l));
                }
                out.println();
            } else { // CSV
                GroupKey current = entry.groupKey;
                while (!current.isEmpty()) {
                    csvFields.add(current.head().toString());
                    current = current.tail();
                }
                for (double l : entry.stats) {
                    // print NaN explicitly to match the TSV output; DecimalFormat's NaN symbol is locale dependent
                    csvFields.add(Double.isNaN(l) ? "NaN" : format.format(l));
                }
                csvWriter.writeNext(csvFields.toArray(new String[csvFields.size()]));
                csvFields.clear(); // reused on next iteration
            }
            if (++rowsProcessed >= rowLimit) {
                break; // reached the requested row limit
            }
        }
        if (progress) {
            out.print("\nevent: complete\ndata: :)" + EVENT_SOURCE_END);
        }
        out.flush();
        return rowsProcessed;
    }

    public List<ShardIdWithVersion> getShardVersionList() {
        return shardVersionList;
    }

    public List<Interval> getTimeIntervalsMissingShards() {
        return timeIntervalsMissingShards;
    }

    public int getRowLimit() {
        return rowLimit;
    }

    @Override
    public void close() throws IOException {
        if (session != null) {
            Closeables2.closeQuietly(session, log);
        }
    }
}