/** * Copyright 2010-2013 Scale Unlimited. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.scaleunlimited.cascading; import java.io.PrintStream; import org.apache.hadoop.io.BytesWritable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import cascading.flow.FlowProcess; import cascading.operation.Debug; import cascading.operation.Filter; import cascading.operation.FilterCall; import cascading.operation.OperationCall; import cascading.pipe.Each; import cascading.pipe.Pipe; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; /** * A version of Cascading's Debug() operator with several additional features: * * 1. You can use slf4j to log (and only if log level is <= debug). * 2. You can limit output length (and remove \r\n) for cleaner output. * 3. You can limit the maximum number of logged Tuples. * 4. You can log only Tuples that have a target value in one field. * * It also has a static makePipe method, that does nothing if the log * level is set to > DEBUG. That way it's easy to leave code in and not * pay the price during production runs. * * The four constructors that take no output parameter also use slf4j to log * and depend on (level > DEBUG). The four that do take an output parameter * leverage Cascading's planner support and send their output directly to * System.err or System.out, just like Debug does. * */ @SuppressWarnings("serial") public class TupleLogger extends Debug { private static final Logger LOGGER = LoggerFactory.getLogger(TupleLogger.class); public static final int DEFAULT_MAX_ELEMENT_LENGTH = 100; // Support (via setLevel) to explicitly enable/disable tuple logging private static Boolean _enableTupleLogging = null; private String _prefix = null; private boolean _printFields = false; private Output _debugOutput = null; private int _printFieldsEvery = 10; private int _printTupleEvery = 1; private long _printMaxTuples = Long.MAX_VALUE; private int _maxPrintLength = DEFAULT_MAX_ELEMENT_LENGTH; private String _tupleMatchFieldName = null; private Object[] _tupleMatchFieldValues = null; private long _numMatchingTuples = 0L; private long _numPrintedTuples = 0L; public static Pipe makePipe(Pipe inPipe) { return makePipe(inPipe, inPipe.getName(), true, DEFAULT_MAX_ELEMENT_LENGTH); } public static Pipe makePipe(Pipe inPipe, boolean printFields) { return makePipe(inPipe, printFields, DEFAULT_MAX_ELEMENT_LENGTH); } public static Pipe makePipe(Pipe inPipe, boolean printFields, boolean makePipe) { return makePipe(inPipe, inPipe.getName(), printFields, DEFAULT_MAX_ELEMENT_LENGTH, makePipe); } public static Pipe makePipe(Pipe inPipe, String prefix, boolean printFields) { return makePipe(inPipe, prefix, printFields, DEFAULT_MAX_ELEMENT_LENGTH); } /** * Uses inPipe.getName() for the logging prefix. Convenient, but may not be * appropriate for pipes with long names. * * @param inPipe input pipe * @param printFields print field names * @param maxLength max length of any element string. * @return pipe to use */ public static Pipe makePipe(Pipe inPipe, boolean printFields, int maxLength) { return makePipe(inPipe, inPipe.getName(), printFields, maxLength); } /** * Don't create a new operation if nothing will get logged. * * Note this precludes dynamically changing the logging level, as the Flow will * be created without the TupleLogger if the debug level is set to > DEBUG when * the Flow is being created. * * @param inPipe input pipe * @param prefix prefix to use when printing results * @param printFields print field names * @param maxLength max length of any element string. * @return pipe to use */ public static Pipe makePipe(Pipe inPipe, String prefix, boolean printFields, int maxLength) { return makePipe(inPipe, prefix, printFields, maxLength, doTupleLogging()); } public static Pipe makePipe(Pipe inPipe, String prefix, boolean printFields, int maxLength, boolean addPipe) { if (addPipe) { TupleLogger tl = new TupleLogger(prefix, printFields); tl.setMaxPrintLength(maxLength); return new Each(inPipe, tl); } else { return inPipe; } } public TupleLogger() { super(); } public TupleLogger(String prefix) { super(prefix); _prefix = prefix; } public TupleLogger(boolean printFields) { super(printFields); _printFields = printFields; } public TupleLogger(String prefix, boolean printFields) { super(prefix, printFields); _prefix = prefix; _printFields = printFields; } public TupleLogger(Output output) { super(output); _debugOutput = output; } public TupleLogger(Output output, String prefix) { super(output, prefix); _debugOutput = output; _prefix = prefix; } public TupleLogger(Output output, boolean printFields) { super(output, printFields); _debugOutput = output; _printFields = printFields; } public TupleLogger(Output output, String prefix, boolean printFields) { super(output, prefix, printFields); _debugOutput = output; _prefix = prefix; _printFields = printFields; } /** * @return number of Tuples that must be logged before the next time we * output another header line with their field names */ @Override public int getPrintFieldsEvery() { return _printFieldsEvery; } /** * @param printFieldsEvery number of Tuples that must be logged before * the next time we output another header line with their field names */ @Override public void setPrintFieldsEvery(int printFieldsEvery) { _printFields = true; _printFieldsEvery = printFieldsEvery; } /** * @return number of (matching) Tuples to skip before logging the next one. * Note that this may be increased automatically as the total logged * approaches any configured maximum. * @see #setPrintOnlyMatchingTuples(String, Object) * @see #setPrintMaxTuples(long) */ @Override public int getPrintTupleEvery() { return _printTupleEvery; } /** * @param printTupleEvery number of (matching) Tuples to skip before logging * the next one. Note that this may be increased automatically as the * total logged approaches any configured maximum. * @see #setPrintOnlyMatchingTuples(String, Object) * @see #setPrintMaxTuples(long) */ @Override public void setPrintTupleEvery(int printTupleEvery) { _printTupleEvery = printTupleEvery; } /** * @return maximum number of Tuples that will be logged. */ public long getPrintMaxTuples() { return _printMaxTuples; } /** * @param printMaxTuples maximum number of Tuples that will be logged. * Once half of this total have been logged, we automatically begin * skipping more and more of them. */ public void setPrintMaxTuples(long printMaxTuples) { _printMaxTuples = printMaxTuples; } /** * @return maximum characters before Tuple representation is automatically * truncated */ public int getMaxPrintLength() { return _maxPrintLength; } /** * @param maxPrintLength maximum characters to output before Tuple * representation is automatically truncated */ public void setMaxPrintLength(int maxPrintLength) { _maxPrintLength = maxPrintLength; } /** * Log only Tuples whose <code>fieldName</code> equals <code>targetValue</code>. * Note that only such matching Tuples count toward the Tuple and fields * header logging (i.e., it's every N <em>matching</em> Tuples, and * it will log a maximum of M <em>matching</em> Tuples.) * @param fieldName of field whose value must match * @param targetValue of field in matching Tuples */ public void setPrintOnlyMatchingTuples(String fieldName, Object... targetValues) { _tupleMatchFieldName = fieldName; _tupleMatchFieldValues = targetValues; } @SuppressWarnings("rawtypes") @Override public void prepare(FlowProcess flowProcess, OperationCall<Long> operationCall) { // Let Debug have its own context (which we try to keep updated) super.prepare(flowProcess, operationCall); } /** @see Filter#isRemove(cascading.flow.FlowProcess, FilterCall) */ @SuppressWarnings("rawtypes") public boolean isRemove(FlowProcess flowProcess, FilterCall<Long> filterCall) { // Try to keep Debug's context updated so that its cleanup logs as // expected. Long count = filterCall.getContext(); filterCall.setContext(count+1); if ((_debugOutput != null) || doTupleLogging()) { TupleEntry entry = filterCall.getArguments(); // If there's a maximum # Tuples to be logged, then skip more and // more of them as we approach this limit. Here we both avoid a // divide by zero error (as we reach the limit), and also deal with // the fact that Debug has limited our _printTupleEvery field to // an Integer. // TODO Should we introduce a random element? if (_printMaxTuples < Long.MAX_VALUE) { _printTupleEvery = (_numPrintedTuples < _printMaxTuples) ? Math.max( _printTupleEvery, (int) (Math.min( ( _printMaxTuples / ( _printMaxTuples - _numPrintedTuples)), Integer.MAX_VALUE))) : Integer.MAX_VALUE; } // If we're ignoring Tuples that don't have a target value in one // specific field, then figure out if this one matches. boolean isCountTuple = true; if (_tupleMatchFieldName != null) { Object fieldValue = entry.getObject(_tupleMatchFieldName); for (Object tupleMatchFieldValue : _tupleMatchFieldValues) { if (tupleMatchFieldValue == null) { isCountTuple = (fieldValue == null); } else { isCountTuple = tupleMatchFieldValue.equals(fieldValue); } // Once we've found a match, we want to print - it's an OR of // any of the values that were provided. if (isCountTuple) { break; } } } if (isCountTuple) { // Print this Tuple unless we're supposed to skip it. if ((_numMatchingTuples % _printTupleEvery) == 0) { // If we're also printing the Field names, then do so only at // regular multiples of the Tuples we've actually printed. if (_printFields && ((_numPrintedTuples % _printFieldsEvery) == 0)) { log(entry.getFields().print()); } StringBuilder tupleString = new StringBuilder(); if (_printTupleEvery > 1) { String skippedNote = String.format("(%,d skipped) ", _printTupleEvery - 1); tupleString.append(skippedNote); } log(printTuple(tupleString, entry.getTuple())); _numPrintedTuples++; } // Note that this is the count of the Tuples that made it past // any special field value filtering, which is not necessarily // the total Tuple count passing through TupleLogger. _numMatchingTuples++; } } // Never filter anything return false; } @Override @SuppressWarnings("rawtypes") public void cleanup(FlowProcess flowProcess, OperationCall<Long> longOperationCall) { // Delegate to Debug if we're leveraging planner support if (_debugOutput != null) { super.cleanup(flowProcess, longOperationCall); // Otherwise send the same kind of thing to slf4j } else if (doTupleLogging()) { log("tuples count: " + longOperationCall.getContext().toString()); } } /** * Decide if we want to log tuples. * * @return true if we should log tuples. */ private static boolean doTupleLogging() { if (_enableTupleLogging != null) { return _enableTupleLogging; } else { return LOGGER.isDebugEnabled(); } } private void log(String message) { if (_prefix != null) { log(new StringBuilder(message)); } else { logInternal(message); } } private void log(StringBuilder message) { if (_prefix != null) { message.insert(0, ": "); message.insert(0, _prefix); } logInternal(message.toString()); } protected void logInternal(String message) { if (_debugOutput == null) { LOGGER.debug(message); } else { @SuppressWarnings("resource") PrintStream stream = ( _debugOutput == Output.STDOUT ? System.out : System.err); stream.println(message); } } public StringBuilder printTuple(StringBuilder buffer, Tuple tuple) { buffer.append( "[" ); for (int i = 0; i < tuple.size(); i++) { Object element = tuple.getObject( i ); if (element instanceof Tuple) { printTuple(buffer, (Tuple)element); } else { buffer.append("\'"); buffer.append(printObject(element, _maxPrintLength)); buffer.append( "\'" ); } if (i < tuple.size() - 1) { buffer.append(", "); } } buffer.append( "]" ); return buffer; } @SuppressWarnings({ "deprecation" }) public static String printObject(Object element, int maxLength) { StringBuilder result = new StringBuilder(); if (element == null) { result.append("null"); } else if (element instanceof String) { result.append((String)element); } else if (element instanceof BytesWritable) { // Use get() vs. getBytes() so we work with Hadoop 0.18.3 byte[] bytes = ((BytesWritable)element).get(); int numBytes = Math.min(bytes.length, maxLength/3); for (int i = 0; i < numBytes; i++) { result.append(String.format("%02X", bytes[i])); result.append(' '); } // Get rid of extra trailing space. if (numBytes > 0) { return result.substring(0, result.length() - 1); } } else { result.append(element.toString()); } // Now we want to limit the length, and get rid of control, \n, \r sequences return result.substring(0, Math.min(maxLength, result.length())).replaceAll("[\r\n\t]", " "); } /** * Explicitly set whether to enable/disable logging, which then will * impact what makePipe() will do. * * @param enabled true to force logging, false to force no logging. */ public static void enableLogging(boolean enabled) { _enableTupleLogging = enabled; } }