/*
* Copyright 2015 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.httpd;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import io.netty.buffer.DrillBuf;
import nl.basjes.parse.core.Casts;
import nl.basjes.parse.core.Parser;
import nl.basjes.parse.core.exceptions.DissectionFailure;
import nl.basjes.parse.core.exceptions.InvalidDissectorException;
import nl.basjes.parse.core.exceptions.MissingDissectorsException;
import nl.basjes.parse.httpdlog.HttpdLoglineParser;
import org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class HttpdParser {
private static final Logger LOG = LoggerFactory.getLogger(HttpdParser.class);
public static final String PARSER_WILDCARD = ".*";
public static final String SAFE_WILDCARD = "_$";
public static final String SAFE_SEPARATOR = "_";
public static final String REMAPPING_FLAG = "#";
private final Parser<HttpdLogRecord> parser;
private final HttpdLogRecord record;
public static final HashMap<String, String> LOGFIELDS = new HashMap<String, String>();
static
{
LOGFIELDS.put("request_receive_time_weekyear__utc", "TIME_YEAR:request_receive_time_weekyear__utc");
LOGFIELDS.put("request_referer_ref", "HTTP_REF:request_referer_ref");
LOGFIELDS.put("request_referer_protocol", "HTTP_PROTOCOL:request_referer_protocol");
LOGFIELDS.put("request_receive_time_timezone", "TIME_ZONE:request_receive_time_timezone");
LOGFIELDS.put("connection_client_host", "IP:connection_client_host");
LOGFIELDS.put("connection_client_ip", "IP:connection_client_ip");
LOGFIELDS.put("connection_client_peerip", "IP:connection_client_peerip");
LOGFIELDS.put("connection_server_ip", "IP:connection_server_ip");
LOGFIELDS.put("request_receive_time_day", "TIME_DAY:request_receive_time_day");
LOGFIELDS.put("request_receive_time_minute__utc", "TIME_MINUTE:request_receive_time_minute__utc");
LOGFIELDS.put("request_referer_query_$", "STRING:request_referer_query_$");
LOGFIELDS.put("request_receive_time_millisecond__utc", "TIME_MILLISECOND:request_receive_time_millisecond__utc");
LOGFIELDS.put("request_firstline_uri_port", "HTTP_PORT:request_firstline_uri_port");
LOGFIELDS.put("request_referer_userinfo", "HTTP_USERINFO:request_referer_userinfo");
LOGFIELDS.put("request_receive_time_second__utc", "TIME_SECOND:request_receive_time_second__utc");
LOGFIELDS.put("request_firstline_uri_protocol", "HTTP_PROTOCOL:request_firstline_uri_protocol");
LOGFIELDS.put("request_receive_time_month", "TIME_MONTH:request_receive_time_month");
LOGFIELDS.put("request_firstline_uri_query", "HTTP_QUERYSTRING:request_firstline_uri_query");
LOGFIELDS.put("request_firstline_uri_path", "HTTP_PATH:request_firstline_uri_path");
LOGFIELDS.put("request_receive_time_hour__utc", "TIME_HOUR:request_receive_time_hour__utc");
LOGFIELDS.put("request_receive_time_monthname", "TIME_MONTHNAME:request_receive_time_monthname");
LOGFIELDS.put("request_receive_time_year__utc", "TIME_YEAR:request_receive_time_year__utc");
LOGFIELDS.put("request_receive_time_second", "TIME_SECOND:request_receive_time_second");
LOGFIELDS.put("request_referer", "HTTP_URI:request_referer");
LOGFIELDS.put("request_receive_time_monthname__utc", "TIME_MONTHNAME:request_receive_time_monthname__utc");
LOGFIELDS.put("request_referer_path", "HTTP_PATH:request_referer_path");
LOGFIELDS.put("request_receive_time_weekyear", "TIME_YEAR:request_receive_time_weekyear");
LOGFIELDS.put("request_firstline_protocol", "HTTP_PROTOCOL:request_firstline_protocol");
LOGFIELDS.put("request_referer_port", "HTTP_PORT:request_referer_port");
LOGFIELDS.put("request_receive_time_minute", "TIME_MINUTE:request_receive_time_minute");
LOGFIELDS.put("request_status_last", "STRING:request_status_last");
LOGFIELDS.put("request_receive_time_hour", "TIME_HOUR:request_receive_time_hour");
LOGFIELDS.put("request_firstline_protocol_version", "HTTP_PROTOCOL_VERSION:request_firstline_protocol_version");
LOGFIELDS.put("request_receive_time", "TIME_STAMP:request_receive_time");
LOGFIELDS.put("request_firstline_method", "HTTP_METHOD:request_firstline_method");
LOGFIELDS.put("request_receive_time_epoch", "TIME_EPOCH:request_receive_time_epoch");
LOGFIELDS.put("request_receive_time_weekofweekyear", "TIME_WEEK:request_receive_time_weekofweekyear");
LOGFIELDS.put("request_firstline_uri_host", "HTTP_HOST:request_firstline_uri_host");
LOGFIELDS.put("request_referer_query", "HTTP_QUERYSTRING:request_referer_query");
LOGFIELDS.put("request_firstline_uri_userinfo", "HTTP_USERINFO:request_firstline_uri_userinfo");
LOGFIELDS.put("response_body_bytes", "BYTES:response_body_bytes");
LOGFIELDS.put("response_body_bytesclf", "BYTES:response_body_bytesclf");
LOGFIELDS.put("request_referer_host", "HTTP_HOST:request_referer_host");
LOGFIELDS.put("request_receive_time_weekofweekyear__utc", "TIME_WEEK:request_receive_time_weekofweekyear__utc");
LOGFIELDS.put("request_firstline_uri", "HTTP_URI:request_firstline_uri");
LOGFIELDS.put("request_firstline_uri_ref", "HTTP_REF:request_firstline_uri_ref");
LOGFIELDS.put("request_receive_time_year", "TIME_YEAR:request_receive_time_year");
LOGFIELDS.put("request_firstline", "HTTP_FIRSTLINE:request_firstline");
LOGFIELDS.put("request_user-agent", "HTTP_USERAGENT:request_user-agent");
LOGFIELDS.put("request_cookies", "HTTP_COOKIE:request_cookies");
LOGFIELDS.put("server_process_time", "MICROSECONDS:server_process_time");
LOGFIELDS.put("request_cookies_$", "HTTP_COOKIE:request_cookies_$");
LOGFIELDS.put("server_environment_$", "VARIABLE:server_environment_$");
LOGFIELDS.put("server_filename", "FILENAME:server_filename");
LOGFIELDS.put("request_protocol", "PROTOCOL:request_protocol");
LOGFIELDS.put("request_header_", "HTTP_HEADER:request_header_");
LOGFIELDS.put("connection_keepalivecount", "NUMBER:connection_keepalivecount");
LOGFIELDS.put("connection_client_logname", "NUMBER:connection_client_logname");
LOGFIELDS.put("request_errorlogid", "STRING:request_errorlogid");
LOGFIELDS.put("request_method", "HTTP_METHOD:request_method");
LOGFIELDS.put("server_module_note_$", "STRING:server_module_note_$");
LOGFIELDS.put("response_header_$", "HTTP_HEADER:response_header_$");
LOGFIELDS.put("request_server_port_canonical", "PORT:request_server_port_canonical");
LOGFIELDS.put("connection_server_port_canonical", "PORT:connection_server_port_canonical");
LOGFIELDS.put("connection_server_port", "PORT:connection_server_port");
LOGFIELDS.put("connection_client_port", "PORT:connection_client_port");
LOGFIELDS.put("connection_server_child_processid", "NUMBER:connection_server_child_processid");
LOGFIELDS.put("connection_server_child_threadid", "NUMBER:connection_server_child_threadid");
LOGFIELDS.put("connection_server_child_hexthreadid", "NUMBER:connection_server_child_hexthreadid");
LOGFIELDS.put("request_querystring", "HTTP_QUERYSTRING:request_querystring");
LOGFIELDS.put("request_handler", "STRING:request_handler");
LOGFIELDS.put("request_status_original", "STRING:request_status_original");
LOGFIELDS.put("request_status_last", "STRING:request_status_last");
LOGFIELDS.put("request_receive_time_begin_msec", "TIME_EPOCH:request_receive_time_begin_msec");
LOGFIELDS.put("request_receive_time_end_msec", "TIME_EPOCH:request_receive_time_end_msec");
LOGFIELDS.put("request_receive_time_begin_usec", "TIME_EPOCH_USEC:request_receive_time_begin_usec");
LOGFIELDS.put("request_receive_time_begin_usec", "TIME_EPOCH_USEC:request_receive_time_begin_usec");
LOGFIELDS.put("request_receive_time_end_usec", "TIME_EPOCH_USEC:request_receive_time_end_usec");
LOGFIELDS.put("request_receive_time_begin_msec_frac", "TIME_EPOCH:request_receive_time_begin_msec_frac");
LOGFIELDS.put("request_receive_time_begin_msec_frac", "TIME_EPOCH:request_receive_time_begin_msec_frac");
LOGFIELDS.put("request_receive_time_end_msec_frac", "TIME_EPOCH:request_receive_time_end_msec_frac");
LOGFIELDS.put("request_receive_time_begin_usec_frac", "TIME_EPOCH_USEC_FRAC:request_receive_time_begin_usec_frac");
LOGFIELDS.put("request_receive_time_begin_usec_frac", "TIME_EPOCH_USEC_FRAC:request.receive.time.begin.usec_frac");
LOGFIELDS.put("request_receive_time_end_usec_frac", "TIME_EPOCH_USEC_FRAC:request_receive_time_end_usec_frac");
LOGFIELDS.put("response_server_processing_time", "SECONDS:response_server_processing_time");
LOGFIELDS.put("connection_client_user", "STRING:connection_client_user");
LOGFIELDS.put("request_urlpath", "URI:request_urlpath");
LOGFIELDS.put("connection_server_name_canonical", "STRING:connection_server_name_canonical");
LOGFIELDS.put("connection_server_name", "STRING:connection_server_name");
LOGFIELDS.put("response_connection_status", "HTTP_CONNECTSTATUS:response_connection_status");
LOGFIELDS.put("request_bytes", "BYTES:request_bytes");
LOGFIELDS.put("response_bytes", "BYTES:response_bytes");
}
//Map map = Collections.synchronizedMap(LOGFIELDS);
public HttpdParser(final MapWriter mapWriter, final DrillBuf managedBuffer, final String logFormat,
final String timestampFormat, final Map<String, String> fieldMapping)
throws NoSuchMethodException, MissingDissectorsException, InvalidDissectorException {
Preconditions.checkArgument(logFormat != null && !logFormat.trim().isEmpty(), "logFormat cannot be null or empty");
this.record = new HttpdLogRecord(managedBuffer);
this.parser = new HttpdLoglineParser<>(HttpdLogRecord.class, logFormat, timestampFormat);
setupParser(mapWriter, logFormat, fieldMapping);
if (timestampFormat != null && !timestampFormat.trim().isEmpty()) {
LOG.info("Custom timestamp format has been specified. This is an informational note only as custom timestamps is rather unusual.");
}
if (logFormat.contains("\n")) {
LOG.info("Specified logformat is a multiline log format: {}", logFormat);
}
}
/**
* We do not expose the underlying parser or the record which is used to manage the writers.
*
* @param line log line to tear apart.
*
* @throws DissectionFailure
* @throws InvalidDissectorException
* @throws MissingDissectorsException
*/
public void parse(final String line) throws DissectionFailure, InvalidDissectorException, MissingDissectorsException {
parser.parse(record, line);
record.finishRecord();
}
/**
* In order to define a type remapping the format of the field configuration will look like: <br/>
* HTTP.URI:request.firstline.uri.query.[parameter name] <br/>
*
* @param parser Add type remapping to this parser instance.
* @param fieldName request.firstline.uri.query.[parameter_name]
* @param fieldType HTTP.URI, etc..
*/
private void addTypeRemapping(final Parser<HttpdLogRecord> parser, final String fieldName, final String fieldType) {
LOG.debug("Adding type remapping - fieldName: {}, fieldType: {}", fieldName, fieldType);
parser.addTypeRemapping(fieldName, fieldType);
}
/**
* The parser deals with dots unlike Drill wanting underscores request_referer. For the sake of simplicity we are
* going replace the dots. The resultant output field will look like: request.referer.<br>
* Additionally, wild cards will get replaced with .*
*
* @param drillFieldName name to be cleansed.
* @return
*/
public static String parserFormattedFieldName(final String drillFieldName) {
String tempFieldName;
tempFieldName = LOGFIELDS.get(drillFieldName);
return tempFieldName.replace(SAFE_WILDCARD, PARSER_WILDCARD).replaceAll(SAFE_SEPARATOR, ".").replaceAll("\\.\\.", "_");
}
/**
* Drill cannot deal with fields with dots in them like request.referer. For the sake of simplicity we are going
* ensure the field name is cleansed. The resultant output field will look like: request_referer.<br>
* Additionally, wild cards will get replaced with _$
*
* @param parserFieldName name to be cleansed.
* @return
*/
public static String drillFormattedFieldName(final String parserFieldName) {
if (parserFieldName.contains(":") ) {
String[] fieldPart= parserFieldName.split(":");
return fieldPart[1].replaceAll("_", "__").replace(PARSER_WILDCARD, SAFE_WILDCARD).replaceAll("\\.", SAFE_SEPARATOR);
}
else{
return parserFieldName.replaceAll("_", "__").replace(PARSER_WILDCARD, SAFE_WILDCARD).replaceAll("\\.", SAFE_SEPARATOR);
}
}
private void setupParser(final MapWriter mapWriter, final String logFormat, final Map<String, String> fieldMapping)
throws NoSuchMethodException, MissingDissectorsException, InvalidDissectorException {
/**
* If the user has selected fields, then we will use them to configure the parser because this would be the most
* efficient way to parse the log.
*/
final Map<String, String> requestedPaths;
final List<String> allParserPaths = parser.getPossiblePaths();
if (fieldMapping != null && !fieldMapping.isEmpty()) {
LOG.debug("Using fields defined by user.");
requestedPaths = fieldMapping;
}
else {
/**
* Use all possible paths that the parser has determined from the specified log format.
*/
LOG.debug("No fields defined by user, defaulting to all possible fields.");
requestedPaths = Maps.newHashMap();
for (final String parserPath : allParserPaths) {
requestedPaths.put(drillFormattedFieldName(parserPath), parserPath);
}
}
/**
* By adding the parse target to the dummy instance we activate it for use. Which we can then use to find out which
* paths cast to which native data types. After we are done figuring this information out, we throw this away
* because this will be the slowest parsing path possible for the specified format.
*/
Parser<Object> dummy = new HttpdLoglineParser<>(Object.class, logFormat);
dummy.addParseTarget(String.class.getMethod("indexOf", String.class), allParserPaths);
for (final Map.Entry<String, String> entry : requestedPaths.entrySet()) {
final EnumSet<Casts> casts;
/**
* Check the field specified by the user to see if it is supposed to be remapped.
*/
if (entry.getValue().startsWith(REMAPPING_FLAG)) {
/**
* Because this field is being remapped we need to replace the field name that the parser uses.
*/
entry.setValue(entry.getValue().substring(REMAPPING_FLAG.length()));
final String[] pieces = entry.getValue().split(":");
addTypeRemapping(parser, pieces[1], pieces[0]);
casts = Casts.STRING_ONLY;
}
else {
casts = dummy.getCasts(entry.getValue());
}
LOG.debug("Setting up drill field: {}, parser field: {}, which casts as: {}", entry.getKey(), entry.getValue(), casts);
record.addField(parser, mapWriter, casts, entry.getValue(), entry.getKey());
}
}
}