/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.handlers.text;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.Event.Priority;
import com.cloudera.util.Clock;
import com.cloudera.util.NetUtils;
/**
* Apache Access Logs are configurable to have custom formats. The default
* (which we parse here) is :
*
* Default configured to be: LogFormat "%h %l %u %t \"%r\" %>s %b" common
*
* which means:
*
* %h ip of remote host
*
* %l identd check (usually '-' because is usually unreliable data)
*
* %u user according to HTTP Authentication. '-' if no pw
*
* %t time request received in apache date format.
*
* %r request string. The format has it in quotes.
*
* %>s status code sent back to client
*
* %b size of object sent to client not including header. '-' here means 0 data.
*
* Others escape sequences are explained here
* http://httpd.apache.org/docs/2.0/logs.html
*
* Another common one is the CombinedLogFormat which adds two fields:
*
* LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""
* combined
*
* TODO (jon) Add support for parsing based on Apache LogFormat directive
* http://httpd.apache.org/docs/1.3/mod/mod_log_config.html#formats
*
* TODO (jon) These links talk about the cost incurred by the Calendar class.
* Part of it is due to the fact that it is not thread safe. The extract method
* is very likely to run in multiple threads at some point so this will become
* an issue.
*
* http://blog.bielu.com/2008/08/javautilcalendar-confusion-is-it-safe_28.html
*
* The article links to the library below -- apparently to be standard as part
* of java 7.
*
* http://joda-time.sourceforge.net/
*
* TODO (jon) implement apache format outputter.
*/
public class ApacheAccessLogFormat implements InputFormat {
static final Logger LOG = LoggerFactory.getLogger(ApacheAccessLogFormat.class);
final static Pattern APACHE_PAT = Pattern
.compile("^(\\S+) (\\S+) (\\S+) \\[(.*?)\\] \\\"(.*?)\\\" (\\S+) (\\S+)( \\\"(.*?)\\\" \\\"(.*?)\\\")?$");
final SimpleDateFormat APACHE_DF = new SimpleDateFormat(
"dd/MMM/yyyy:HH:mm:ss zzzzz");
/**
* This attempts returns null if a line fails to be parsed.
*/
public Event extract(String s) {
Matcher m = APACHE_PAT.matcher(s);
if (!m.matches())
return null;
try {
String service = "apache";
String date = m.group(4);
Date d = APACHE_DF.parse(date);
Calendar c = Calendar.getInstance();
c.setTime(d);
d = c.getTime();
String host = NetUtils.localhost(); // local host
// TODO(jon) body should be the raw entry, and a new field should be
// created for this instead.
String body = m.group(5);
String client = m.group(1);
String res = m.group(6);
String size = m.group(7);
String referrer = m.group(9);
String browser = m.group(10);
Map<String, byte[]> fields = new HashMap<String, byte[]>();
fields.put("service", service.getBytes());
fields.put("client", client.getBytes());
fields.put("req_result", res.getBytes());
fields.put("req_size", size.getBytes());
if (referrer != null && !referrer.equals("-")) {
fields.put("referrer", referrer.getBytes());
}
if (browser != null && !browser.equals("-")) {
fields.put("browser", browser.getBytes());
}
Event e = new EventImpl(body.getBytes(), d.getTime(), Priority.INFO,
Clock.nanos(), host, fields);
return e;
} catch (ParseException e) {
LOG.warn("Failed to parse apache access log line: '" + s + "'", e);
return null;
}
}
}