/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.contrib.serde2.s3;
import java.nio.charset.CharacterCodingException;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.AbstractDeserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ReflectionStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
 * Deserializer that parses a single Amazon S3 access-log line with a regular
 * expression and exposes the captured values through an {@link S3LogStruct}.
 *
 * <p>One {@link S3LogStruct} instance is reused for every row: each call to
 * {@link #deserialize(Writable)} overwrites the fields of the object returned
 * by the previous call.
 */
public class S3LogDeserializer extends AbstractDeserializer {

  public static final Logger LOG = LoggerFactory.getLogger(S3LogDeserializer.class
      .getName());

  /** Reflection-based inspector for {@link S3LogStruct}; set by {@link #initialize}. */
  private ObjectInspector cachedObjectInspector;

  // NOTE(review): the literal names "S3ZemantaDeserializer" rather than this
  // class; it is kept unchanged because callers may rely on the exact text.
  @Override
  public String toString() {
    return "S3ZemantaDeserializer[]";
  }

  public S3LogDeserializer() throws SerDeException {
  }

  // This regex is a bit lax in order to compensate for lack of any escaping
  // done by Amazon S3 ... for example useragent string can have double quotes
  // inside!
  //
  // It captures 17 groups, consumed in order by deserialize(S3LogStruct, String).
  static final Pattern regexpat = Pattern
      .compile("(\\S+) (\\S+) \\[(.*?)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.+)\" (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.*)\" \"(.*)\"");

  // static Pattern regexrid = Pattern.compile("x-id=([-0-9a-f]{36})");
  // static SimpleDateFormat dateparser = new
  // SimpleDateFormat("dd/MMM/yyyy:hh:mm:ss ZZZZZ");

  /** Row object that is overwritten and returned by every deserialize call. */
  S3LogStruct deserializeCache = new S3LogStruct();

  /**
   * Builds the reflection object inspector for {@link S3LogStruct}.
   *
   * @param job the job configuration (not read by this implementation)
   * @param tbl the table properties (not read by this implementation)
   * @throws SerDeException declared by the overridden method
   */
  @Override
  public void initialize(Configuration job, Properties tbl)
      throws SerDeException {
    cachedObjectInspector = ObjectInspectorFactory
        .getReflectionObjectInspector(S3LogStruct.class,
            ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    LOG.debug("{}: initialized", getClass().getName());
  }

  /**
   * Converts a numeric log field to an {@link Integer}.
   *
   * @param s the field text
   * @return {@code null} when the field is the placeholder {@code "-"},
   *         otherwise the parsed integer
   * @throws NumberFormatException if {@code s} is neither {@code "-"} nor a
   *         parsable integer
   */
  public static Integer toInt(String s) {
    if ("-".equals(s)) {
      return null;
    }
    return Integer.valueOf(s);
  }

  /**
   * Parses one log line into the supplied struct.
   *
   * @param c the struct whose fields are overwritten with the parsed values
   * @param row the complete log line
   * @return the same struct instance {@code c}
   * @throws SerDeException if {@code row} is null or does not match the
   *         log-line regex as a whole
   * @throws NumberFormatException if a numeric field is neither {@code "-"}
   *         nor a parsable integer
   */
  public static Object deserialize(S3LogStruct c, String row) throws Exception {
    if (row == null) {
      throw new SerDeException("S3 Log row is null");
    }
    Matcher match = regexpat.matcher(row);
    // Test the match result explicitly instead of relying on group() throwing
    // IllegalStateException after a failed match.
    if (!match.matches()) {
      throw new SerDeException("S3 Log Regex did not match:" + row);
    }

    int group = 1;
    c.bucketowner = match.group(group++);
    c.bucketname = match.group(group++);
    c.rdatetime = match.group(group++);

    // Should we convert the datetime to the format Hive understands by default
    // - either yyyy-mm-dd HH:MM:SS or seconds since epoch?
    // Date d = dateparser.parse(c.rdatetime);
    // c.rdatetimeepoch = d.getTime() / 1000;

    c.rip = match.group(group++);
    c.requester = match.group(group++);
    c.requestid = match.group(group++);
    c.operation = match.group(group++);
    c.rkey = match.group(group++);
    c.requesturi = match.group(group++);
    // System.err.println(c.requesturi);
    /*
     * // Zemanta specific data extractor try { Matcher m2 =
     * regexrid.matcher(c.requesturi); m2.find(); c.rid = m2.group(1); } catch
     * (Exception e) { c.rid = null; }
     */
    c.httpstatus = toInt(match.group(group++));
    c.errorcode = match.group(group++);
    c.bytessent = toInt(match.group(group++));
    c.objsize = toInt(match.group(group++));
    c.totaltime = toInt(match.group(group++));
    c.turnaroundtime = toInt(match.group(group++));
    c.referer = match.group(group++);
    c.useragent = match.group(group++);

    return c;
  }

  /**
   * Deserializes one row into the shared, reused {@link S3LogStruct}.
   *
   * @param field the row, as a {@link Text} or a {@link BytesWritable}
   * @return the reused struct; its contents are overwritten by the next call
   * @throws SerDeException if {@code field} is of any other type (including
   *         null), if the bytes cannot be decoded, or if parsing the line fails
   */
  @Override
  public Object deserialize(Writable field) throws SerDeException {
    String row;
    if (field instanceof BytesWritable) {
      BytesWritable b = (BytesWritable) field;
      try {
        // Only the first getLength() bytes of the backing array are valid.
        row = Text.decode(b.getBytes(), 0, b.getLength());
      } catch (CharacterCodingException e) {
        throw new SerDeException(e);
      }
    } else if (field instanceof Text) {
      row = field.toString();
    } else {
      // Reject unsupported input up front rather than passing a null row on.
      throw new SerDeException(this.getClass().getName()
          + " expects Text or BytesWritable");
    }

    try {
      deserialize(deserializeCache, row);
      return deserializeCache;
    } catch (SerDeException e) {
      // Already the right type; do not wrap it a second time.
      throw e;
    } catch (Exception e) {
      throw new SerDeException(e);
    }
  }

  /**
   * @return the inspector created by {@link #initialize}, or {@code null} if
   *         {@code initialize} has not been called
   */
  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return cachedObjectInspector;
  }

  /**
   * Manual smoke test: deserializes one hard-coded sample log line and prints
   * every struct field reference and its value to standard error.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    System.err.println("This is only a test run");
    try {
      S3LogDeserializer serDe = new S3LogDeserializer();
      Configuration conf = new Configuration();
      Properties tbl = new Properties();
      // Some nasty examples that show how S3 log format is broken ... and to
      // test the regex
      // These are all sourced from genuine S3 logs
      // Text sample = new
      // Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:07 +0000] 190.225.84.114 65a011a29cdf8ec533ec3d1ccaae921c F4FC3FEAD8C00024 REST.GET.OBJECT pixy.gif \"GET /pixy.gif?x-id=23d25db1-160b-48bb-a932-e7dc1e88c321 HTTP/1.1\" 304 - - 828 3 - \"http://www.viamujer.com/2009/03/horoscopo-acuario-abril-mayo-y-junio-2009/\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\"");
      // Text sample = new
      // Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:19:49 +0000] 60.28.204.7 65a011a29cdf8ec533ec3d1ccaae921c 7D87B6835125671E REST.GET.OBJECT pixy.gif \"GET /pixy.gif?x-id=b50a4544-938b-4a63-992c-721d1a644b28 HTTP/1.1\" 200 - 828 828 4 3 \"\" \"ZhuaXia.com\"");
      // Text sample = new
      // Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 static.zemanta.com [09/Apr/2009:23:12:39 +0000] 65.94.12.181 65a011a29cdf8ec533ec3d1ccaae921c EEE6FFE9B9F9EA29 REST.HEAD.OBJECT readside/loader.js%22+defer%3D%22defer \"HEAD /readside/loader.js\"+defer=\"defer HTTP/1.0\" 403 AccessDenied 231 - 7 - \"-\" \"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)\"");
      Text sample = new Text(
          "04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [10/Apr/2009:05:34:01 +0000] 70.32.81.92 65a011a29cdf8ec533ec3d1ccaae921c F939A7D698D27C63 REST.GET.OBJECT reblog_b.png \"GET /reblog_b.png?x-id=79ca9376-6326-41b7-9257-eea43d112eb2 HTTP/1.0\" 200 - 1250 1250 160 159 \"-\" \"Firefox 0.8 (Linux)\" useragent=\"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040614 Firefox/0.8\"");
      SerDeUtils.initializeSerDe(serDe, conf, tbl, null);
      Object row = serDe.deserialize(sample);
      System.err.println(serDe.getObjectInspector().getClass().toString());
      ReflectionStructObjectInspector oi = (ReflectionStructObjectInspector) serDe
          .getObjectInspector();
      List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
      for (StructField fieldRef : fieldRefs) {
        System.err.println(fieldRef.toString());
        Object fieldData = oi.getStructFieldData(row, fieldRef);
        if (fieldData == null) {
          System.err.println("null");
        } else {
          System.err.println(fieldData.toString());
        }
      }
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      e.printStackTrace();
    }
  }

  /**
   * @return always {@code null}; this deserializer does not support statistics
   */
  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }
}