/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.gigya.flume;
import static org.apache.flume.sink.elasticsearch.ElasticSearchSinkConstants.SERIALIZER;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.flume.sink.elasticsearch.DocumentIdBuilder;
import org.apache.flume.sink.elasticsearch.ElasticSearchEventSerializer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.elasticsearch.common.Base64;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.collect.Maps;
import org.elasticsearch.common.io.BytesStream;
import org.elasticsearch.common.xcontent.XContentBuilder;
/**
 * An extended serializer for Flume events into the same format Logstash uses.
 * <p>
 * This adds some more features on top of the default ES serializer that is part of
 * the Flume distribution.
 * <p>
 * For more details see: https://github.com/gigya/flume-ng-elasticsearch-ser-ex
 * <p>
* Logstash format:
*
* <pre>
* {
* "@timestamp": "2010-12-21T21:48:33.309258Z",
* "@tags": [ "array", "of", "tags" ],
* "@type": "string",
* "@source": "source of the event, usually a URL."
* "@source_host": ""
* "@source_path": ""
* "@fields":{
* # a set of fields for this event
* "user": "jordan",
* "command": "shutdown -r":
* }
* "@message": "the original plain-text message"
* }
* </pre>
*
 * If the following headers are present, they will map to the above logstash
 * output as long as the logstash fields are not already present.
 * <p>
*
* <pre>
* message : String -> @message : String
* or body : String -> @message : String
* timestamp: long -> @timestamp:Date
* host: String -> @source_host: String
* src_path: String -> @source_path: String
* type: String -> @type: String
* source: String -> @source: String
* </pre>
*
*
* @author Rotem Hermon
*/
public class ExtendedElasticSearchLogStashEventSerializer implements ElasticSearchEventSerializer, DocumentIdBuilder {
    /**
     * Configuration property to set fields that might contain a JSON string, to be
     * parsed as an object.
     */
    public static final String OBJECT_FIELDS = "objectFields";
    /**
     * Configuration property, set to true to remove the logstash '@fields' prefix
     * for custom fields.
     */
    public static final String REMOVE_FIELDS_PREFIX = "removeFieldsPrefix";
    /**
     * Configuration property, set to true to collect dot notated field names into an
     * object (so 'params.f1' and 'params.f2' will be turned when indexed into
     * an object: { params : {f1 : ... , f2 : ... } }
     */
    public static final String COLLATE_OBJECTS = "collateObjects";
    /**
     * Configuration property to control the depth of object collating.
     * Default is 1, meaning only the first object level will be collated.
     * So for example: "params.f1.a" will be turned into
     * { "params" : { "f1.a" : ... } }
     * Set to -1 for unlimited levels.
     */
    public static final String COLLATE_DEPTH = "collateDepth";
    /**
     * Configuration property, set to true to generate an _id for the indexed event,
     * not letting ES auto generate an _id. The _id is an MD5 of the serialized event.
     */
    public static final String GENERATE_ID = "generateId";

    // Elasticsearch does not accept not-analyzed fields bigger than 32K.
    // String values are trimmed to this many characters, leaving headroom for
    // multi-byte encodings so we stay safely under the limit.
    private static final int MAX_FIELD_SIZE = 30000;

    private boolean generateId = false;
    // Field names (from OBJECT_FIELDS) whose values should be parsed as JSON objects.
    private Map<String, Boolean> objectFields = null;
    private boolean removeFieldsPrefix = false;
    private boolean collateObjects = false;
    private int collateDepth = 1;

    /**
     * Serializes an event into a logstash-format JSON document.
     *
     * @param event the Flume event to serialize
     * @return a JSON content builder holding the serialized event
     * @throws IOException if building the JSON content fails
     */
    public XContentBuilder getXContentBuilder(Event event) throws IOException {
        XContentBuilder builder = jsonBuilder().startObject();
        appendHeaders(builder, event);
        return builder;
    }

    @Override
    public BytesStream getContentBuilder(Event event) throws IOException {
        return getXContentBuilder(event);
    }

    /**
     * Appends the raw event body as the '@message' field.
     */
    private void appendBody(XContentBuilder builder, Event event) throws IOException {
        byte[] body = event.getBody();
        ContentBuilderUtilEx.appendField(builder, "@message", body, isObjectField("body"));
    }

    /**
     * Maps the event headers into logstash fields. Well-known headers
     * (message, timestamp, source, type, host, src_path) are mapped to their
     * '@'-prefixed logstash counterparts when those are not already present as
     * headers; all remaining headers are written as custom fields (nested
     * under '@fields' unless removeFieldsPrefix is set), optionally collated
     * into objects by dot notation.
     */
    private void appendHeaders(XContentBuilder builder, Event event) throws IOException {
        // work on a copy so mapped headers can be removed without mutating the event
        Map<String, String> headers = Maps.newHashMap(event.getHeaders());
        Map<String, Object> collatedFields = null;
        if (collateObjects)
            collatedFields = Maps.newHashMap();
        // look for a "message" header and append it as the message if it
        // exists, otherwise fall back to the raw event body
        String message = ensureFieldSize(headers.get("message"));
        if (!StringUtils.isBlank(message) && StringUtils.isBlank(headers.get("@message"))) {
            ContentBuilderUtilEx.appendField(builder, "@message", message.getBytes(charset), isObjectField("message"));
            headers.remove("message");
        } else {
            // if not, append the body as the message
            appendBody(builder, event);
        }
        String timestamp = ensureFieldSize(headers.get("timestamp"));
        if (!StringUtils.isBlank(timestamp) && StringUtils.isBlank(headers.get("@timestamp"))) {
            // NOTE(review): a non-numeric "timestamp" header makes this throw
            // NumberFormatException, aborting serialization of the event
            long timestampMs = Long.parseLong(timestamp);
            builder.field("@timestamp", new Date(timestampMs));
            headers.remove("timestamp");
        }
        mapHeaderField(builder, headers, "source", "@source");
        mapHeaderField(builder, headers, "type", "@type");
        mapHeaderField(builder, headers, "host", "@source_host");
        mapHeaderField(builder, headers, "src_path", "@source_path");
        if (!removeFieldsPrefix)
            builder.startObject("@fields");
        for (Map.Entry<String, String> header : headers.entrySet()) {
            String key = header.getKey();
            if (collateObjects) {
                collectField(key, header.getValue(), collatedFields, 1);
            } else {
                byte[] val = ensureFieldSize(header.getValue()).getBytes(charset);
                ContentBuilderUtilEx.appendField(builder, key, val, isObjectField(key));
            }
        }
        if (collateObjects) {
            for (Map.Entry<String, Object> field : collatedFields.entrySet()) {
                ContentBuilderUtilEx.appendField(builder, field.getKey(), field.getValue());
            }
        }
        if (!removeFieldsPrefix)
            builder.endObject();
    }

    /**
     * Maps a single well-known header to its logstash field, unless the
     * logstash field is already present as a header. The header is removed
     * from the map once mapped so it is not written again as a custom field.
     */
    private void mapHeaderField(XContentBuilder builder, Map<String, String> headers, String header,
            String logstashField) throws IOException {
        String value = ensureFieldSize(headers.get(header));
        if (!StringUtils.isBlank(value) && StringUtils.isBlank(headers.get(logstashField))) {
            ContentBuilderUtilEx.appendField(builder, logstashField, value.getBytes(charset));
            headers.remove(header);
        }
    }

    /**
     * Trims String values to the maximum field size; non-String values are
     * returned unchanged.
     */
    private Object ensureFieldSize(Object field) {
        if (field instanceof String) {
            return ensureFieldSize((String) field);
        }
        return field;
    }

    /**
     * Trims a field value so it stays under Elasticsearch's 32K limit for
     * not-analyzed fields, avoiding an indexing error.
     */
    private String ensureFieldSize(String field) {
        if (null == field)
            return field;
        if (field.length() < MAX_FIELD_SIZE)
            return field;
        return field.substring(0, MAX_FIELD_SIZE);
    }

    /**
     * Collects a header into the collated fields map, turning dot-notated
     * names (up to collateDepth levels, -1 meaning unlimited) into nested
     * objects.
     *
     * @param key    the (possibly dot-notated) field name
     * @param val    the field value
     * @param fields the map collecting the collated fields
     * @param level  the current nesting level (1-based)
     */
    private void collectField(String key, String val, Map<String, Object> fields, int level) {
        // see if we have an object dot notation (only within the configured depth)
        int pos = 0;
        if (collateDepth < 0 || level <= collateDepth) {
            pos = key.indexOf('.');
        }
        if (pos > 0) {
            // this is an object field. get the field name
            String fieldName = key.substring(0, pos);
            String rest = key.substring(pos + 1);
            // get the field object, creating a new map if not already there
            Map<String, Object> fieldMap = getFieldMap(fieldName, fields, true);
            if (null == fieldMap) {
                // the field was already set as a primitive type - write this
                // one as a regular field and not as an object
                fields.put(key, ensureFieldSize(val));
            } else {
                // process the rest of the dotted path inside the nested object
                collectField(rest, val, fieldMap, level + 1);
            }
        } else {
            // check whether this would override an existing object
            Map<String, Object> fieldMap = getFieldMap(key, fields, false);
            // this is a regular field - check if we should parse the value as
            // an object
            if (isObjectField(key) || null != fieldMap) {
                if (null == fieldMap)
                    fieldMap = getFieldMap(key, fields, true);
                Map<String, Object> valMap = ContentBuilderUtilEx.tryParsingToMap(val);
                if (null != valMap) {
                    for (Map.Entry<String, Object> entry : valMap.entrySet()) {
                        fieldMap.put(entry.getKey(), ensureFieldSize(entry.getValue()));
                    }
                }
            } else {
                fields.put(key, ensureFieldSize(val));
            }
        }
    }

    /**
     * Returns the nested object map stored under key, or null if the key
     * already holds a primitive value. When createNew is true and the key is
     * absent, a new empty map is created, stored under key and returned.
     */
    @SuppressWarnings("unchecked")
    private Map<String, Object> getFieldMap(String key, Map<String, Object> fields, boolean createNew) {
        Object field = fields.get(key);
        if (null == field) {
            if (!createNew)
                return null;
            Map<String, Object> fieldMap = Maps.newHashMap();
            fields.put(key, fieldMap);
            return fieldMap;
        }
        if (field instanceof Map) {
            return (Map<String, Object>) field;
        }
        return null;
    }

    /**
     * Returns true when the given field was configured (via OBJECT_FIELDS) to
     * be parsed as a JSON object.
     */
    private boolean isObjectField(String fieldName) {
        return null != objectFields && null != fieldName && objectFields.containsKey(fieldName);
    }

    @Override
    public void configure(Context context) {
        // comma-separated list of fields whose values should be parsed as JSON
        String fields = context.getString(OBJECT_FIELDS);
        if (StringUtils.isNotBlank(fields)) {
            objectFields = new HashMap<String, Boolean>();
            for (String field : fields.split(",")) {
                String trimmed = field.trim();
                if (!trimmed.isEmpty())
                    objectFields.put(trimmed, true);
            }
        }
        if (isTruthy(context.getString(REMOVE_FIELDS_PREFIX))) {
            removeFieldsPrefix = true;
        }
        if (isTruthy(context.getString(COLLATE_OBJECTS))) {
            collateObjects = true;
        }
        String depth = context.getString(COLLATE_DEPTH);
        if (StringUtils.isNotBlank(depth)) {
            try {
                collateDepth = Integer.parseInt(depth);
            } catch (NumberFormatException ignored) {
                // malformed depth value - keep the default depth of 1
            }
        }
        if (isTruthy(context.getString(GENERATE_ID))) {
            generateId = true;
        }
    }

    /** Returns true for the configuration values "true" (any case) or "1". */
    private static boolean isTruthy(String value) {
        return "true".equalsIgnoreCase(value) || "1".equals(value);
    }

    @Override
    public void configure(ComponentConfiguration conf) {
        // NO-OP...
    }

    /**
     * Builds a document _id for the event when generateId is enabled: the
     * URL-safe base64 encoding of the MD5 hash of the serialized event bytes,
     * so identical events get identical ids. Returns null when id generation
     * is disabled or no id could be computed, letting ES auto-generate one.
     */
    @Override
    public String getDocumentId(BytesReference contentBytes) {
        if (!generateId)
            return null;
        String hashId = null;
        try {
            byte[] bytes = contentBytes.toBytes();
            if (contentBytes.length() > 0 && null != bytes) {
                MessageDigest md = MessageDigest.getInstance("MD5");
                byte[] digest = md.digest(bytes);
                hashId = Base64.encodeBytes(digest, Base64.URL_SAFE);
                // a 16-byte MD5 encodes to 24 base64 chars ending in "==" -
                // strip the padding
                if (hashId.endsWith("=="))
                    hashId = hashId.substring(0, hashId.length() - 2);
            }
        } catch (NoSuchAlgorithmException | IOException e) {
            // MD5 should always be available; fall back to the content's hash code
            Integer hash = contentBytes.hashCode();
            hashId = hash.toString();
        }
        if (null != hashId && !hashId.isEmpty())
            return hashId;
        return null;
    }
}