/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.gigya.flume; import static org.apache.flume.sink.elasticsearch.ElasticSearchSinkConstants.SERIALIZER; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.flume.Context; import org.apache.flume.Event; import org.apache.flume.conf.ComponentConfiguration; import org.apache.flume.sink.elasticsearch.DocumentIdBuilder; import org.apache.flume.sink.elasticsearch.ElasticSearchEventSerializer; import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; import org.elasticsearch.common.Base64; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.collect.Maps; import org.elasticsearch.common.io.BytesStream; import org.elasticsearch.common.xcontent.XContentBuilder; /** * An extended serializer for flume events into the same format LogStash uses</p> * This adds some more features on top of the default ES serializer that is part of * the Flume distribution.</p> * For more details see: https://github.com/gigya/flume-ng-elasticsearch-ser-ex * </p> * Logstash format: * * <pre> * { * "@timestamp": "2010-12-21T21:48:33.309258Z", * "@tags": [ "array", "of", "tags" ], * "@type": "string", * "@source": "source of the event, usually a URL." * "@source_host": "" * "@source_path": "" * "@fields":{ * # a set of fields for this event * "user": "jordan", * "command": "shutdown -r": * } * "@message": "the original plain-text message" * } * </pre> * * If the following headers are present, they will map to the above logstash * output as long as the logstash fields are not already present.</p> * * <pre> * message : String -> @message : String * or body : String -> @message : String * timestamp: long -> @timestamp:Date * host: String -> @source_host: String * src_path: String -> @source_path: String * type: String -> @type: String * source: String -> @source: String * </pre> * * * @author Rotem Hermon */ public class ExtendedElasticSearchLogStashEventSerializer implements ElasticSearchEventSerializer, DocumentIdBuilder { /** * Configuration property to set fields that might contain a JSON string, to be * parsed as an object */ public static final String OBJECT_FIELDS = "objectFields"; /** * Configuration property, set to true to remove the logstash '@fields' prefix * for custom fields */ public static final String REMOVE_FIELDS_PREFIX = "removeFieldsPrefix"; /** * Configuration property, set to true to collect dot notated field names into an * object (so 'params.f1' and 'params.f2' will be turned when indexed into * an object: { params : {f1 : ... , f2 : ... } } */ public static final String COLLATE_OBJECTS = "collateObjects"; /** * Configuration property to control the depth of object collating. * Default is 1, meaning only the first object level will be collated. * So for example: "params.f1.a" will be turned into * { "params" : { "f1.a" : ... } } * Set to -1 for unlimited levels. */ public static final String COLLATE_DEPTH = "collateDepth"; /** * Configuration property, set to true to generate an _id for the indexed event, * not letting ES to auto generate an _id. The _id is an MD5 of the serialized event. */ public static final String GENERATE_ID = "generateId"; private boolean generateId = false; private Map<String, Boolean> objectFields = null; private boolean removeFieldsPrefix = false; private boolean collateObjects = false; private int collateDepth = 1; public XContentBuilder getXContentBuilder(Event event) throws IOException { XContentBuilder builder = jsonBuilder().startObject(); appendHeaders(builder, event); return builder; } @Override public BytesStream getContentBuilder(Event event) throws IOException { return getXContentBuilder(event); } private void appendBody(XContentBuilder builder, Event event) throws IOException, UnsupportedEncodingException { byte[] body = event.getBody(); ContentBuilderUtilEx.appendField(builder, "@message", body, isObjectField("body")); } private void appendHeaders(XContentBuilder builder, Event event) throws IOException { Map<String, String> headers = Maps.newHashMap(event.getHeaders()); Map<String, Object> collatedFields = null; if (collateObjects) collatedFields = Maps.newHashMap(); // look for a "message" header and append as body if exists String message = ensureFieldSize(headers.get("message")); if (!StringUtils.isBlank(message) && StringUtils.isBlank(headers.get("@message"))) { ContentBuilderUtilEx.appendField(builder, "@message", message.getBytes(charset), isObjectField("message")); headers.remove("message"); } else { // if not, append the body as the message appendBody(builder, event); } String timestamp = ensureFieldSize(headers.get("timestamp")); if (!StringUtils.isBlank(timestamp) && StringUtils.isBlank(headers.get("@timestamp"))) { long timestampMs = Long.parseLong(timestamp); builder.field("@timestamp", new Date(timestampMs)); headers.remove("timestamp"); } String source = ensureFieldSize(headers.get("source")); if (!StringUtils.isBlank(source) && StringUtils.isBlank(headers.get("@source"))) { ContentBuilderUtilEx.appendField(builder, "@source", source.getBytes(charset)); headers.remove("source"); } String type = ensureFieldSize(headers.get("type")); if (!StringUtils.isBlank(type) && StringUtils.isBlank(headers.get("@type"))) { ContentBuilderUtilEx.appendField(builder, "@type", type.getBytes(charset)); headers.remove("type"); } String host = ensureFieldSize(headers.get("host")); if (!StringUtils.isBlank(host) && StringUtils.isBlank(headers.get("@source_host"))) { ContentBuilderUtilEx.appendField(builder, "@source_host", host.getBytes(charset)); headers.remove("host"); } String srcPath = ensureFieldSize(headers.get("src_path")); if (!StringUtils.isBlank(srcPath) && StringUtils.isBlank(headers.get("@source_path"))) { ContentBuilderUtilEx.appendField(builder, "@source_path", srcPath.getBytes(charset)); headers.remove("src_path"); } if (!removeFieldsPrefix) builder.startObject("@fields"); for (String key : headers.keySet()) { if (collateObjects) { collectField(key, headers.get(key), collatedFields, 1); } else { byte[] val = ensureFieldSize(headers.get(key)).getBytes(charset); ContentBuilderUtilEx.appendField(builder, key, val, isObjectField(key)); } } if (collateObjects) { for (String fieldName : collatedFields.keySet()) { ContentBuilderUtilEx.appendField(builder, fieldName, collatedFields.get(fieldName)); } } if (!removeFieldsPrefix) builder.endObject(); } private Object ensureFieldSize(Object field){ if (null != field){ Class type = field.getClass(); if (type == String.class) { return ensureFieldSize((String)field); } } return field; } private String ensureFieldSize(String field){ // Elasticsearch does not accept not analyzed fields that are bigger than 32K. // We will trim all fields that are bigger than that to avoid an error. if (null == field) return field; // allow some overhead to make sure we're not over the limit long size = field.length(); if (size < 30000) return field; return field.substring(0, 30000); } private void collectField(String key, String val, Map<String, Object> fields, int level) { // see if we have an object dot notation int pos = 0; if (collateDepth < 0 || level <= collateDepth){ pos = key.indexOf('.'); } if (pos > 0) { // this is an object field. get the field name String fieldName = key.substring(0, pos); String rest = key.substring(pos + 1); // get the field object. create a new map if not already there Map<String, Object> fieldMap = getFieldMap(fieldName, fields, true); // if the field was already set as a primitive type just write this // one // as a regular field and not as an object if (null == fieldMap) { fields.put(key, ensureFieldSize(val)); } else { // process the rest of the field collectField(rest, val, fieldMap, level+1); } } else { // check that this not overrides an existing object Map<String, Object> fieldMap = getFieldMap(key, fields, false); // this is a regular field, add the value. check if we should parse this // as an object if (isObjectField(key) || null != fieldMap){ if (null == fieldMap) fieldMap = getFieldMap(key, fields, true); Map<String,Object> valMap = ContentBuilderUtilEx.tryParsingToMap(val); if (null != valMap){ for (String fieldName : valMap.keySet()){ fieldMap.put(fieldName, ensureFieldSize(valMap.get(fieldName))); } } } else { fields.put(key, ensureFieldSize(val)); } } } @SuppressWarnings("unchecked") private Map<String, Object> getFieldMap(String key, Map<String, Object> fields, boolean createNew) { Map<String, Object> fieldMap = null; Object field = fields.get(key); if (null == field) { if (createNew) { fieldMap = Maps.newHashMap(); fields.put(key, fieldMap); } } else if (field instanceof Map) { fieldMap = (Map<String, Object>) field; } return fieldMap; } private boolean isObjectField(String fieldName) { if (null != objectFields && null != fieldName) { if (objectFields.containsKey(fieldName)) return true; } return false; } @Override public void configure(Context context) { // look for the objectFields configuration if (StringUtils.isNotBlank(context.getString(OBJECT_FIELDS))) { String fields = context.getString(OBJECT_FIELDS); if (null != fields) { objectFields = new HashMap<String, Boolean>(); String[] splitted = fields.split(","); for (int i = 0; i < splitted.length; i++) { String field = splitted[i].trim(); if (!field.isEmpty()) objectFields.put(field, true); } } } if (StringUtils.isNotBlank(context.getString(REMOVE_FIELDS_PREFIX))) { String remove = context.getString(REMOVE_FIELDS_PREFIX); if ("true".equalsIgnoreCase(remove) || "1".equalsIgnoreCase(remove)) { removeFieldsPrefix = true; } } if (StringUtils.isNotBlank(context.getString(COLLATE_OBJECTS))) { String remove = context.getString(COLLATE_OBJECTS); if ("true".equalsIgnoreCase(remove) || "1".equalsIgnoreCase(remove)) { collateObjects = true; } } if (StringUtils.isNotBlank(context.getString(COLLATE_DEPTH))) { String depth = context.getString(COLLATE_DEPTH); try{ collateDepth = Integer.parseInt(depth); } catch (Throwable t){ } } if (StringUtils.isNotBlank(context.getString(GENERATE_ID))) { String remove = context.getString(GENERATE_ID); if ("true".equalsIgnoreCase(remove) || "1".equalsIgnoreCase(remove)) { generateId = true; } } } @Override public void configure(ComponentConfiguration conf) { // NO-OP... } @Override public String getDocumentId(BytesReference contentBytes) { if (generateId) { // if we need to generate an _id for the event, get an MD5 hash for // the serialized // event bytes. String hashId = null; try { byte[] bytes = contentBytes.toBytes(); if (contentBytes.length() > 0 && null != bytes) { MessageDigest md = MessageDigest.getInstance("MD5"); byte[] thedigest = md.digest(bytes); hashId = Base64.encodeBytes(thedigest, Base64.URL_SAFE); // remove padding if (hashId.endsWith("==")) hashId = hashId.substring(0, hashId.length()-2); } } catch (NoSuchAlgorithmException | IOException e) { Integer hash = contentBytes.hashCode(); hashId = hash.toString(); } if (null != hashId && !hashId.isEmpty()) return hashId; } return null; } }