/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.format;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.format.UnexpectedFormatException;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.spi.stream.AbstractStreamEventRecordFormat;
import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.Resources;
import oi.thekraken.grok.api.Grok;
import oi.thekraken.grok.api.Match;
import oi.thekraken.grok.api.exception.GrokException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;

/**
 * GrokRecordFormat. Grok parses a string and outputs a map of field name (string) to value (string).
 * The grok pattern is configured through the "pattern" setting; if no pattern is given, the default
 * {@code %{GREEDYDATA:body}} is used.
 */
public class GrokRecordFormat extends AbstractStreamEventRecordFormat<StructuredRecord> {
  private static final Logger LOG = LoggerFactory.getLogger(GrokRecordFormat.class);
  private static final String DEFAULT_PATTERN = "%{GREEDYDATA:body}";
  private static final String PATTERN_SETTING = "pattern";

  private final Grok grok = new Grok();
  private String pattern = null;

  public static Map<String, String> settings(String pattern) {
    return ImmutableMap.of(PATTERN_SETTING, pattern);
  }

  @Override
  public StructuredRecord read(StreamEvent event) throws UnexpectedFormatException {
    String bodyAsStr = Bytes.toString(event.getBody(), Charsets.UTF_8);
    StructuredRecord.Builder builder = StructuredRecord.builder(schema);

    // run the configured grok pattern against the event body and copy matched values into the record
    Match gm = grok.match(bodyAsStr);
    gm.captures();
    Map<String, Object> matchedValues = gm.toMap();
    for (Schema.Field field : schema.getFields()) {
      String fieldName = field.getName();
      Object value = matchedValues.get(fieldName);
      if (value != null) {
        builder.convertAndSet(fieldName, value.toString());
      }
    }

    return builder.build();
  }

  @Override
  protected Schema getDefaultSchema() {
    // default is a record with a single nullable string field named "body"
    return Schema.recordOf("streamEvent",
                           Schema.Field.of("body", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
  }

  @Override
  protected void validateSchema(Schema desiredSchema) throws UnsupportedTypeException {
    // a valid schema is a record of simple types. In other words, no maps, arrays, records, unions, or enums allowed.
    // the exception is the very last field, which is allowed to be an array of simple types.
    // These types may be nullable, which is a union of a null and non-null type.
    Iterator<Schema.Field> fields = desiredSchema.getFields().iterator();
    // check that each field is a simple field, except for the very last field, which can be an array of simple types.
    while (fields.hasNext()) {
      Schema.Field field = fields.next();
      Schema schema = field.getSchema();
      // if we're not on the very last field, the field must be a simple type or a nullable simple type.
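      // e.g. a record of (ip: string, code: nullable int, parts: array<string>) passes, since only the
      // trailing field is an array of strings; the same record with "parts" anywhere but last is rejected.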
      boolean isSimple = schema.getType().isSimpleType();
      boolean isNullableSimple = schema.isNullableSimple();
      if (!isSimple && !isNullableSimple) {
        // if this is the very last field and a string array, it is valid. otherwise it is not.
        if (fields.hasNext() || !isStringArray(schema)) {
          throw new UnsupportedTypeException("Field " + field.getName() + " is of invalid type.");
        }
      }
    }
  }

  // check that it's an array of strings or array of nullable strings. the array itself can also be nullable.
  private boolean isStringArray(Schema schema) {
    Schema arrSchema = schema.isNullable() ? schema.getNonNullable() : schema;
    if (arrSchema.getType() == Schema.Type.ARRAY) {
      Schema componentSchema = arrSchema.getComponentSchema();
      if (componentSchema.isNullable()) {
        return componentSchema.getNonNullable().getType() == Schema.Type.STRING;
      } else {
        return componentSchema.getType() == Schema.Type.STRING;
      }
    }
    return false;
  }

  @Override
  protected void configure(Map<String, String> settings) {
    addPatterns(grok);
    try {
      this.pattern = determinePattern(settings);
      grok.compile(pattern);
    } catch (GrokException e) {
      LOG.error("Failed to compile grok pattern '{}'", pattern, e);
    }
  }

  protected void addPatterns(Grok grok) {
    addPattern(grok, "cdap/grok/patterns/firewalls");
    addPattern(grok, "cdap/grok/patterns/grok-patterns");
    addPattern(grok, "cdap/grok/patterns/haproxy");
    addPattern(grok, "cdap/grok/patterns/java");
    addPattern(grok, "cdap/grok/patterns/junos");
    addPattern(grok, "cdap/grok/patterns/linux-syslog");
    addPattern(grok, "cdap/grok/patterns/mcollective");
    addPattern(grok, "cdap/grok/patterns/mcollective-patterns");
    addPattern(grok, "cdap/grok/patterns/mongodb");
    addPattern(grok, "cdap/grok/patterns/nagios");
    addPattern(grok, "cdap/grok/patterns/postgresql");
    addPattern(grok, "cdap/grok/patterns/redis");
    addPattern(grok, "cdap/grok/patterns/ruby");
  }

  protected String determinePattern(Map<String, String> settings) {
    if (!settings.containsKey(PATTERN_SETTING)) {
      return DEFAULT_PATTERN;
    } else {
      return settings.get(PATTERN_SETTING);
    }
  }

  protected void addPattern(Grok grok, String resource) {
    URL url = this.getClass().getClassLoader().getResource(resource);
    if (url == null) {
      LOG.error("Resource '{}' for grok pattern was not found", resource);
      return;
    }

    try {
      String patternFile = Resources.toString(url, Charsets.UTF_8);
      grok.addPatternFromReader(new StringReader(patternFile));
    } catch (IOException e) {
      LOG.error("Failed to load resource '{}' for grok pattern", resource, e);
    } catch (GrokException e) {
      LOG.error("Invalid grok pattern from resource '{}'", resource, e);
    }
  }

  public String getPattern() {
    return pattern;
  }
}