/* * Copyright © 2015 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.format; import co.cask.cdap.api.common.Bytes; import co.cask.cdap.api.data.format.StructuredRecord; import co.cask.cdap.api.data.format.UnexpectedFormatException; import co.cask.cdap.api.data.schema.Schema; import co.cask.cdap.api.data.schema.UnsupportedTypeException; import co.cask.cdap.api.flow.flowlet.StreamEvent; import co.cask.cdap.spi.stream.AbstractStreamEventRecordFormat; import com.google.common.collect.Lists; import java.util.Iterator; import java.util.List; /** * Stream record format that interprets stream body as data in Combined Log Format. * CLF format: remote_host remote_login auth_user [date] "request" status content_length referrer user_agent * Sample CLF data: * 220.181.108.77 - - [01/Feb/2015:06:59:57 +0000] "GET / HTTP/1.1" 301 295 "-" * "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" * */ public class CombinedLogRecordFormat extends AbstractStreamEventRecordFormat<StructuredRecord> { @Override public StructuredRecord read(StreamEvent event) throws UnexpectedFormatException { String bodyAsStr = Bytes.toString(event.getBody()); StructuredRecord.Builder builder = StructuredRecord.builder(schema); List<String> parts = getLogEntries(bodyAsStr); List<Schema.Field> fields = schema.getFields(); int index = 0; while (index < fields.size()) { Schema.Field field = fields.get(index); String val = (parts.size() < index || (parts.get(index).equals("-") && field.getSchema().getType() != Schema.Type.STRING)) ? null : parts.get(index); builder.convertAndSet(field.getName(), val); index++; } return builder.build(); } @Override protected Schema getDefaultSchema() { return Schema.recordOf("streamEvent", Schema.Field.of("remote_host", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("remote_login", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("auth_user", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("request_time", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("request", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("status", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("content_length", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("referrer", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("user_agent", Schema.nullableOf(Schema.of(Schema.Type.STRING)))); } @Override protected void validateSchema(Schema desiredSchema) throws UnsupportedTypeException { // a valid schema is a record of simple types. Iterator<Schema.Field> fields = desiredSchema.getFields().iterator(); while (fields.hasNext()) { Schema.Field field = fields.next(); Schema schema = field.getSchema(); boolean isSimple = schema.getType().isSimpleType(); boolean isNullableSimple = schema.isNullableSimple(); if (!isSimple && !isNullableSimple) { throw new UnsupportedTypeException("Field " + field.getName() + " is of invalid type."); } } } // parse CLF logEvent and get the record values. private List<String> getLogEntries(String logEvent) { List<String> parts = Lists.newArrayList(); int start = 0; while (start < logEvent.length()) { if (logEvent.charAt(start) == ' ') { // Skip empty spaces start++; } else { start = addNextLogEntry(logEvent, start, parts); } } return parts; } // addNextLogEntry and return the start position of next entry. private int addNextLogEntry(String data, int start, List<String> parts) { int end = -1; if (data.charAt(start) == '"') { // Find the closing '"' and extract values within start = start + 1; end = findNext(data, start, '"'); } else if (data.charAt(start) == '[') { // find the closing ']' and extract values start = start + 1; end = findNext(data, start, ']'); } else { // find the next ' ' and extract values end = findNext(data, start + 1, ' '); } if (end == -1) { throw new UnexpectedFormatException(String.format("Could not parse data in CLF format. Entry %s", data)); } parts.add(data.substring(start, end)); return end + 1; } // Find the next character matching the "entry". Skip the entry that is escaped. private int findNext(String data, int startPosition, char entry) { int position = startPosition; int length = data.length(); while (position < length) { if (data.charAt(position) == entry && (position == 0 || data.charAt(position - 1) != '\\')) { return position; } position++; } return -1; } }