/*
* Copyright (C) 2015 Stratio (http://stratio.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.morphlines.commons;
import com.google.common.base.Preconditions;
import com.stratio.morphlines.commons.dict.VgrokDictionaries;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Validator;
import org.kitesdk.morphline.shaded.com.google.code.regexp.GroupInfo;
import org.kitesdk.morphline.shaded.com.google.code.regexp.Matcher;
import java.util.*;
/**
* The Vgrok command uses regular expression pattern matching to extract structured fields from
* unstructured log data.
* <p>
* If the pattern matching fails, the field which causes the error is added to a list. At the end of the command,
* all error fields are pritned to a new field in the output record (vgrok_error_fields).
*/
public final class VgrokBuilder implements CommandBuilder {
/*
* Uses a shaded version of com.google.code.regexp-0.2.3 to minimize potential dependency issues.
* See https://github.com/tony19/named-regexp
*/
@Override
public Collection<String> getNames() {
return Collections.singletonList("vgrok");
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
return new Vgrok(this, config, parent, child, context);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class Vgrok extends AbstractCommand {
private final String VGROK_ERROR_FIELD_NAME = "vgrok_error_fields";
private final List<Regex> regexes = new ArrayList<Regex>();
private final boolean extract;
private final boolean extractInPlace;
private final NumRequiredMatches numRequiredMatches;
private final boolean findSubstrings;
private final boolean addEmptyStrings;
private final String firstKey; // cached value
private final String renderedConfig; // cached value
public Vgrok(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
super(builder, config, parent, child, context);
VgrokDictionaries dict = new VgrokDictionaries(config, getConfigs());
Config exprConfig = getConfigs().getConfig(config, "expressions", ConfigFactory.empty());
for (Map.Entry<String, Object> entry : new Configs().getEntrySet(exprConfig)) {
String expr = entry.getValue().toString();
this.regexes.add(new Regex(entry.getKey(), dict.compileExpression(expr).matcher("")));
}
this.firstKey = (regexes.size() == 0 ? null : regexes.iterator().next().getRecordInputField());
String extractStr = getConfigs().getString(config, "extract", "true");
this.extractInPlace = extractStr.equals("inplace");
if (extractInPlace) {
this.extract = true;
} else {
this.extract = getConfigs().getBoolean(config, "extract", true);
}
this.numRequiredMatches = new Validator<NumRequiredMatches>().validateEnum(
config,
getConfigs().getString(config, "numRequiredMatches", NumRequiredMatches.atLeastOnce.toString()),
NumRequiredMatches.class);
this.findSubstrings = getConfigs().getBoolean(config, "findSubstrings", false);
this.addEmptyStrings = getConfigs().getBoolean(config, "addEmptyStrings", false);
validateArguments();
this.renderedConfig = config.root().render();
}
@Override
protected boolean doProcess(Record inputRecord) {
Record outputRecord;
outputRecord = ((extractInPlace || !extract) ? inputRecord : inputRecord.copy());
if (extractInPlace) {
// Ensure that we mutate the record inplace only if *all* expressions match.
// To ensure this we potentially run doMatch() twice: the first time to check, the second
// time to mutate
boolean isFast;
if (regexes.size() == 0) {
isFast = true;
} else if (regexes.size() > 1) {
isFast = false;
} else if (numRequiredMatches == NumRequiredMatches.atLeastOnce) {
isFast = true;
} else { // all or once
assert regexes.size() == 1;
assert firstKey != null;
isFast = (inputRecord.get(firstKey).size() <= 1);
}
if (!isFast) {
if (!doMatch(inputRecord, outputRecord, false)) {
return false;
}
} else {
; // no need to do anything
// This is a performance enhancement for some cases with a single expression:
// By the time we find a regex match we know that the whole command will succeed,
// so there's really no need to run doMatch() twice.
}
}
if (!doMatch(inputRecord, outputRecord, extract)) {
return false;
}
// pass record to next command in chain:
return super.doProcess(outputRecord);
}
private boolean doMatch(Record inputRecord, Record outputRecord, boolean doExtract) {
List<String> errorFields = new ArrayList<String>();
for (Regex regex : regexes) {
Matcher matcher = regex.getMatcher();
List values = inputRecord.get(regex.getRecordInputField());
int todo = values.size();
int minMatches = 1;
int maxMatches = Integer.MAX_VALUE;
switch (numRequiredMatches) {
case once : {
maxMatches = 1;
break;
}
case all : {
minMatches = todo;
break;
}
default: {
break;
}
}
int numMatches = 0;
for (Object value : values) {
matcher.reset(value.toString());
if (!findSubstrings) {
if (matcher.matches()) {
numMatches++;
if (numMatches > maxMatches) {
LOG.debug("grok failed because it found too many matches for values: {} for grok command: {}",
values, renderedConfig);
errorFields.add(regex.getRecordInputField());
}
extract(outputRecord, regex, doExtract);
}
} else {
int previousNumMatches = numMatches;
while (matcher.find()) {
if (numMatches == previousNumMatches) {
numMatches++;
if (numMatches > maxMatches) {
LOG.debug("grok failed because it found too many matches for values: {} for grok command: {}",
values, renderedConfig);
errorFields.add(regex.getRecordInputField());
}
if (!doExtract && numMatches >= minMatches && maxMatches == Integer.MAX_VALUE) {
break; // fast path
}
}
extract(outputRecord, regex, doExtract);
}
}
todo--;
if (!doExtract && numMatches >= minMatches && maxMatches == Integer.MAX_VALUE) {
break; // fast path
}
}
if (numMatches + todo < minMatches) {
LOG.debug("grok failed because it found too few matches for values: {} for grok command: {}",
values, renderedConfig);
errorFields.add(regex.getRecordInputField());
}
}
if(!errorFields.isEmpty()) {
outputRecord.put(VGROK_ERROR_FIELD_NAME, errorFields.toString());
}
return true;
}
private void extract(Record outputRecord, Regex regex, boolean doExtract) {
if (doExtract) {
regex.extract(outputRecord, addEmptyStrings);
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/* Caches various regex matcher info for best performance */
private static final class Regex {
private final String recordInputField;
private final Matcher matcher;
private final String[] groupNames;
private final int[] groupNumbers;
public Regex(String recordInputField, Matcher matcher) {
Preconditions.checkNotNull(recordInputField);
Preconditions.checkNotNull(matcher);
this.recordInputField = recordInputField;
this.matcher = matcher;
int size = 0;
for (Map.Entry<String, List<GroupInfo>> entry : matcher.namedPattern().groupInfo().entrySet()) {
size += entry.getValue().size();
}
this.groupNames = new String[size];
this.groupNumbers = new int[size];
int i = 0;
for (Map.Entry<String, List<GroupInfo>> entry : matcher.namedPattern().groupInfo().entrySet()) {
String groupName = entry.getKey();
assert groupName != null;
List<GroupInfo> groupInfos = entry.getValue();
for (GroupInfo groupInfo : groupInfos) {
int idx = groupInfo.groupIndex();
int group = idx > -1 ? idx + 1 : -1;
groupNames[i] = groupName;
groupNumbers[i] = group;
i++;
}
}
assert i == size;
}
public void extract(Record outputRecord, boolean addEmptyStrings) {
for (int i = 0; i < groupNumbers.length; i++) {
String value = matcher.group(groupNumbers[i]);
if (value != null && (value.length() > 0 || addEmptyStrings)) {
outputRecord.put(groupNames[i], value);
}
}
}
public String getRecordInputField() {
return recordInputField;
}
public Matcher getMatcher() {
return matcher;
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static enum NumRequiredMatches {
atLeastOnce,
once,
all
}
}
}