/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.ingest.common;
import org.jcodings.specific.UTF8Encoding;
import org.joni.Matcher;
import org.joni.NameEntry;
import org.joni.Option;
import org.joni.Regex;
import org.joni.Region;
import org.joni.Syntax;
import org.joni.exception.ValueException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
final class Grok {
private static final String NAME_GROUP = "name";
private static final String SUBNAME_GROUP = "subname";
private static final String PATTERN_GROUP = "pattern";
private static final String DEFINITION_GROUP = "definition";
private static final String GROK_PATTERN =
"%\\{" +
"(?<name>" +
"(?<pattern>[A-z0-9]+)" +
"(?::(?<subname>[A-z0-9_:.-]+))?" +
")" +
"(?:=(?<definition>" +
"(?:" +
"(?:[^{}]+|\\.+)+" +
")+" +
")" +
")?" + "\\}";
private static final Regex GROK_PATTERN_REGEX = new Regex(GROK_PATTERN.getBytes(StandardCharsets.UTF_8), 0,
GROK_PATTERN.getBytes(StandardCharsets.UTF_8).length, Option.NONE, UTF8Encoding.INSTANCE, Syntax.DEFAULT);
private final Map<String, String> patternBank;
private final boolean namedCaptures;
private final Regex compiledExpression;
private final String expression;
Grok(Map<String, String> patternBank, String grokPattern) {
this(patternBank, grokPattern, true);
}
@SuppressWarnings("unchecked")
Grok(Map<String, String> patternBank, String grokPattern, boolean namedCaptures) {
this.patternBank = patternBank;
this.namedCaptures = namedCaptures;
this.expression = toRegex(grokPattern);
byte[] expressionBytes = expression.getBytes(StandardCharsets.UTF_8);
this.compiledExpression = new Regex(expressionBytes, 0, expressionBytes.length, Option.DEFAULT, UTF8Encoding.INSTANCE);
}
public String groupMatch(String name, Region region, String pattern) {
try {
int number = GROK_PATTERN_REGEX.nameToBackrefNumber(name.getBytes(StandardCharsets.UTF_8), 0,
name.getBytes(StandardCharsets.UTF_8).length, region);
int begin = region.beg[number];
int end = region.end[number];
return new String(pattern.getBytes(StandardCharsets.UTF_8), begin, end - begin, StandardCharsets.UTF_8);
} catch (StringIndexOutOfBoundsException e) {
return null;
} catch (ValueException e) {
return null;
}
}
/**
* converts a grok expression into a named regex expression
*
* @return named regex expression
*/
public String toRegex(String grokPattern) {
byte[] grokPatternBytes = grokPattern.getBytes(StandardCharsets.UTF_8);
Matcher matcher = GROK_PATTERN_REGEX.matcher(grokPatternBytes);
int result = matcher.search(0, grokPatternBytes.length, Option.NONE);
if (result != -1) {
Region region = matcher.getEagerRegion();
String namedPatternRef = groupMatch(NAME_GROUP, region, grokPattern);
String subName = groupMatch(SUBNAME_GROUP, region, grokPattern);
// TODO(tal): Support definitions
String definition = groupMatch(DEFINITION_GROUP, region, grokPattern);
String patternName = groupMatch(PATTERN_GROUP, region, grokPattern);
String pattern = patternBank.get(patternName);
String grokPart;
if (namedCaptures && subName != null) {
grokPart = String.format(Locale.US, "(?<%s>%s)", namedPatternRef, pattern);
} else if (!namedCaptures) {
grokPart = String.format(Locale.US, "(?<%s>%s)", patternName + "_" + String.valueOf(result), pattern);
} else {
grokPart = String.format(Locale.US, "(?:%s)", pattern);
}
String start = new String(grokPatternBytes, 0, result, StandardCharsets.UTF_8);
String rest = new String(grokPatternBytes, region.end[0], grokPatternBytes.length - region.end[0], StandardCharsets.UTF_8);
return start + toRegex(grokPart + rest);
}
return grokPattern;
}
/**
* Checks whether a specific text matches the defined grok expression.
*
* @param text the string to match
* @return true if grok expression matches text, false otherwise.
*/
public boolean match(String text) {
Matcher matcher = compiledExpression.matcher(text.getBytes(StandardCharsets.UTF_8));
int result = matcher.search(0, text.length(), Option.DEFAULT);
return (result != -1);
}
/**
* Matches and returns any named captures within a compiled grok expression that matched
* within the provided text.
*
* @param text the text to match and extract values from.
* @return a map containing field names and their respective coerced values that matched.
*/
public Map<String, Object> captures(String text) {
byte[] textAsBytes = text.getBytes(StandardCharsets.UTF_8);
Map<String, Object> fields = new HashMap<>();
Matcher matcher = compiledExpression.matcher(textAsBytes);
int result = matcher.search(0, textAsBytes.length, Option.DEFAULT);
if (result != -1 && compiledExpression.numberOfNames() > 0) {
Region region = matcher.getEagerRegion();
for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
NameEntry e = entry.next();
String groupName = new String(e.name, e.nameP, e.nameEnd - e.nameP, StandardCharsets.UTF_8);
for (int number : e.getBackRefs()) {
if (region.beg[number] >= 0) {
String matchValue = new String(textAsBytes, region.beg[number], region.end[number] - region.beg[number],
StandardCharsets.UTF_8);
GrokMatchGroup match = new GrokMatchGroup(groupName, matchValue);
fields.put(match.getName(), match.getValue());
break;
}
}
}
return fields;
} else if (result != -1) {
return fields;
}
return null;
}
}