/* * Copyright (C) 2015 Stratio (http://stratio.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.stratio.morphlines.commons.dict; /** * Created by dvazquez on 25/11/16. */ import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.io.CharStreams; import org.kitesdk.morphline.shaded.com.google.common.io.Closeables; import com.typesafe.config.Config; import org.kitesdk.morphline.api.MorphlineCompilationException; import org.kitesdk.morphline.base.Configs; import org.kitesdk.morphline.shaded.com.google.code.regexp.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.net.URL; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; /** * Utility that parses and resolves a set of grok dictionaries ala logstash. */ public final class VgrokDictionaries { /* * Uses a shaded version of com.google.code.regexp-0.2.3 to minimize potential dependency issues. * See https://github.com/tony19/named-regexp */ private final Config config; private final Map<String, String> dictionary = new HashMap<String, String>(); private final Logger LOG = LoggerFactory.getLogger(VgrokDictionaries.class); public VgrokDictionaries(Config config, Configs configs) { this.config = config; try { // Load dictionaries from the classpath. for (String dictionaryResource : configs.getStringList(config, "dictionaryResources", Collections.<String>emptyList())) { ClassLoader classLoader = getClass().getClassLoader(); URL resource = classLoader.getResource(dictionaryResource); Preconditions.checkArgument( resource != null, "Can not find grok dictionary resource:%s (classLoader:%s)", dictionaryResource, classLoader ); LOG.debug("Loading grok dictionary:{} source:classpath", dictionaryResource); InputStreamReader inputStreamReader = new InputStreamReader(resource.openStream(), Charsets.UTF_8); loadDictionary(inputStreamReader); inputStreamReader.close(); } // Load dictionaries from the filesystem. for (String dictionaryFile : configs.getStringList(config, "dictionaryFiles", Collections.<String>emptyList())) { LOG.debug("Loading grok dictionary:{} source:filesystem", dictionaryFile); loadDictionaryFile(new File(dictionaryFile)); } // Load inline dictionary definitions. String dictionaryString = configs.getString(config, "dictionaryString", ""); if (LOG.isDebugEnabled() && !dictionaryString.isEmpty()) { LOG.debug("Loading inline grok dictionary:{}", dictionaryString); } loadDictionary(new StringReader(dictionaryString)); } catch (IOException e) { throw new MorphlineCompilationException("Cannot compile grok dictionary", config, e); } resolveDictionaryExpressions(); } public Pattern compileExpression(String expr) { expr = resolveExpression(expr); //LOG.debug("expr after : {}", expr); // TODO extract and replace conversions (?<queue_field:int>foo) return Pattern.compile(expr); } private Config getConfig() { return config; } private void loadDictionaryFile(File fileOrDir) throws IOException { if (!fileOrDir.exists()) { throw new FileNotFoundException("File not found: " + fileOrDir); } if (!fileOrDir.canRead()) { throw new IOException("Insufficient permissions to read file: " + fileOrDir); } if (fileOrDir.isDirectory()) { File[] files = fileOrDir.listFiles(); Arrays.sort(files); for (File file : files) { loadDictionaryFile(file); } } else { Reader reader = new InputStreamReader(new FileInputStream(fileOrDir), Charsets.UTF_8); try { loadDictionary(reader); } finally { Closeables.closeQuietly(reader); } } } private void loadDictionary(Reader reader) throws IOException { for (String line : CharStreams.readLines(reader)) { line = line.trim(); if (line.length() == 0) { continue; // ignore empty lines } if (line.startsWith("#")) { continue; // ignore comment lines } int i = line.indexOf(" "); if (i < 0) { throw new MorphlineCompilationException("Dictionary entry line must contain a space to separate name and value: " + line, getConfig()); } if (i == 0) { throw new MorphlineCompilationException("Dictionary entry line must contain a name: " + line, getConfig()); } String name = line.substring(0, i); String value = line.substring(i + 1, line.length()).trim(); if (value.length() == 0) { throw new MorphlineCompilationException("Dictionary entry line must contain a value: " + line, getConfig()); } dictionary.put(name, value); } } private void resolveDictionaryExpressions() { boolean wasModified = true; while (wasModified) { wasModified = false; for (Map.Entry<String, String> entry : dictionary.entrySet()) { String expr = entry.getValue(); String resolvedExpr = resolveExpression(expr); wasModified = (expr != resolvedExpr); if (wasModified) { entry.setValue(resolvedExpr); break; } } } LOG.debug("dictionary: {}", Joiner.on("\n").join( new TreeMap<String,String>(dictionary).entrySet())); for (Map.Entry<String, String> entry : dictionary.entrySet()) { Pattern.compile(entry.getValue()); // validate syntax } } private String resolveExpression(String expr) { String PATTERN_START = "%{"; String PATTERN_END= "}"; char SEPARATOR = ':'; while (true) { int i = expr.indexOf(PATTERN_START); if (i < 0) { break; } int j = expr.indexOf(PATTERN_END, i + PATTERN_START.length()); if (j < 0) { break; } String grokPattern = expr.substring(i + PATTERN_START.length(), j); //LOG.debug("grokPattern=" + grokPattern + ", entryValue=" + entryValue); int p = grokPattern.indexOf(SEPARATOR); String regexName = grokPattern; String groupName = null; String conversion = null; // FIXME if (p >= 0) { regexName = grokPattern.substring(0, p); groupName = grokPattern.substring(p+1, grokPattern.length()); int q = groupName.indexOf(SEPARATOR); if (q >= 0) { conversion = groupName.substring(q+1, groupName.length()); groupName = groupName.substring(0, q); } } //LOG.debug("patternName=" + patternName + ", groupName=" + groupName + ", conversion=" + conversion); String refValue = dictionary.get(regexName); if (refValue == null) { throw new MorphlineCompilationException("Missing value for name: " + regexName, getConfig()); } if (refValue.contains(PATTERN_START)) { break; // not a literal value; defer resolution until next iteration } String replacement = refValue; if (groupName != null) { // named capturing group replacement = "(?<" + groupName + ">" + refValue + ")"; } expr = new StringBuilder(expr).replace(i, j + PATTERN_END.length(), replacement).toString(); } return expr; } }