/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.filter.value; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.addthis.bundle.util.ValueUtil; import com.addthis.bundle.value.ValueArray; import com.addthis.bundle.value.ValueFactory; import com.addthis.bundle.value.ValueObject; import com.addthis.codec.annotations.FieldConfig; /** * This {@link AbstractValueFilter ValueFilter} <span class="hydra-summary">performs regular expression matching on the input string</span>. * <p/> * <p>The default behavior is to perform regular expression matching on the input string, * and return an array with all the groups that match to the regular expression. You must * specify one or more groups in your regular expression. Group zero, the implicit group * that represents the entire match, is not returned. If the {@link #replace replace} * field is used, then all substrings that match against the pattern are replaced with * the replacement string and the output is a string. * <p/> * <p>The regex specification used by hydra (similar to perl 5) can be found <a href="http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html">here</a>. * Once you design a regex to match the desired pattern, you must encode it in a JSON string. * Notably, this means your pattern must be wrapped in double-quotes ("), and JSON string special characters, backslash (\), single-quote ('), and double-quote(") must be escaped by prefixing with a backslash. * Consider the follow examples: * <pre> * 1932741273804 desired text to match * \d+ regex pattern, using \d to match numeric characters * "\\d+" pattern encoded as JSON string, quote surrounded and \ escaped by \ * {op: "regex", pattern: "\\d+", replace: "string of numbers"} * </pre> * <pre> * "C:\WINDOWS" desired text to match (match enclosing quotes as well) * "C:\\WINDOWS" regex pattern, \ is escaped by \ according to regex standard * "\"C:\\\\WINDOWS\"" pattern encoded as JSON string, quote surrounded with \ and " escaped by \ * {op: "regex", pattern: "\"C:\\\\WINDOWS\"", replace: "LINUX"} * </pre> * </p> * <p>Example:</p> * <pre> * {from:"SOURCE", to:"SOURCE", regex:"Log_([0-9]+)\\."} * </pre> * * @user-reference */ public class ValueFilterRegex extends AbstractValueFilter { /** * Regular expression to match against. This field is required. */ @FieldConfig(codable = true, required = true) private Pattern pattern; /** * If non-null, then replace all matches with this string. Default is null. */ @FieldConfig(codable = true) private String replace; public ValueFilterRegex setPattern(Pattern p) { pattern = p; return this; } public ValueFilterRegex setReplace(String r) { replace = r; return this; } @Override public ValueObject filterValue(ValueObject value) { String sv = ValueUtil.asNativeString(value); if (sv == null) { return null; } Matcher matcher = pattern.matcher(sv); if (replace != null) { return ValueFactory.create(matcher.replaceAll(replace)); } ValueArray arr = ValueFactory.createArray(1); while (matcher.find()) { int count = matcher.groupCount(); for (int i = 1; i <= count; i++) { arr.add(ValueFactory.create(matcher.group(i))); } } return arr; } }