/** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.filter; import com.google.protobuf.InvalidProtocolBufferException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.util.Arrays; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.classification.InterfaceStability; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos; import org.apache.hadoop.hbase.util.Bytes; import org.jcodings.Encoding; import org.jcodings.EncodingDB; import org.jcodings.specific.UTF8Encoding; import org.joni.Matcher; import org.joni.Option; import org.joni.Regex; import org.joni.Syntax; /** * This comparator is for use with {@link CompareFilter} implementations, such * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for * filtering based on the value of a given column. Use it to test if a given * regular expression matches a cell value in the column. * <p> * Only EQUAL or NOT_EQUAL comparisons are valid with this comparator. * <p> * For example: * <p> * <pre> * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, * new RegexStringComparator( * // v4 IP address * "(((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3,3}" + * "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(\\/[0-9]+)?" + * "|" + * // v6 IP address * "((([\\dA-Fa-f]{1,4}:){7}[\\dA-Fa-f]{1,4})(:([\\d]{1,3}.)" + * "{3}[\\d]{1,3})?)(\\/[0-9]+)?")); * </pre> * <p> * Supports {@link java.util.regex.Pattern} flags as well: * <p> * <pre> * ValueFilter vf = new ValueFilter(CompareOp.EQUAL, * new RegexStringComparator("regex", Pattern.CASE_INSENSITIVE | Pattern.DOTALL)); * </pre> * @see java.util.regex.Pattern */ @InterfaceAudience.Public @InterfaceStability.Stable public class RegexStringComparator extends ByteArrayComparable { private static final Log LOG = LogFactory.getLog(RegexStringComparator.class); private Engine engine; /** Engine implementation type (default=JAVA) */ @InterfaceAudience.Public @InterfaceStability.Stable public enum EngineType { JAVA, JONI } /** * Constructor * Adds Pattern.DOTALL to the underlying Pattern * @param expr a valid regular expression */ public RegexStringComparator(String expr) { this(expr, Pattern.DOTALL); } /** * Constructor * Adds Pattern.DOTALL to the underlying Pattern * @param expr a valid regular expression * @param engine engine implementation type */ public RegexStringComparator(String expr, EngineType engine) { this(expr, Pattern.DOTALL, engine); } /** * Constructor * @param expr a valid regular expression * @param flags java.util.regex.Pattern flags */ public RegexStringComparator(String expr, int flags) { this(expr, flags, EngineType.JAVA); } /** * Constructor * @param expr a valid regular expression * @param flags java.util.regex.Pattern flags * @param engine engine implementation type */ public RegexStringComparator(String expr, int flags, EngineType engine) { super(Bytes.toBytes(expr)); switch (engine) { case JAVA: this.engine = new JavaRegexEngine(expr, flags); break; case JONI: this.engine = new JoniRegexEngine(expr, flags); break; } } /** * Specifies the {@link Charset} to use to convert the row key to a String. * <p> * The row key needs to be converted to a String in order to be matched * against the regular expression. This method controls which charset is * used to do this conversion. * <p> * If the row key is made of arbitrary bytes, the charset {@code ISO-8859-1} * is recommended. * @param charset The charset to use. */ public void setCharset(final Charset charset) { engine.setCharset(charset.name()); } @Override public int compareTo(byte[] value, int offset, int length) { return engine.compareTo(value, offset, length); } /** * @return The comparator serialized using pb */ @Override public byte [] toByteArray() { return engine.toByteArray(); } /** * @param pbBytes A pb serialized {@link RegexStringComparator} instance * @return An instance of {@link RegexStringComparator} made from <code>bytes</code> * @throws DeserializationException * @see #toByteArray */ public static RegexStringComparator parseFrom(final byte [] pbBytes) throws DeserializationException { ComparatorProtos.RegexStringComparator proto; try { proto = ComparatorProtos.RegexStringComparator.parseFrom(pbBytes); } catch (InvalidProtocolBufferException e) { throw new DeserializationException(e); } RegexStringComparator comparator; if (proto.hasEngine()) { EngineType engine = EngineType.valueOf(proto.getEngine()); comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(), engine); } else { comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags()); } String charset = proto.getCharset(); if (charset.length() > 0) { try { comparator.getEngine().setCharset(charset); } catch (IllegalCharsetNameException e) { LOG.error("invalid charset", e); } } return comparator; } /** * @param other * @return true if and only if the fields of the comparator that are serialized * are equal to the corresponding fields in other. Used for testing. */ @Override boolean areSerializedFieldsEqual(ByteArrayComparable other) { if (other == this) return true; if (!(other instanceof RegexStringComparator)) return false; RegexStringComparator comparator = (RegexStringComparator)other; return super.areSerializedFieldsEqual(comparator) && engine.getClass().isInstance(comparator.getEngine()) && engine.getPattern().equals(comparator.getEngine().getPattern()) && engine.getFlags() == comparator.getEngine().getFlags() && engine.getCharset().equals(comparator.getEngine().getCharset()); } Engine getEngine() { return engine; } /** * This is an internal interface for abstracting access to different regular * expression matching engines. */ static interface Engine { /** * Returns the string representation of the configured regular expression * for matching */ String getPattern(); /** * Returns the set of configured match flags, a bit mask that may include * {@link Pattern} flags */ int getFlags(); /** * Returns the name of the configured charset */ String getCharset(); /** * Set the charset used when matching * @param charset the name of the desired charset for matching */ void setCharset(final String charset); /** * Return the serialized form of the configured matcher */ byte [] toByteArray(); /** * Match the given input against the configured pattern * @param value the data to be matched * @param offset offset of the data to be matched * @param length length of the data to be matched * @return 0 if a match was made, 1 otherwise */ int compareTo(byte[] value, int offset, int length); } /** * Implementation of the Engine interface using Java's Pattern. * <p> * This is the default engine. */ static class JavaRegexEngine implements Engine { private Charset charset = Charset.forName("UTF-8"); private Pattern pattern; public JavaRegexEngine(String regex, int flags) { this.pattern = Pattern.compile(regex, flags); } @Override public String getPattern() { return pattern.toString(); } @Override public int getFlags() { return pattern.flags(); } @Override public String getCharset() { return charset.name(); } @Override public void setCharset(String charset) { this.charset = Charset.forName(charset); } @Override public int compareTo(byte[] value, int offset, int length) { // Use find() for subsequence match instead of matches() (full sequence // match) to adhere to the principle of least surprise. String tmp; if (length < value.length / 2) { // See HBASE-9428. Make a copy of the relevant part of the byte[], // or the JDK will copy the entire byte[] during String decode tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset); } else { tmp = new String(value, offset, length, charset); } return pattern.matcher(tmp).find() ? 0 : 1; } @Override public byte[] toByteArray() { ComparatorProtos.RegexStringComparator.Builder builder = ComparatorProtos.RegexStringComparator.newBuilder(); builder.setPattern(pattern.pattern()); builder.setPatternFlags(pattern.flags()); builder.setCharset(charset.name()); builder.setEngine(EngineType.JAVA.name()); return builder.build().toByteArray(); } } /** * Implementation of the Engine interface using Jruby's joni regex engine. * <p> * This engine operates on byte arrays directly so is expected to be more GC * friendly, and reportedly is twice as fast as Java's Pattern engine. * <p> * NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and * MULTILINE are supported. */ static class JoniRegexEngine implements Engine { private Encoding encoding = UTF8Encoding.INSTANCE; private String regex; private Regex pattern; public JoniRegexEngine(String regex, int flags) { this.regex = regex; byte[] b = Bytes.toBytes(regex); this.pattern = new Regex(b, 0, b.length, patternToJoniFlags(flags), encoding, Syntax.Java); } @Override public String getPattern() { return regex; } @Override public int getFlags() { return pattern.getOptions(); } @Override public String getCharset() { return encoding.getCharsetName(); } @Override public void setCharset(String name) { setEncoding(name); } @Override public int compareTo(byte[] value, int offset, int length) { // Use subsequence match instead of full sequence match to adhere to the // principle of least surprise. Matcher m = pattern.matcher(value); return m.search(offset, length, pattern.getOptions()) < 0 ? 1 : 0; } @Override public byte[] toByteArray() { ComparatorProtos.RegexStringComparator.Builder builder = ComparatorProtos.RegexStringComparator.newBuilder(); builder.setPattern(regex); builder.setPatternFlags(joniToPatternFlags(pattern.getOptions())); builder.setCharset(encoding.getCharsetName()); builder.setEngine(EngineType.JONI.name()); return builder.build().toByteArray(); } private int patternToJoniFlags(int flags) { int newFlags = 0; if ((flags & Pattern.CASE_INSENSITIVE) != 0) { newFlags |= Option.IGNORECASE; } if ((flags & Pattern.DOTALL) != 0) { // This does NOT mean Pattern.MULTILINE newFlags |= Option.MULTILINE; } if ((flags & Pattern.MULTILINE) != 0) { // This is what Java 8's Nashorn engine does when using joni and // translating Pattern's MULTILINE flag newFlags &= ~Option.SINGLELINE; newFlags |= Option.NEGATE_SINGLELINE; } return newFlags; } private int joniToPatternFlags(int flags) { int newFlags = 0; if ((flags & Option.IGNORECASE) != 0) { newFlags |= Pattern.CASE_INSENSITIVE; } // This does NOT mean Pattern.MULTILINE, this is equivalent to Pattern.DOTALL if ((flags & Option.MULTILINE) != 0) { newFlags |= Pattern.DOTALL; } // This means Pattern.MULTILINE. Nice if ((flags & Option.NEGATE_SINGLELINE) != 0) { newFlags |= Pattern.MULTILINE; } return newFlags; } private void setEncoding(String name) { EncodingDB.Entry e = EncodingDB.getEncodings().get(Bytes.toBytes(name)); if (e != null) { encoding = e.getEncoding(); } else { throw new IllegalCharsetNameException(name); } } } }