/* * Licensed to CRATE Technology GmbH ("Crate") under one or more contributor * license agreements. See the NOTICE file distributed with this work for * additional information regarding copyright ownership. Crate licenses * this file to you under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * However, if you have executed another commercial license agreement * with Crate these terms will supersede the license and you may use the * software solely pursuant to the terms of the relevant commercial agreement. */ package io.crate.operation.scalar.regex; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; import org.elasticsearch.common.lucene.BytesRefs; import javax.annotation.Nullable; import java.util.regex.Matcher; import java.util.regex.Pattern; public class RegexMatcher { private final Matcher matcher; private final CharsRef utf16 = new CharsRef(10); private final boolean globalFlag; public RegexMatcher(String regex, int flags, boolean globalFlag) { Pattern pattern = Pattern.compile(regex, flags); this.matcher = pattern.matcher(utf16); this.globalFlag = globalFlag; } public RegexMatcher(String regex, @Nullable BytesRef flags) { this(regex, parseFlags(BytesRefs.toString(flags)), isGlobal(BytesRefs.toString(flags))); } public RegexMatcher(String regex, @Nullable String flags) { this(regex, parseFlags(flags), isGlobal(flags)); } public RegexMatcher(String regex) { this(regex, 0, false); } private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) { if (charsRef.chars.length < bytes.length) { charsRef.chars = new char[bytes.length]; } charsRef.length = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars); } public boolean match(BytesRef term) { UTF8toUTF16(term, utf16); return matcher.reset().find(); } @Nullable public BytesRef[] groups() { try { if (matcher.groupCount() == 0) { return new BytesRef[]{new BytesRef(matcher.group())}; } BytesRef[] groups = new BytesRef[matcher.groupCount()]; // skip first group (the original string) for (int i = 1; i <= matcher.groupCount(); i++) { String group = matcher.group(i); if (group != null) { groups[i - 1] = new BytesRef(group); } else { groups[i - 1] = null; } } return groups; } catch (IllegalStateException e) { // no match -> no groups } return null; } public BytesRef replace(BytesRef term, String replacement) { UTF8toUTF16(term, utf16); if (globalFlag) { return new BytesRef(matcher.replaceAll(replacement)); } else { return new BytesRef(matcher.replaceFirst(replacement)); } } public static int parseFlags(@Nullable String flagsString) { int flags = 0; if (flagsString == null) { return flags; } for (char flag : flagsString.toCharArray()) { switch (flag) { case 'i': flags = flags | Pattern.CASE_INSENSITIVE; break; case 'u': flags = flags | Pattern.UNICODE_CASE; break; case 'U': flags = flags | Pattern.UNICODE_CHARACTER_CLASS; break; case 's': flags = flags | Pattern.DOTALL; break; case 'm': flags = flags | Pattern.MULTILINE; break; case 'x': flags = flags | Pattern.COMMENTS; break; case 'd': flags = flags | Pattern.UNIX_LINES; break; default: break; } } return flags; } public static boolean isGlobal(@Nullable String flags) { if (flags == null) { return false; } return flags.indexOf('g') != -1; } // PCRE features public static final String character_classes = "dDsSwW"; public static final String boundary_matchers = "bBAGZz"; public static final String embedded_flags = "idmsuxU"; // recognize pcre escaped sequences anywhere inside the pattern public static final String escape_sequences_pattern = ".*\\\\[" + character_classes + boundary_matchers + "].*"; // recognize pcre embedded flags at the beginning of the pattern public static final String embedded_flags_pattern = "^\\(\\?[" + embedded_flags + "]\\).*"; // final precompiled java.util.regex.Pattern public static final Pattern pcre_pattern = Pattern.compile(escape_sequences_pattern + "|" + embedded_flags_pattern); /** * Determine whether regex pattern contains PCRE features, e.g. * - predefined character classes like \d, \D, \s, \S, \w, \W * - boundary matchers like \b, \B, \A, \G, \Z, \z * - embedded flag expressions like (?i), (?d), etc. * * @see java.util.regex.Pattern */ public static boolean isPcrePattern(String pattern) { return pcre_pattern.matcher(pattern).matches(); } }