/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.type; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.block.BlockBuilder; import com.facebook.presto.spi.block.BlockBuilderStatus; import com.google.re2j.Matcher; import com.google.re2j.Options; import com.google.re2j.Pattern; import io.airlift.log.Logger; import io.airlift.slice.Slice; import static com.facebook.presto.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT; import static com.facebook.presto.spi.type.VarcharType.VARCHAR; import static com.google.common.base.Preconditions.checkState; import static com.google.re2j.Options.Algorithm.DFA_FALLBACK_TO_NFA; import static java.lang.Math.toIntExact; import static java.lang.String.format; public final class Re2JRegexp { private static final Logger log = Logger.get(Re2JRegexp.class); private static final java.util.regex.Pattern DOT_STAR_PREFIX_PATTERN = java.util.regex.Pattern.compile("(?s)^(\\.\\*\\??)?(.*)"); private static final int CORE_PATTERN_INDEX = 2; public final int dfaStatesLimit; public final int dfaRetries; public final Pattern re2jPattern; public final Pattern re2jPatternWithoutDotStartPrefix; public Re2JRegexp(int dfaStatesLimit, int dfaRetries, Slice pattern) { this.dfaStatesLimit = dfaStatesLimit; this.dfaRetries = dfaRetries; Options options = Options.builder() .setAlgorithm(DFA_FALLBACK_TO_NFA) .setMaximumNumberOfDFAStates(dfaStatesLimit) .setNumberOfDFARetries(dfaRetries) .setEventsListener(new RE2JEventsListener()) .build(); String patternString = pattern.toStringUtf8(); re2jPattern = Pattern.compile(patternString, options); // Remove .*? prefix. DFA has optimization which does fast lookup for first byte of a potential match. // When pattern is prefixed with .*? this optimization doesn't work in Pattern.find() function. java.util.regex.Matcher dotStarPrefixMatcher = DOT_STAR_PREFIX_PATTERN.matcher(patternString); checkState(dotStarPrefixMatcher.matches()); String patternStringWithoutDotStartPrefix = dotStarPrefixMatcher.group(CORE_PATTERN_INDEX); if (!patternStringWithoutDotStartPrefix.equals(patternString)) { re2jPatternWithoutDotStartPrefix = Pattern.compile(patternStringWithoutDotStartPrefix, options); } else { re2jPatternWithoutDotStartPrefix = re2jPattern; } } public boolean matches(Slice source) { return re2jPatternWithoutDotStartPrefix.find(source); } public Slice replace(Slice source, Slice replacement) { Matcher matcher = re2jPattern.matcher(source); try { return matcher.replaceAll(replacement); } catch (IndexOutOfBoundsException | IllegalArgumentException e) { throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Illegal replacement sequence: " + replacement.toStringUtf8()); } } public Block extractAll(Slice source, long groupIndex) { Matcher matcher = re2jPattern.matcher(source); int group = toIntExact(groupIndex); validateGroup(group, matcher.groupCount()); BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(new BlockBuilderStatus(), 32); while (true) { if (!matcher.find()) { break; } Slice searchedGroup = matcher.group(group); if (searchedGroup == null) { blockBuilder.appendNull(); continue; } VARCHAR.writeSlice(blockBuilder, searchedGroup); } return blockBuilder.build(); } public Slice extract(Slice source, long groupIndex) { Matcher matcher = re2jPattern.matcher(source); int group = toIntExact(groupIndex); validateGroup(group, matcher.groupCount()); if (!matcher.find()) { return null; } return matcher.group(group); } public Block split(Slice source) { Matcher matcher = re2jPattern.matcher(source); BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(new BlockBuilderStatus(), 32); int lastEnd = 0; while (matcher.find()) { Slice slice = source.slice(lastEnd, matcher.start() - lastEnd); lastEnd = matcher.end(); VARCHAR.writeSlice(blockBuilder, slice); } VARCHAR.writeSlice(blockBuilder, source.slice(lastEnd, source.length() - lastEnd)); return blockBuilder.build(); } private static void validateGroup(int group, int groupCount) { if (group < 0) { throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Group cannot be negative"); } if (group > groupCount) { throw new PrestoException(INVALID_FUNCTION_ARGUMENT, format("Pattern has %d groups. Cannot access group %d", groupCount, group)); } } private class RE2JEventsListener implements Options.EventsListener { @Override public void fallbackToNFA() { log.debug("Fallback to NFA, pattern: %s, DFA states limit: %d, DFA retries: %d", re2jPattern.pattern(), dfaStatesLimit, dfaRetries); } } }