/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.streams.regex; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Provides utilities for extracting matches from content. */ public class RegexUtils { private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<>(); private static final Logger LOGGER = LoggerFactory.getLogger(RegexUtils.class); private RegexUtils() {} /** * Extracts matches of the given pattern in the content and returns them as a list. * @param pattern the pattern for the substring to match. For example, [0-9]* matches 911 in Emergency number is 911. * @param content the complete content to find matches in. * @return a non-null list of matches. */ public static Map<String, List<Integer>> extractMatches(String pattern, String content) { return getMatches(pattern, content, -1); } /** * Extracts matches of the given pattern that are bounded by separation characters and returns them as a list. * @param pattern the pattern for the substring to match. For example, [0-9]* matches 911 in Emergency number is 911. * @param content the complete content to find matches in. * @return a non-null list of matches. */ public static Map<String, List<Integer>> extractWordMatches(String pattern, String content) { pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,?]|$)"; return getMatches(pattern, content, 2); } protected static Map<String, List<Integer>> getMatches(String pattern, String content, int capture) { try { Map<String, List<Integer>> matches = new HashMap<>(); if (content == null) { return matches; } Matcher matcher = getPattern(pattern).matcher(content); while (matcher.find()) { String group = capture > 0 ? matcher.group(capture) : matcher.group(); if (group != null && !group.equals("")) { List<Integer> indices; if (matches.containsKey(group)) { indices = matches.get(group); } else { indices = new ArrayList<>(); matches.put(group, indices); } indices.add(matcher.start()); } } return matches; } catch (Throwable ex) { LOGGER.error("Throwable process {}", ex); ex.printStackTrace(); throw new RuntimeException(ex); } } private static Pattern getPattern(String patternString) { Pattern pattern; if (patternCache.containsKey(patternString)) { pattern = patternCache.get(patternString); } else { pattern = Pattern.compile(patternString); patternCache.put(patternString, pattern); } return pattern; } }