/*
 * Copyright (c) 2013-2017 Cinchapi Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.concourse.util;

import java.util.List;

import org.apache.commons.lang.StringUtils;

import com.cinchapi.concourse.server.GlobalState;
import com.cinchapi.concourse.util.StringSplitter;
import com.google.common.collect.Lists;

/**
 * String based utility functions that depend on proprietary information that is
 * specific to Concourse (i.e. the stopwords defined for a Concourse Server
 * deployment).
 *
 * The {@link Strings} class contains a collection of truly generic String
 * functions.
 *
 * @author Jeff Nelson
 */
public final class TStrings {

    /**
     * <p>
     * Unlike {@link #isInfixSearchMatch(String, String)} this method assumes
     * that the two parameters are tokens of strings that have had stop words
     * removed using the {@link #stripStopWordsAndTokenize(String)} method.
     * </p>
     * Return {@code true} if {@code haystack} is an <strong>infix
     * search match</strong> for {@code needle}. If {@code haystack} is an infix
     * search match, it means that it contains a contiguous sequence of terms
     * where each term or a substring of the term matches the term in the same
     * relative position in {@code needle}.
     * <p>
     * <ul>
     * <li><em>foo bar</em> (haystack) <strong>IS</strong> a match for
     * <em>foo bar</em> (needle)</li>
     * <li><em>foo bar</em> (haystack) <strong>IS</strong> a match for
     * <em>f bar</em> (needle)</li>
     * <li><em>foo bar</em> (haystack) <strong>IS</strong> a match for
     * <em>oo a</em> (needle)</li>
     * <li><em>f b</em> (haystack) <strong>is NOT</strong> a match for
     * <em>f bar</em> (needle), because <em>bar</em> is not contained in
     * <em>b</em></li>
     * <li><em>barfoobar foobarfoo</em> (haystack) <strong>IS</strong> a match
     * for <em>f bar</em> (needle)</li>
     * </ul>
     * </p>
     * <p>
     * An empty {@code needle} matches every {@code haystack}. Every feasible
     * starting position in the {@code haystack} is tried, so a match is found
     * even when it begins inside a run of tokens that partially matched an
     * earlier attempt (e.g. needle <em>a a b</em> in haystack
     * <em>a a a b</em>).
     * </p>
     *
     * @param needle the tokens of the search query, in order
     * @param haystack the tokens of the text being searched, in order
     * @return {@code true} if {@code haystack} is an infix search match for
     *         {@code needle}.
     */
    public static boolean isInfixSearchMatch(String[] needle, String[] haystack) {
        // The last haystack index from which a full run of needle tokens can
        // still fit. If this is negative the needle has more tokens than the
        // haystack and no match is possible.
        int lastStart = haystack.length - needle.length;
        for (int start = 0; start <= lastStart; ++start) {
            if(isInfixSearchMatchAt(needle, haystack, start)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Return {@code true} if {@code haystack} is an <strong>infix
     * search match</strong> for {@code needle}. If {@code haystack} is an infix
     * search match, it means that it contains a contiguous sequence of terms
     * where each term or a substring of the term matches the term in the same
     * relative position in {@code needle}.
     * <p>
     * Both parameters are converted to lowercase, stripped of stopwords and
     * tokenized using {@link #stripStopWordsAndTokenize(String)} before being
     * compared with {@link #isInfixSearchMatch(String[], String[])}.
     * </p>
     * <p>
     * <ul>
     * <li><em>foo bar</em> (haystack) <strong>IS</strong> a match for
     * <em>foo bar</em> (needle)</li>
     * <li><em>foo bar</em> (haystack) <strong>IS</strong> a match for
     * <em>f bar</em> (needle)</li>
     * <li><em>foo bar</em> (haystack) <strong>IS</strong> a match for
     * <em>oo a</em> (needle)</li>
     * <li><em>f b</em> (haystack) <strong>is NOT</strong> a match for
     * <em>f bar</em> (needle), because <em>bar</em> is not contained in
     * <em>b</em></li>
     * <li><em>barfoobar foobarfoo</em> (haystack) <strong>IS</strong> a match
     * for <em>f bar</em> (needle)</li>
     * </ul>
     * </p>
     *
     * @param needle the search query
     * @param haystack the text being searched
     * @return {@code true} if {@code haystack} is an infix search match for
     *         {@code needle}.
     */
    public static boolean isInfixSearchMatch(String needle, String haystack) {
        String[] ntoks = stripStopWordsAndTokenize(needle.toLowerCase());
        String[] htoks = stripStopWordsAndTokenize(haystack.toLowerCase());
        return isInfixSearchMatch(ntoks, htoks);
    }

    /**
     * Return a copy of {@code string} with all of the stopwords removed. This
     * method depends on the stopwords defined in {@link GlobalState#STOPWORDS}.
     * <p>
     * The {@code string} is split on runs of whitespace, and the remaining
     * terms are joined with a single space. The case of the {@code string} is
     * not changed before terms are compared to the stopwords.
     * </p>
     *
     * @param string the text from which stopwords should be removed
     * @return A copy of {@code string} without stopwords
     */
    public static String stripStopWords(String string) {
        String[] toks = string
                .split(REGEX_GROUP_OF_ONE_OR_MORE_WHITESPACE_CHARS);
        StringBuilder sb = new StringBuilder();
        for (String tok : toks) {
            if(!GlobalState.STOPWORDS.contains(tok)) {
                sb.append(tok);
                sb.append(" ");
            }
        }
        // Drop the separator that was appended after the last term
        return sb.toString().trim();
    }

    /**
     * Tokenize the {@code string} and return an array of tokens where all the
     * stopwords are removed. Blank tokens are also discarded.
     *
     * @param string the text to tokenize
     * @return the tokens without stopwords
     */
    public static String[] stripStopWordsAndTokenize(String string) {
        List<String> toks = Lists.newArrayList();
        // NOTE(review): presumably splits on the single space character only
        // (unlike #stripStopWords, which splits on any whitespace) -- confirm
        // against StringSplitter.
        StringSplitter it = new StringSplitter(string, ' ');
        while (it.hasNext()) {
            String next = it.next();
            if(!StringUtils.isBlank(next)
                    && !GlobalState.STOPWORDS.contains(next)) {
                toks.add(next);
            }
        }
        return toks.toArray(new String[toks.size()]);
    }

    /**
     * Return {@code true} if the {@code needle} tokens match the run of
     * {@code haystack} tokens that begins at index {@code start}.
     *
     * @param needle the tokens of the search query
     * @param haystack the tokens of the text being searched
     * @param start the index of the first {@code haystack} token to compare;
     *            the caller must ensure that
     *            {@code start + needle.length <= haystack.length}
     * @return {@code true} if every needle token is a substring of the
     *         haystack token in the same relative position
     */
    private static boolean isInfixSearchMatchAt(String[] needle,
            String[] haystack, int start) {
        for (int i = 0; i < needle.length; ++i) {
            if(!Strings.isSubString(needle[i], haystack[start + i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Match a group of one or more whitespace characters including space, tab
     * and newline. This is typically used to split a string into distinct
     * terms.
     */
    public static final String REGEX_GROUP_OF_ONE_OR_MORE_WHITESPACE_CHARS = "\\s+";

    /**
     * {@code REGEX_PERCENT_SIGN_WITH_ESCAPE_CHAR} Matches the percent sign
     * with escape character[\%].
     */
    public static final String REGEX_PERCENT_SIGN_WITH_ESCAPE_CHAR = "\\\\%";

    /**
     * {@code REGEX_PERCENT_SIGN_WITHOUT_ESCAPE_CHAR} Matches the percent sign
     * without escape character[%].
     */
    public static final String REGEX_PERCENT_SIGN_WITHOUT_ESCAPE_CHAR = "(?<!\\\\)%";

    /**
     * Match a single whitespace character.
     */
    protected static final String REGEX_SINGLE_WHITESPACE = "[\\s]";

    /**
     * Match a group of zero or more non-whitespace characters.
     */
    protected static final String REGEX_ZERO_OR_MORE_NON_WHITESPACE_CHARS = "[^\\s]*";

    private TStrings() {/* utility class */}

}