/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 *
 * You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributions from 2013-2017 where performed either by US government
 * employees, or under US Veterans Health Administration contracts.
 *
 * US Veterans Health Administration contributions by government employees
 * are work of the U.S. Government and are not subject to copyright
 * protection in the United States. Portions contributed by government
 * employees are USGovWork (17USC §105). Not subject to copyright.
 *
 * Contribution by contractors to the US Veterans Health Administration
 * during this period are contractually contributed under the
 * Apache License, Version 2.0.
 *
 * See: https://www.usa.gov/government-works
 *
 * Contributions prior to 2013:
 *
 * Copyright (C) International Health Terminology Standards Development Organisation.
 * Licensed under the Apache License, Version 2.0.
 *
 */

package sh.isaac.provider.query.search;

//~--- JDK imports ------------------------------------------------------------

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.regex.Pattern;

//~--- classes ----------------------------------------------------------------

/**
 * {@link SearchStringProcessor}.
 *
 * Static helpers that clean up a user-supplied search string before it is handed to the search
 * layer: stripping groups of characters (punctuation, symbols, operators, brackets, control
 * characters), backslash-escaping query syntax characters, and dropping common stop words.
 *
 * All regular expressions are compiled once when the class is loaded; the public
 * {@code String} constants are retained unchanged in name for existing callers.
 *
 * @author <a href="mailto:daniel.armbrust.list@gmail.com">Dan Armbrust</a>
 */
//TODO this class may not even need to exist, I think it was developed out of a misunderstanding of lucene.
//Need to reevaluate as part of the search rewrite.
public class SearchStringProcessor {

    /** Punctuation to strip: {@code ! " , 's ' : ; ? `}. {@code 's} is listed before {@code '} so it wins. */
    public static final String punctuationRegEx = "!|\"|,|\'s|\'|:|;|\\?|`";

    /** Symbols to strip: {@code & # $ % @ \ _ |}. */
    public static final String symbolsRegEx = "&|#|\\$|\\%|@|\\\\|_|\\|";

    /** Operator characters to strip: {@code + - * / < > = ^ ~}. */
    public static final String operatorsRegEx = "\\+|\\-|\\*|\\/|<|>|=|\\^|~";

    /** Bracket characters to strip: {@code ( ) { } [ ]}. */
    public static final String parensRegEx = "\\(|\\)|\\{|\\}|\\[|\\]";

    /**
     * Characters that {@link #escapeCharacters(String)} prefixes with a backslash:
     * {@code + - & | ! ( ) { } [ ] ^ " ~ * ? : /}.
     *
     * NOTE(review): the backslash itself is not part of this set, so backslashes already present
     * in the input are passed through unescaped - confirm this is intended.
     */
    public static final String escapedCharactersRegEx =
        "\\+|\\-|&|\\||!|\\(|\\)|\\{|\\}|\\[|\\]|\\^|\"|~|\\*|\\?|:|\\/";

    /**
     * Control characters U+0000 through U+001F plus the non-breaking space.
     *
     * In a Java regex {@code \xa0} is the single character U+00A0 (non-breaking space). The
     * older {@code \xc2\xa0} alternative only matches the two-character sequence U+00C2 U+00A0
     * (what the UTF-8 bytes of a non-breaking space look like when mis-decoded as Latin-1); it
     * is kept, and tried first, so that such input still collapses to a single space.
     */
    public static final String nonPrintableRegEx =
        "\\x00|\\x01|\\x02|\\x03|\\x04|\\x05|\\x06|\\x07|\\x08|\\x09|\\x0a|\\x0b|\\x0c|\\x0d|\\x0e|\\x0f|"
        + "\\x10|\\x11|\\x12|\\x13|\\x14|\\x15|\\x16|\\x17|\\x18|\\x19|\\x1a|\\x1b|\\x1c|\\x1d|\\x1e|\\x1f|"
        + "\\xc2\\xa0|\\xa0";

    /**
     * Lower-case words removed by {@link #removeStopWords(String)}. This is a public, mutable
     * list; {@code removeStopWords} consults its current contents on every call.
     */
    public static final ArrayList<String> stopWords = new ArrayList<String>(
        Arrays.asList("a", "an", "and", "by", "for", "in", "not", "of", "on", "or", "the", "to", "with"));

    //~--- pre-compiled patterns ----------------------------------------------

    // String.replaceAll recompiles its regex on every call; compile each expression once instead.
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile(punctuationRegEx);
    private static final Pattern SYMBOLS_PATTERN = Pattern.compile(symbolsRegEx);
    private static final Pattern OPERATORS_PATTERN = Pattern.compile(operatorsRegEx);
    private static final Pattern PARENS_PATTERN = Pattern.compile(parensRegEx);
    private static final Pattern ESCAPED_PATTERN = Pattern.compile(escapedCharactersRegEx);
    private static final Pattern NON_PRINTABLE_PATTERN = Pattern.compile(nonPrintableRegEx);
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /** Union of every "strip" category, in the order used by {@link #stripAll(String)}. */
    private static final Pattern ALL_PATTERN = Pattern.compile(
        punctuationRegEx + "|" + symbolsRegEx + "|" + operatorsRegEx + "|" + parensRegEx + "|" + nonPrintableRegEx);

    //~--- methods -------------------------------------------------------------

    /**
     * Prefixes every character matched by {@link #escapedCharactersRegEx} with a backslash.
     *
     * @param s the string to escape; must not be {@code null}
     * @return the escaped string
     */
    public static String escapeCharacters(String s) {
        // In a replacement string "\\\\" is one literal backslash and "$0" is the matched text.
        return ESCAPED_PATTERN.matcher(s).replaceAll("\\\\$0");
    }

    /**
     * Prepares a raw search string: control characters and non-breaking spaces are replaced by
     * spaces first, then the query syntax characters are backslash-escaped.
     *
     * @param s the raw search string; must not be {@code null}
     * @return the processed string
     */
    public static String prepareSearchString(String s) {
        return escapeCharacters(stripNonPrintable(s));
    }

    /**
     * Lower-cases the input, splits it on whitespace and drops every word contained in
     * {@link #stopWords}. The remaining words are joined with single spaces.
     *
     * @param s the string to filter; must not be {@code null}
     * @return the lower-cased string without stop words, trimmed
     */
    public static String removeStopWords(String s) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale; with a
        // Turkish default, "IN" would otherwise become "ın" and no longer match "in".
        final String[] words = WHITESPACE_PATTERN.split(s.trim().toLowerCase(Locale.ROOT));
        final StringBuilder kept = new StringBuilder();

        for (final String word : words) {
            final String candidate = word.trim();

            if (!stopWords.contains(candidate)) {
                kept.append(candidate).append(' ');
            }
        }

        return kept.toString().trim();
    }

    /**
     * Replaces every punctuation, symbol, operator, bracket and non-printable character with a
     * space. The result is not trimmed.
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned string
     */
    public static String stripAll(String s) {
        return ALL_PATTERN.matcher(s).replaceAll(" ");
    }

    /**
     * Replaces each match of {@link #nonPrintableRegEx} with a space.
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned string
     */
    public static String stripNonPrintable(String s) {
        return NON_PRINTABLE_PATTERN.matcher(s).replaceAll(" ");
    }

    /**
     * Replaces each match of {@link #operatorsRegEx} with a space.
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned string
     */
    public static String stripOperators(String s) {
        return OPERATORS_PATTERN.matcher(s).replaceAll(" ");
    }

    /**
     * Replaces each match of {@link #parensRegEx} with a space.
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned string
     */
    public static String stripParens(String s) {
        return PARENS_PATTERN.matcher(s).replaceAll(" ");
    }

    /**
     * Replaces each match of {@link #punctuationRegEx} with a space and trims the result
     * (the only strip method that trims).
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned, trimmed string
     */
    public static String stripPunctuation(String s) {
        return PUNCTUATION_PATTERN.matcher(s).replaceAll(" ").trim();
    }

    /**
     * Replaces each match of {@link #symbolsRegEx} with a space.
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned string
     */
    public static String stripSymbols(String s) {
        return SYMBOLS_PATTERN.matcher(s).replaceAll(" ");
    }
}