/* * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the * Boyer-Moore algorithm * :tabSize=4:indentSize=4:noTabs=false: * :folding=explicit:collapseFolds=1: * * Copyright (C) 1999, 2000 mike dillon * Portions copyright (C) 2001 Tom Locke * Portions copyright (C) 2001, 2002 Slava Pestov * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package org.gjt.sp.jedit.search; /** * Implements literal search using the Boyer-Moore algorithm. * @version $Id: BoyerMooreSearchMatcher.java 23332 2013-11-13 14:24:31Z kpouer $ */ public class BoyerMooreSearchMatcher extends SearchMatcher { //{{{ BoyerMooreSearchMatcher constructors /** * Creates a new string literal matcher. * @param pattern the search pattern * @param ignoreCase <code>true</code> if you want to ignore case */ public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase) { this(pattern, ignoreCase, false); } /** * Creates a new string literal matcher. * @param pattern the search pattern * @param ignoreCase <code>true</code> if you want to ignore case * @param wholeWord <code>true</code> to search for whole word only * @since 4.5pre1 */ public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase, boolean wholeWord) { this.pattern = pattern.toCharArray(); if(ignoreCase) { for(int i = 0; i < this.pattern.length; i++) { this.pattern[i] = Character.toUpperCase( this.pattern[i]); } } this.ignoreCase = ignoreCase; pattern_end = this.pattern.length - 1; this.wholeWord = wholeWord; } //}}} //{{{ nextMatch() method @Override public SearchMatcher.Match nextMatch(CharSequence text, boolean start, boolean end, boolean firstTime, boolean reverse) throws InterruptedException { int pos = match(text,reverse); if (pos == -1) { return null; } else { returnValue.start = pos; returnValue.end = pos + pattern.length; int _end = returnValue.end; if (wholeWord) { CharSequence subText; while (!isWholeWord(text, returnValue.start, returnValue.end)) { subText = text.subSequence(returnValue.end, text.length()); Match match = nextMatch(subText, start, end, firstTime, reverse); // match == returnValue or null if (match == null) return null; match.start += _end; _end += match.end; match.end = match.start + pattern.length; } } return returnValue; } } //}}} //{{{ match() method /** * a good introduction to the Boyer-Moore fast string matching * algorithm may be found on Moore's website at: * * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/ * @throws InterruptedException * * @since jEdit 4.3pre5 */ public int match(CharSequence text, boolean reverse) throws InterruptedException { if(Thread.interrupted()) throw new InterruptedException(); //{{{ // lazily create skip and suffix arrays for either the // search pattern, or the reversed search pattern int[] skip, suffix; if(reverse) { if(back_skip == null) { back_skip = generateSkipArray(true); back_suffix = generateSuffixArray(true); } skip = back_skip; suffix = back_suffix; } else { if(fwd_skip == null) { fwd_skip = generateSkipArray(false); fwd_suffix = generateSuffixArray(false); } skip = fwd_skip; suffix = fwd_suffix; } //}}} // position variable for pattern test position int pos; // position variable for pattern start int anchor = 0; // last possible start position of a match with this pattern; // this is negative if the pattern is longer than the text // causing the search loop below to immediately fail //int last_anchor = reverseSearch // ? offset + pattern.length - 1 // : length - pattern.length; char ch = 0; int bad_char; int good_suffix; // the search works by starting the anchor (first character // of the pattern) at the initial offset. as long as the // anchor is far enough from the enough of the text for the // pattern to match, and until the pattern matches, we // compare the pattern to the text from the last character // to the first character in reverse order. where a character // in the pattern mismatches, we use the two heuristics // based on the mismatch character and its position in the // pattern to determine the furthest we can move the anchor // without missing any potential pattern matches. SEARCH: while (anchor + pattern_end < text.length()) { if(Thread.interrupted()) throw new InterruptedException(); for (pos = pattern_end; pos >= 0; --pos) { ch = text.charAt(pos + anchor); if(ignoreCase) ch = Character.toUpperCase(ch); // pattern test if ((reverse ? ch != pattern[pattern_end - pos] : ch != pattern[pos])) { // character mismatch, determine how many characters to skip // heuristic #1 bad_char = pos - skip[getSkipIndex(ch)]; // heuristic #2 good_suffix = suffix[pos]; // skip the greater of the two distances provided by the // heuristics int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix; anchor += skip_index; // go back to the while loop continue SEARCH; } } // MATCH: return the position of its first character return anchor; } // MISMATCH: return -1 as defined by API return -1; } //}}} //{{{ toString() method public String toString() { return "BoyerMooreSearchMatcher[" + new String(pattern) + ',' + ignoreCase + ']'; } //}}} //{{{ Private members private char[] pattern; private int pattern_end; private boolean ignoreCase; // Boyer-Moore member fields private int[] fwd_skip; private int[] fwd_suffix; private int[] back_skip; private int[] back_suffix; // Boyer-Moore helper methods //{{{ generateSkipArray() method /* * the 'skip' array is used to determine for each index in the * hashed alphabet how many characters can be skipped if * a mismatch occurs on a characater hashing to that index. */ private int[] generateSkipArray(boolean reverse) { // initialize the skip array to all zeros int[] skip = new int[256]; // leave the table cleanly-initialized for an empty pattern if (pattern.length == 0) return skip; int pos = 0; do { skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos; } while (++pos < pattern.length); return skip; } //}}} //{{{ getSkipIndex() method /* * to avoid our skip table having a length of 2 ^ 16, we hash each * character of the input into a character in the alphabet [\x00-\xFF] * using the lower 8 bits of the character's value (resulting in * a more reasonable skip table of length 2 ^ 8). * * the result of this is that more than one character can hash to the * same index, but since the skip table encodes the position of * occurence of the character furthest into the string with a particular * index (whether or not it is the only character with that index), an * index collision only means that that this heuristic will give a * sub-optimal skip (i.e. a complete skip table could use the differences * between colliding characters to maximal effect, at the expense of * building a table that is over 2 orders of magnitude larger and very * sparse). */ private static int getSkipIndex(char ch) { return ch & 0x000000FF; } //}}} //{{{ generateSuffixArray() method /* * XXX: hairy code that is basically just a functional(?) port of some * other code i barely understood */ private int[] generateSuffixArray(boolean reverse) { int m = pattern.length; int j = m + 1; int[] suffix = new int[j]; int[] tmp = new int[j]; tmp[m] = j; for (int i = m; i > 0; --i) { while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1] != pattern[reverse ? pattern_end - j + 1 : j - 1]) { if (suffix[j] == 0) { suffix[j] = j - i; } j = tmp[j]; } tmp[i - 1] = --j; } int k = tmp[0]; for (j = 0; j <= m; j++) { // the code above builds a 1-indexed suffix array, // but we shift it to be 0-indexed, ignoring the // original 0-th element if (j > 0) { suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j]; } if (j == k) { k = tmp[k]; } } return suffix; } //}}} //}}} }