/* * $Id$ * * Copyright (C) 2003-2015 JNode.org * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public * License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library; If not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ package org.jnode.shell; import java.io.File; import java.io.FilenameFilter; import java.lang.ref.WeakReference; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Loosely modeled on java.util.regex.Pattern, this class provides a simple * mechanism for expanding UNIX-style pathname patterns into a list of pathnames * for filesystem objects. * * Depending on the flags supplied when a pattern is compiled, the following * pattern constructs are available: * <ul> * <li>A star ("*") matches zero or more characters. * <li>A question mark ("?") matches exactly one character. * <li>A matching pair of square brackets ("[]") denote a character class. The * character class "[abz]" matches one of "a", "b" or "z". Ranges are allowed, * so that "[0-9A-F]" matches a hexadecimal digit. If the first character of a * character class is "!" or "^", the character class is negated; i.e. * "[^a-zA-Z]" matches any character that is not an ASCII letter. * <li>A single quote ("'") causes characters up to the next "'" to be treated * as literal characters. * <li>A backslash ("\") causes the next character (even a single quote) to be * treated as a literal character; i.e. any special meaning. * </ul> * <p> * Patterns are first split into file components on "/" boundaries, then the * sub-patterns are used to match names in a given directory. Neither quoting or * escaping affect "/" interpretation, and a "/" in a character class causes it * to be treated as literal characters. * <p> * The pattern expander treats "dot" files (i.e. files starting with ".") as * hidden. A hidden file is only matched when the pattern has an explicit "." as * the first character of a component. Thus the pattern "*" does not match "." * or "..", but the pattern ".*" does. * <p> * This class also exposes a static method for compiling patterns in the UNIX * shell-style syntax to Java {@link Pattern} objects. The resulting * objects allow you to use the shell-style syntax for matching arbitrary * strings. The pathname-specific matching behaviors of PathnamePattern * such as implicit anchoring, and the handling of '/' in character classes * are supported via flags. * <p> * TODO: * <ul> * <li>Provide a method that returns a "lazy" pathname iterator for cases where * we don't want to build a (potentially huge) in-memory list of pathnames. * <li>Support expansions of ~ and {..,..} patterns. (Note that the latter are * not part of the POSIX specification.) * <li>Add a parameter (or parameters) to allow the caller to limit the size of * the result list. * </ul> * * @author crawley@jnode */ public class PathnamePattern { /** * When set, this flag causes the pathname list returned by 'expand' to be * lexically sorted. */ public static final int SORT_MATCHES = 0x01; /** * When set, this flag enables UNIX like handling of hidden files. File and * directories whose name starts with a "." are only matched if the first * character in the pattern is a ".". */ public static final int HIDE_DOT_FILENAMES = 0x02; /** * When set, this flag causes the '.' and '..' directories to be included in * domain of objects to be matched. (You probably don't want to set this * flag without setting HIDE_DOT_FILENAMES as well. Under normal * circumstances a user doesn't expect '.' and '..' to be returned in a * pattern match.) */ public static final int INCLUDE_DOT_AND_DOTDOT = 0x04; /** * When set, this flag causes a '\' in a pattern to escape the next * character. For example, the sequence "\*" in a pattern will match a "*" * character in a filename. */ public static final int BACKSLASH_ESCAPES = 0x08; /** * When set, this flag causes characters inside matching single-quote * characters to be match literal characters in the pathname. Only a '\' is * unaffected. Thus "'a*c'" will match the file "a*c", but "'a\'c'" will * match "a'c"; i.e. a filename containing a single-quote character. */ public static final int SINGLE_QUOTE_ESCAPES = 0x10; /** * When set, this flag causes characters inside matching double-quote * characters to be match literal characters in the pathname. Only a '\' is * unaffected. Thus ""a*c"" will match the file "a*c", but ""a\"c"" will * match "a"c"; i.e. a filename containing a double-quote character. */ public static final int DOUBLE_QUOTE_ESCAPES = 0x20; /** * When set, this flag causes the [...] character class syntax to be * recognized. */ public static final int CHARACTER_CLASSES = 0x40; /** * When set, the pattern is anchored to the left of the string to be searched. * This is set implicitly by the pathname matching methods. */ public static final int ANCHOR_LEFT = 0x80; /** * When set, the pattern is anchored to the right of the string to be searched. * This is set implicitly by the pathname matching methods. */ public static final int ANCHOR_RIGHT = 0x100; /** * When set, '*' is eager, matching as many characters as possible. * This is set implicitly by the pathname matching methods. * matching is always eager. */ public static final int EAGER = 0x200; /** * When set, an unescaped '/' inside a character class causes the entire class * to be interpreted as a literal character sequence. * This is set implicitly by the pathname matching methods. */ public static final int SLASH_DISABLES_CHARACTER_CLASSES = 0x400; public static final int DEFAULT_FLAGS = SORT_MATCHES | HIDE_DOT_FILENAMES | INCLUDE_DOT_AND_DOTDOT | BACKSLASH_ESCAPES | SINGLE_QUOTE_ESCAPES | DOUBLE_QUOTE_ESCAPES | CHARACTER_CLASSES; private static final boolean DEBUG = false; private final String source; private ArrayList<Object> patterns; private boolean isAbsolute; private char lastQuote; // Use a weak reference for the pattern cache to avoid storage leakage. private static WeakReference<HashMap<String, PathnamePattern>> cache; private PathnamePattern(String source) { this.source = source; this.patterns = new ArrayList<Object>(); } /** * Expand a pattern, returning the pathnames of the file system objects that * it matches. * * @param current this is the notional current directory for expanding a * relative pattern. * @return the lest of matching pathnames. The names will be absolute if the * original pattern was absolute, and relative if not. */ public LinkedList<String> expand(File current) { return doGlob(isAbsolute ? new File(File.separator) : current, 0, DEFAULT_FLAGS); } /** * Expand a pattern, returning the pathnames of the file system objects that * it matches. * * @param current this is the notional current directory for expanding a * relative pattern. * @param flags these flags control the behavior of the expander. * @return the lest of matching pathnames. The names will be absolute if the * original pattern was absolute, and relative if not. */ public LinkedList<String> expand(File current, int flags) { return doGlob(isAbsolute ? new File(File.separator) : current, 0, flags); } /** * This method recursively visits each element of the compiled pattern, * building a list of the pathname strings for FS objects that match it. * * @param current the current file context for expansion. * @param pos our index into the 'pattern' array. * @return the list of partial pathnames matched in the context of * 'current'. */ private LinkedList<String> doGlob(File current, int pos, int flags) { LinkedList<File> matches = new LinkedList<File>(); LinkedList<String> res = new LinkedList<String>(); if (patterns.get(pos) instanceof String) { File file = new File(current, (String) patterns.get(pos)); if (file.exists()) { matches.add(file); } } else { final Pattern pat = (Pattern) patterns.get(pos); final Matcher mat = pat.matcher(""); final FilenameFilter filter = new FilenameFilter() { public boolean accept(File dir, String name) { return mat.reset(name).matches(); } }; // A directory's "." and ".." entries are not returned by // File.listFiles so we have to match / add them explicitly. if ((flags & INCLUDE_DOT_AND_DOTDOT) != 0) { if (filter.accept(current, ".")) { matches.add(new File(current, ".")); } if (filter.accept(current, "..")) { matches.add(new File(current, "..")); } } // Process the 'regular' directory contents for (File file : current.listFiles(filter)) { matches.add(file); } if ((flags & SORT_MATCHES) == SORT_MATCHES) { Collections.sort(matches); } } for (File match : matches) { String name = match.getName(); if (pos == 0 && isAbsolute) { name = File.separator + name; } if (pos == patterns.size() - 1) { res.add(name); } else if (match.isDirectory()) { LinkedList<String> subList = doGlob(match, pos + 1, flags); for (String sub : subList) { res.add(name + File.separator + sub); } } } return res; } /** * Create and compile a pathname pattern using the default flags. * * @param source the pattern source * @return a compiler pattern for the source. */ public static PathnamePattern compilePathPattern(String source) { return compilePathPattern(source, DEFAULT_FLAGS); } /** * Create and compile a pathname pattern. The flags determine which pattern * meta-characters are recognized by the compiled pattern. If a pattern * meta-character is not recognized, it will be treated as a literal * character. * * @param source the pattern source * @param flags pattern compilation flags * @return a compiler pattern for the source. */ public static PathnamePattern compilePathPattern(String source, int flags) { String key = flags + ":" + source; synchronized (PathnamePattern.class) { HashMap<String, PathnamePattern> cp; if (cache != null && (cp = cache.get()) != null) { PathnamePattern pat = cp.get(key); if (pat != null) { return pat; } } } PathnamePattern pp = new PathnamePattern(source); String[] parts = source.split(File.separator + "+", -1); for (int i = 0; i < parts.length; i++) { String part = parts[i]; Object pat = (isPattern(part, flags)) ? compilePosixShellPattern(part, flags | ANCHOR_LEFT | ANCHOR_RIGHT | EAGER | SLASH_DISABLES_CHARACTER_CLASSES, pp) : part; if (pat == null || pat.toString().length() == 0) { if (i == 0) { pp.isAbsolute = true; } } else { pp.patterns.add(pat); } if (DEBUG) { System.err.println(i + ": " + pat); } } if (pp.lastQuote != 0) { throw new IllegalArgumentException("Unbalanced quotes in pattern"); } synchronized (PathnamePattern.class) { HashMap<String, PathnamePattern> cp = null; if (cache == null || (cp = cache.get()) == null) { cp = new HashMap<String, PathnamePattern>(); cache = new WeakReference<HashMap<String, PathnamePattern>>(cp); } cp.put(key, pp); } return pp; } /** * Clear the pattern cache */ public static void clearCache() { synchronized (PathnamePattern.class) { cache = null; } } /** * Provide a fast determination if a string requires pattern expansion, * assuming the default pattern flags. * * @param str the string to be examined * @return <code>true</code> if the string is potentially a pattern; i.e. * if it contains '*', '?' or '[' characters. */ public static boolean isPattern(String str) { return isPattern(str, DEFAULT_FLAGS); } /** * Provide a fast determination if a string requires pattern expansion. * * @param str the string to be examined * @param flags pattern compilation flags * @return <code>true</code> if the string is potentially a pattern; i.e. * if it contains meta-characters enabled in the compilation flags. */ public static boolean isPattern(String str, int flags) { int len = str.length(); for (int i = 0; i < len; i++) { switch (str.charAt(i)) { case '*': case '?': return true; case '[': if ((flags & CHARACTER_CLASSES) != 0) { return true; } break; case '\\': if ((flags & BACKSLASH_ESCAPES) != 0) { return true; } break; case '\'': if ((flags & SINGLE_QUOTE_ESCAPES) != 0) { return true; } break; case '\"': if ((flags & DOUBLE_QUOTE_ESCAPES) != 0) { return true; } break; default: } } return false; } /** * Turn a string in POSIX shell pattern syntax into a regex. This method * generates a {@link Pattern} that can be matched against a character sequence. * * @param pattern the pattern in shell syntax. * @param flags compilation flags * @return the corresponding regex as a {@link Pattern}. */ public static Pattern compilePosixShellPattern(CharSequence pattern, int flags) { return compilePosixShellPattern(pattern, flags, null); } /** * @param pattern the pattern in shell syntax. * @param flags compilation flags * @param pp if not {@code null}, * @return the corresponding regex as a {@link Pattern}. */ private static Pattern compilePosixShellPattern( CharSequence pattern, int flags, PathnamePattern pp) { // This method needs to be really careful to avoid 'ordinary' characters // in the source pattern being accidentally mapped to Java regex // meta-characters. int len = pattern.length(); StringBuilder sb = new StringBuilder(len); char quote = (pp == null) ? ((char) 0) : pp.lastQuote; boolean eager = (flags & EAGER) != 0; for (int i = 0; i < len; i++) { char ch = pattern.charAt(i); switch (ch) { case '?': if (quote != 0) { sb.append(protect(ch)); } else if (i == 0 && (flags & HIDE_DOT_FILENAMES) != 0) { sb.append("[^\\.]"); } else { sb.append("."); } break; case '*': if (quote != 0) { sb.append(protect(ch)); } else if (i == 0 && (flags & HIDE_DOT_FILENAMES) != 0) { sb.append("(|[^\\.]").append(eager ? ".*" : ".*?").append(")"); } else { sb.append(eager ? ".*" : ".*?"); } break; case '[': if ((flags & CHARACTER_CLASSES) != 0) { int j; StringBuilder sb2 = new StringBuilder(len); boolean charClassOK = true; LOOP: for (j = i + 1; j < len; j++) { char ch2 = pattern.charAt(j); switch (ch2) { case ']': break LOOP; case '\\': sb2.append(protect(pattern.charAt(++j))); break; case '!': case '^': sb2.append((j == i + 1) ? "^" : protect(ch2)); break; case '-': sb2.append('-'); break; case '/': sb2.append(protect(ch2)); charClassOK = ((flags & SLASH_DISABLES_CHARACTER_CLASSES) == 0); break; default: sb2.append(protect(ch2)); } } if (j == len) { sb.append(protect('[')); } else if (!charClassOK) { sb.append(protect('[')).append(sb2).append(protect(']')); i = j; } else { sb.append("[").append(sb2).append(']'); i = j; } } else { sb.append(protect(ch)); } break; case '\\': if ((flags & BACKSLASH_ESCAPES) != 0) { sb.append(protect(pattern.charAt(++i))); } else { sb.append(protect(ch)); } break; case '\'': if ((flags & SINGLE_QUOTE_ESCAPES) != 0) { if (quote == '\'') { quote = 0; } else if (quote == 0) { quote = '\''; } else { sb.append(protect(ch)); } } else { sb.append(protect(ch)); } break; case '\"': if ((flags & DOUBLE_QUOTE_ESCAPES) != 0) { if (quote == '\"') { quote = 0; } else if (quote == 0) { quote = '\"'; } else { sb.append(protect(ch)); } } else { sb.append(protect(ch)); } break; default: sb.append(protect(ch)); } } if (pp != null) { pp.lastQuote = quote; } if (sb.length() == 0) { return null; } if ((flags & ANCHOR_LEFT) != 0) { sb.insert(0, '^'); } if ((flags & ANCHOR_RIGHT) != 0) { sb.append('$'); } return Pattern.compile(sb.toString()); } private static String protect(char ch) { switch (ch) { case '.': case '|': case '[': case ']': case '(': case ')': case '+': case '*': case '?': case '$': case '{': case '}': case '^': case '\\': return "\\" + ch; default: return Character.toString(ch); } } public String toString() { return source; } public String toRegexString() { StringBuilder sb = new StringBuilder(); sb.append("PathnamePattern{source='").append(this.source); sb.append("',absolute=").append(this.isAbsolute); sb.append(",patterns=["); int len = this.patterns.size(); for (int i = 0; i < len; i++) { if (i > 0) { sb.append(","); } sb.append('\'').append(patterns.get(i)).append('\''); } sb.append("]}"); return sb.toString(); } }