//
// FilePattern.java
//

/*
LOCI Bio-Formats package for reading and converting biological file formats.
Copyright (C) 2005-@year@ Melissa Linkert, Curtis Rueden, Chris Allan,
Eric Kjellman and Brian Loranger.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Library General Public License for more details.

You should have received a copy of the GNU Library General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

package loci.formats;

import java.io.File;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Vector;

/**
 * FilePattern is a collection of methods for handling file patterns, a way of
 * succinctly representing a collection of files meant to be part of the same
 * data series.
 *
 * Examples:
 * <ul>
 *   <li>C:\data\BillM\sdub&lt;1-12&gt;.pic</li>
 *   <li>C:\data\Kevin\80&lt;01-59&gt;0&lt;2-3&gt;.pic</li>
 *   <li>/data/Josiah/cell-Z&lt;0-39&gt;.C&lt;0-1&gt;.tiff</li>
 * </ul>
 *
 * <dl><dt><b>Source code:</b></dt>
 * <dd><a href="https://skyking.microscopy.wisc.edu/trac/java/browser/trunk/loci/formats/FilePattern.java">Trac</a>,
 * <a href="https://skyking.microscopy.wisc.edu/svn/java/trunk/loci/formats/FilePattern.java">SVN</a></dd></dl>
 *
 * @author Curtis Rueden ctrueden at wisc.edu
 */
public class FilePattern {

  // -- Fields --

  /** The file pattern string. */
  private String pattern;

  /** The validity of the file pattern. */
  private boolean valid;

  /** Error message generated during file pattern construction. */
  private String msg;

  /**
   * Indices into the pattern indicating the start of a numerical block.
   * Remains null unless every block marker was located successfully.
   */
  private int[] startIndex;

  /**
   * Indices into the pattern indicating the end of a numerical block
   * (one past the closing marker). Remains null unless every block marker
   * was located successfully.
   */
  private int[] endIndex;

  /** First number of each numerical block. */
  private BigInteger[] begin;

  /** Last number of each numerical block. */
  private BigInteger[] end;

  /** Step size of each numerical block. */
  private BigInteger[] step;

  /** Total numbers within each numerical block. */
  private int[] count;

  /** Whether each numerical block is fixed width. */
  private boolean[] fixed;

  /** The number of leading zeroes for each numerical block. */
  private int[] zeroes;

  /** File listing for this file pattern. */
  private String[] files;

  // -- Constructors --

  /** Creates a pattern object using the given file as a template. */
  public FilePattern(Location file) {
    this(FilePattern.findPattern(file));
  }

  /**
   * Creates a pattern object using the given
   * filename and directory path as a template.
   */
  public FilePattern(String name, String dir) {
    this(FilePattern.findPattern(name, dir));
  }

  /**
   * Creates a pattern object for files with the given pattern string.
   *
   * Each numerical block has the form &lt;B-E:S&gt; (begin, end, step), where
   * the step may be omitted (defaulting to one) and a block holding a single
   * number, such as &lt;15&gt;, stands for that number alone.
   *
   * This constructor does not throw for a malformed pattern; instead the
   * object is left invalid (see {@link #isValid()}) with an explanation
   * available from {@link #getErrorMessage()}.
   *
   * @param pattern The file pattern string; may be null, in which case the
   *   resulting object is invalid.
   */
  public FilePattern(String pattern) {
    this.pattern = pattern;
    valid = false;
    if (pattern == null) {
      msg = "Null pattern string.";
      return;
    }

    // locate numerical blocks
    int[] lt = indicesOf(pattern, '<');
    int[] gt = indicesOf(pattern, '>');

    // assemble numerical block indices
    int num = lt.length;
    if (num != gt.length) {
      msg = "Mismatched numerical block markers.";
      return;
    }
    // build into locals so that the fields are never left half filled
    // when the marker order turns out to be bad
    int[] starts = new int[num];
    int[] ends = new int[num];
    for (int i=0; i<num; i++) {
      if (i > 0 && lt[i] < ends[i - 1]) {
        msg = "Bad numerical block marker order.";
        return;
      }
      starts[i] = lt[i];
      if (gt[i] <= starts[i]) {
        msg = "Bad numerical block marker order.";
        return;
      }
      ends[i] = gt[i] + 1;
    }
    startIndex = starts;
    endIndex = ends;

    // parse numerical blocks
    begin = new BigInteger[num];
    end = new BigInteger[num];
    step = new BigInteger[num];
    count = new int[num];
    fixed = new boolean[num];
    zeroes = new int[num];
    for (int i=0; i<num; i++) {
      String block = pattern.substring(startIndex[i], endIndex[i]);
      int close = block.length() - 1; // index of the closing marker
      int dash = block.indexOf("-");
      String b, e, s;
      if (dash < 0) {
        // no range; assume entire block is a single number (e.g., <15>)
        b = e = block.substring(1, close);
        s = "1";
      }
      else {
        int colon = block.indexOf(":");
        if (colon >= 0 && colon < dash) {
          // step separator ahead of the range separator (e.g., <1:2-5>);
          // slicing the end value would otherwise fail with a
          // StringIndexOutOfBoundsException
          msg = "Invalid numerical range values.";
          return;
        }
        b = block.substring(1, dash);
        if (colon < 0) {
          e = block.substring(dash + 1, close);
          s = "1";
        }
        else {
          e = block.substring(dash + 1, colon);
          s = block.substring(colon + 1, close);
        }
      }
      try {
        begin[i] = new BigInteger(b);
        end[i] = new BigInteger(e);
        if (begin[i].compareTo(end[i]) > 0) {
          msg = "Begin value cannot be greater than ending value.";
          return;
        }
        step[i] = new BigInteger(s);
        if (step[i].compareTo(BigInteger.ONE) < 0) {
          msg = "Step value must be at least one.";
          return;
        }
        count[i] = end[i].subtract(begin[i]).divide(step[i]).intValue() + 1;
        fixed[i] = b.length() == e.length();
        // count the leading zeroes of the end value, but never its final
        // digit: an all-zero value such as "0" or "00" still renders one
        // digit of its own, so counting it would pad one zero too many
        int z = 0;
        while (z < e.length() - 1 && e.charAt(z) == '0') z++;
        zeroes[i] = z;
      }
      catch (NumberFormatException exc) {
        msg = "Invalid numerical range values.";
        return;
      }
    }

    // build file listing
    Vector v = new Vector();
    buildFiles("", num, v);
    files = new String[v.size()];
    v.copyInto(files);

    valid = true;
  }

  // -- FilePattern API methods --

  /** Gets the file pattern string. */
  public String getPattern() { return pattern; }

  /** Gets whether the file pattern string is valid. */
  public boolean isValid() { return valid; }

  /** Gets the file pattern error message, if any. */
  public String getErrorMessage() { return msg; }

  /**
   * Gets the first number of each numerical block.
   * The internal array is returned; it is null if the pattern's block
   * markers could not be parsed.
   */
  public BigInteger[] getFirst() { return begin; }

  /**
   * Gets the last number of each numerical block.
   * The internal array is returned; it is null if the pattern's block
   * markers could not be parsed.
   */
  public BigInteger[] getLast() { return end; }

  /**
   * Gets the step increment of each numerical block.
   * The internal array is returned; it is null if the pattern's block
   * markers could not be parsed.
   */
  public BigInteger[] getStep() { return step; }

  /**
   * Gets the total count of each numerical block.
   * The internal array is returned; it is null if the pattern's block
   * markers could not be parsed.
   */
  public int[] getCount() { return count; }

  /**
   * Gets a listing of all files matching the given file pattern.
   * The listing is null if the pattern is invalid.
   */
  public String[] getFiles() { return files; }

  /**
   * Gets the specified numerical block, including its enclosing markers.
   *
   * @param i Index of the block.
   * @return The block text, or null if the index is out of range or the
   *   pattern's block markers could not be parsed.
   */
  public String getBlock(int i) {
    if (i < 0 || i >= blockCount()) return null;
    return pattern.substring(startIndex[i], endIndex[i]);
  }

  /**
   * Gets each numerical block. The array is empty if the pattern has no
   * blocks or its block markers could not be parsed.
   */
  public String[] getBlocks() {
    String[] s = new String[blockCount()];
    for (int i=0; i<s.length; i++) s[i] = getBlock(i);
    return s;
  }

  /**
   * Gets the pattern's text string before any numerical ranges.
   * The directory portion (everything up to the last file separator) is
   * excluded. If there are no numerical blocks, the text up to the last dot
   * is returned instead.
   *
   * @return The prefix, or null if the pattern string itself is null.
   */
  public String getPrefix() {
    if (pattern == null) return null;
    int s = pattern.lastIndexOf(File.separator) + 1;
    int e;
    if (blockCount() > 0) e = startIndex[0];
    else {
      int dot = pattern.lastIndexOf(".");
      e = dot < s ? pattern.length() : dot;
    }
    return s <= e ? pattern.substring(s, e) : "";
  }

  /**
   * Gets the pattern's text string after all numerical ranges.
   *
   * @return The suffix, or the entire pattern string (possibly null) if
   *   there are no numerical blocks.
   */
  public String getSuffix() {
    int n = blockCount();
    return n > 0 ? pattern.substring(endIndex[n - 1]) : pattern;
  }

  /**
   * Gets the pattern's text string before the given numerical block.
   * For the first block this is the text following the last file separator;
   * for later blocks it is the text following the previous block.
   *
   * @param i Index of the block.
   * @return The text, or null if the index is out of range, the pattern's
   *   block markers could not be parsed, or a file separator appears after
   *   the start of the first block.
   */
  public String getPrefix(int i) {
    if (i < 0 || i >= blockCount()) return null;
    int s = i > 0 ? endIndex[i - 1] :
      (pattern.lastIndexOf(File.separator) + 1);
    int e = startIndex[i];
    return s <= e ? pattern.substring(s, e) : null;
  }

  /**
   * Gets the pattern's text string before each numerical block. The array is
   * empty if the pattern has no blocks or its block markers could not be
   * parsed.
   */
  public String[] getPrefixes() {
    String[] s = new String[blockCount()];
    for (int i=0; i<s.length; i++) s[i] = getPrefix(i);
    return s;
  }

  // -- Utility methods --

  /**
   * Identifies the group pattern from a given file within that group.
   * @param file The file to use as a template for the match.
   */
  public static String findPattern(Location file) {
    return findPattern(file.getName(), file.getAbsoluteFile().getParent());
  }

  /**
   * Identifies the group pattern from a given file within that group.
   * @param file The file to use as a template for the match.
   */
  public static String findPattern(File file) {
    return findPattern(file.getName(), file.getAbsoluteFile().getParent());
  }

  /**
   * Identifies the group pattern from a given file within that group.
   * @param name The filename to use as a template for the match.
   * @param dir The directory in which to search for matching files.
   * @return The pattern, or null if the directory could not be listed or no
   *   consistent pattern could be determined.
   */
  public static String findPattern(String name, String dir) {
    dir = normalizeDir(dir);
    Location dirFile = new Location(dir.equals("") ? "." : dir);

    // list files in the given directory
    Location[] f = dirFile.listFiles();
    if (f == null) return null;
    String[] nameList = new String[f.length];
    for (int i=0; i<nameList.length; i++) nameList[i] = f[i].getName();

    return findPattern(name, dir, nameList);
  }

  /**
   * Identifies the group pattern from a given file within that group.
   * @param name The filename to use as a template for the match.
   * @param dir The directory prefix to use for matching files.
   * @param nameList The names through which to search for matching files.
   * @return The pattern (prefixed with the directory), or null if no
   *   consistent pattern could be determined.
   */
  public static String findPattern(String name, String dir,
    String[] nameList)
  {
    dir = normalizeDir(dir);

    // compile list of numerical blocks;
    // a name of length len holds at most (len + 1) / 2 separate digit runs
    int len = name.length();
    int bound = (len + 1) / 2;
    int[] indexList = new int[bound];
    int[] endList = new int[bound];
    int q = 0;
    boolean num = false;
    int ndx = -1, e = 0;
    for (int i=0; i<len; i++) {
      char c = name.charAt(i);
      if (c >= '0' && c <= '9') {
        if (num) e++;
        else {
          num = true;
          ndx = i;
          e = ndx + 1;
        }
      }
      else if (num) {
        num = false;
        indexList[q] = ndx;
        endList[q] = e;
        q++;
      }
    }
    if (num) {
      // the name ends inside a digit run
      indexList[q] = ndx;
      endList[q] = e;
      q++;
    }

    // analyze each block, building pattern as we go
    StringBuffer sb = new StringBuffer(dir);

    for (int i=0; i<q; i++) {
      int last = i > 0 ? endList[i - 1] : 0;
      sb.append(name.substring(last, indexList[i]));
      String pre = name.substring(0, indexList[i]);
      String post = name.substring(endList[i]);
      // NOTE(review): NumberFilter is declared elsewhere; presumably it
      // accepts names consisting of pre, a number, then post -- confirm
      NumberFilter filter = new NumberFilter(pre, post);
      String[] list = matchFiles(nameList, filter);
      if (list == null || list.length == 0) return null;
      if (list.length == 1) {
        // false alarm; this number block is constant
        sb.append(name.substring(indexList[i], endList[i]));
        continue;
      }
      boolean fix = true;
      for (int j=0; j<list.length; j++) {
        if (list[j].length() != len) {
          fix = false;
          break;
        }
      }
      if (fix) {
        // tricky; this fixed-width block could represent multiple numberings
        int width = endList[i] - indexList[i];

        // check each character for duplicates
        boolean[] same = new boolean[width];
        for (int j=0; j<width; j++) {
          same[j] = true;
          int jx = indexList[i] + j;
          char c = name.charAt(jx);
          for (int k=0; k<list.length; k++) {
            if (list[k].charAt(jx) != c) {
              same[j] = false;
              break;
            }
          }
        }

        // break down each sub-block
        int j = 0;
        while (j < width) {
          int jx = indexList[i] + j;
          if (same[j]) {
            // constant digit; copy it through verbatim
            sb.append(name.charAt(jx));
            j++;
          }
          else {
            // gather the whole run of varying digits and split it into
            // one or more numberings
            while (j < width && !same[j]) j++;
            String p = findPattern(name, nameList, jx, indexList[i] + j, "");
            if (p == null) {
              // unable to find an appropriate breakdown of numerical blocks
              return null;
            }
            sb.append(p);
          }
        }
      }
      else {
        // assume variable-width block represents only one numbering
        BigInteger[] numbers = new BigInteger[list.length];
        for (int j=0; j<list.length; j++) {
          numbers[j] = filter.getNumber(list[j]);
        }
        Arrays.sort(numbers);
        String bounds = getBounds(numbers, false);
        if (bounds == null) return null;
        sb.append(bounds);
      }
    }
    sb.append(q > 0 ? name.substring(endList[q - 1]) : name);
    return sb.toString();
  }

  // -- Utility helper methods --

  /**
   * Normalizes a directory prefix: null becomes the empty string (current
   * directory), and a non-empty value is given a trailing file separator.
   */
  private static String normalizeDir(String dir) {
    if (dir == null) return ""; // current directory
    if (!dir.equals("") && !dir.endsWith(File.separator)) {
      return dir + File.separator;
    }
    return dir;
  }

  /** Gets the indices of every occurrence of c within s, in order. */
  private static int[] indicesOf(String s, char c) {
    int n = 0;
    for (int i=s.indexOf(c); i>=0; i=s.indexOf(c, i + 1)) n++;
    int[] indices = new int[n];
    int k = 0;
    for (int i=s.indexOf(c); i>=0; i=s.indexOf(c, i + 1)) indices[k++] = i;
    return indices;
  }

  /**
   * Recursive method for parsing a fixed-width numerical block.
   * Tries the widest leading sub-block first, then progressively narrower
   * ones, recursing on whatever width remains.
   *
   * @param name The template filename.
   * @param nameList The names through which to search for matching files.
   * @param ndx Index into name where the unparsed portion begins.
   * @param end Index into name where the block ends (exclusive).
   * @param p The pattern text accumulated so far for this block.
   * @return The pattern text for the block, or null if no split works.
   */
  private static String findPattern(String name,
    String[] nameList, int ndx, int end, String p)
  {
    if (ndx == end) return p;
    for (int i=end-ndx; i>=1; i--) {
      NumberFilter filter = new NumberFilter(
        name.substring(0, ndx), name.substring(ndx + i));
      String[] list = matchFiles(nameList, filter);
      BigInteger[] numbers = new BigInteger[list.length];
      for (int j=0; j<list.length; j++) {
        numbers[j] = new BigInteger(list[j].substring(ndx, ndx + i));
      }
      Arrays.sort(numbers);
      String bounds = getBounds(numbers, true);
      if (bounds == null) continue;
      String pat = findPattern(name, nameList, ndx + i, end, p + bounds);
      if (pat != null) return pat;
    }
    // no combination worked; this parse path is infeasible
    return null;
  }

  /**
   * Gets a string containing start, end and step values
   * for a sorted list of numbers.
   *
   * @param numbers The numbers, sorted in ascending order.
   * @param fixed Whether to zero-pad the start value to the width of the
   *   end value.
   * @return A block of the form &lt;B-E&gt; or &lt;B-E:S&gt;, or null if
   *   there are fewer than two numbers or they are not evenly spaced with a
   *   nonzero step.
   */
  private static String getBounds(BigInteger[] numbers, boolean fixed) {
    if (numbers.length < 2) return null;
    BigInteger b = numbers[0];
    BigInteger e = numbers[numbers.length - 1];
    BigInteger s = numbers[1].subtract(b);
    if (s.equals(BigInteger.ZERO)) {
      // step size must be positive
      return null;
    }
    for (int i=2; i<numbers.length; i++) {
      if (!numbers[i].subtract(numbers[i - 1]).equals(s)) {
        // step size is not constant
        return null;
      }
    }
    String sb = b.toString();
    String se = e.toString();
    StringBuffer bounds = new StringBuffer("<");
    if (fixed) {
      int zeroes = se.length() - sb.length();
      for (int i=0; i<zeroes; i++) bounds.append("0");
    }
    bounds.append(sb);
    bounds.append("-");
    bounds.append(se);
    if (!s.equals(BigInteger.ONE)) {
      bounds.append(":");
      bounds.append(s);
    }
    bounds.append(">");
    return bounds.toString();
  }

  /** Filters the given list of filenames according to the specified filter. */
  private static String[] matchFiles(String[] inFiles, NumberFilter filter) {
    Vector v = new Vector();
    for (int i=0; i<inFiles.length; i++) {
      if (filter.accept(inFiles[i])) v.add(inFiles[i]);
    }
    String[] s = new String[v.size()];
    v.copyInto(s);
    return s;
  }

  // -- Helper methods --

  /** Gets the number of parsed numerical blocks (zero if none were parsed). */
  private int blockCount() {
    return startIndex == null ? 0 : startIndex.length;
  }

  /**
   * Recursive method for building filenames for the file listing.
   * Works from the last block back to the first, so that prefix always holds
   * the already expanded tail of the filename.
   *
   * @param prefix The expanded text following the fragment handled here.
   * @param ndx The number of leading blocks still to be expanded.
   * @param fileList The list to which completed filenames are added.
   */
  private void buildFiles(String prefix, int ndx, Vector fileList) {
    // compute bounds for constant (non-block) pattern fragment
    int num = startIndex.length;
    int n1 = ndx == 0 ? 0 : endIndex[ndx - 1];
    int n2 = ndx == num ? pattern.length() : startIndex[ndx];
    String pre = pattern.substring(n1, n2);

    if (ndx == 0) fileList.add(pre + prefix);
    else {
      // for (int i=begin[ndx]; i<=end[ndx]; i+=step[ndx])
      ndx--;
      int endLen = end[ndx].toString().length();
      BigInteger bi = begin[ndx];
      while (bi.compareTo(end[ndx]) <= 0) {
        String s = bi.toString();
        int z = zeroes[ndx];
        // fixed-width blocks are additionally padded to the end value's width
        if (fixed[ndx]) z += endLen - s.length();
        StringBuffer padded = new StringBuffer();
        for (int j=0; j<z; j++) padded.append('0');
        padded.append(s);
        buildFiles(padded.toString() + pre + prefix, ndx, fileList);
        bi = bi.add(step[ndx]);
      }
    }
  }

  // -- Main method --

  /** Method for testing file pattern logic. */
  public static void main(String[] args) {
    String pat = null;
    if (args.length > 0) {
      // test file pattern detection based on the given file on disk
      Location file = new Location(args[0]);
      LogTools.println("File = " + file.getAbsoluteFile());
      pat = findPattern(file);
    }
    else {
      // test file pattern detection from a virtual file list
      String[] nameList = new String[2 * 4 * 3 * 12 + 1];
      nameList[0] = "outlier.ext";
      int count = 1;
      for (int i=1; i<=2; i++) {
        for (int j=1; j<=4; j++) {
          for (int k=0; k<=2; k++) {
            for (int l=1; l<=12; l++) {
              String sl = (l < 10 ? "0" : "") + l;
              nameList[count++] =
                "hypothetical" + sl + k + j + "c" + i + ".ext";
            }
          }
        }
      }
      pat = findPattern(nameList[1], null, nameList);
    }
    if (pat == null) LogTools.println("No pattern found.");
    else {
      LogTools.println("Pattern = " + pat);
      FilePattern fp = new FilePattern(pat);
      if (fp.isValid()) {
        LogTools.println("Pattern is valid.");
        LogTools.println("Files:");
        String[] ids = fp.getFiles();
        for (int i=0; i<ids.length; i++) {
          LogTools.println("  #" + i + ": " + ids[i]);
        }
      }
      else LogTools.println("Pattern is invalid: " + fp.getErrorMessage());
    }
  }

}

// -- Notes --

// Some patterns observed:
//
//   TAABA1.PIC TAABA2.PIC TAABA3.PIC ... TAABA45.PIC
//
//   0m.tiff 3m.tiff 6m.tiff ... 36m.tiff
//
//   cell-Z0.C0.tiff cell-Z1.C0.tiff cell-Z2.C0.tiff ... cell-Z39.C0.tiff
//   cell-Z0.C1.tiff cell-Z1.C1.tiff cell-Z2.C1.tiff ... cell-Z39.C1.tiff
//
//   CRG401.PIC
//
//   TST00101.PIC TST00201.PIC TST00301.PIC
//   TST00102.PIC TST00202.PIC TST00302.PIC
//
//   800102.pic 800202.pic 800302.pic ... 805902.pic
//   800103.pic 800203.pic 800303.pic ... 805903.pic
//
//   nd400102.pic nd400202.pic nd400302.pic ... nd406002.pic
//   nd400103.pic nd400203.pic nd400303.pic ... nd406003.pic
//
//   WTERZ2_Series13_z000_ch00.tif ... WTERZ2_Series13_z018_ch00.tif
//
// --------------------------------------------------------------------------
//
// The file pattern notation defined here encompasses all patterns above.
//
//   TAABA<1-45>.PIC
//   <0-36:3>m.tiff
//   cell-Z<0-39>.C<0-1>.tiff
//   CRG401.PIC
//   TST00<1-3>0<1-2>.PIC
//   80<01-59>0<2-3>.pic
//   nd40<01-60>0<2-3>.pic
//   WTERZ2_Series13_z0<00-18>_ch00.tif
//
// In general: <B-E:S> where B is the start number, E is the end number, and S
// is the step increment. If zero padding has been used, the start number B
// will have leading zeroes to indicate that. If the step increment is one, it
// can be omitted.
//
// --------------------------------------------------------------------------
//
// If file groups not limited to numbering need to be handled, we can extend
// the notation as follows:
//
// A pattern such as:
//
//   ABCR.PIC ABCG.PIC ABCB.PIC
//
// Could be represented as:
//
//   ABC<R|G|B>.PIC
//
// If such cases come up, they will need to be identified heuristically and
// incorporated into the detection algorithm.
//
// --------------------------------------------------------------------------
//
// Here is a sketch of the algorithm for determining the pattern from a given
// file within a particular group:
//
//   01 - Detect number blocks within the file name, marking each with an
//        empty <> placeholder. For example:
//
//          xyz800303b.pic -> xyz<>b.pic
//
//        Where <> represents a numerical block with unknown properties.
//
//   02 - Get a file listing for all files matching the given pattern. In the
//        example above, we'd get:
//
//        xyz800102b.pic, xyz800202b.pic, ..., xyz805902b.pic,
//        xyz800103b.pic, xyz800203b.pic, ..., xyz805903b.pic
//
//   03 - There are two possibilities: "fixed width" and "variable width."
//
//        Variable width: Not all filenames are the same length in characters.
//        Assume the block only covers a single number. Extract that number
//        from each filename, sort them and analyze as described below.
//
//        Fixed width: All filenames are the same length in characters. The
//        block could represent more than one number.
//
//        First, for each character, determine if that character varies between
//        filenames. If not, lock it down, splitting the block as necessary
//        into fixed-width blocks. When finished, the above example looks like:
//
//          xyz80<2>0<1>b.pic
//
//        Where <N> represents a numerical block of width N.
//
//        For each remaining block, extract the numbers from each matching
//        filename, sort the lists, and analyze as described below.
//
//   04 - In either case, analyze each list of numbers. The first on the list
//        is B. The last one is E. And S is the second one minus B. But check
//        the list to make sure no numbers are missing for that step size.
//
// NOTE: The fixed width algorithm above is insufficient for patterns like
// "0101.pic" through "2531.pic," where no fixed constant pads the two
// numerical counts. An additional step is required, as follows:
//
//   05 - For each fixed-width block, recursively divide it into pieces, and
//        analyze the numerical scheme according to those pieces. For example,
//        in the problem case given above, we'd have:
//
//          <4>.pic
//
//        Recursively, we'd analyze:
//
//          <4>.pic
//          <3><R1>.pic
//          <2><R2>.pic
//          <1><R3>.pic
//
//        The <Rx> blocks represent recursive calls to analyze the remainder of
//        the width.
//
//        The function decides if a given combination of widths is valid by
//        determining if each individual width is valid. An individual width
//        is valid if the computed B, S and E properly cover the numerical set.
//
//        If no combination of widths is found to be valid, the file numbering
//        is screwy. Print an error message.