package org.opensextant.xtext; import java.io.File; import java.io.IOException; import java.util.Date; import net.sf.json.JSONObject; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.FilenameUtils; import org.opensextant.ConfigException; import org.opensextant.util.FileUtility; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * PathManager -- a group of routines related to caching conversions and archive collections. * It manages the path decisions given a variety of output parameters and the input object. * * @author ubaldino * */ public class PathManager { private final Logger log = LoggerFactory.getLogger(getClass()); /** * The folder where conversions are saved. */ private String conversionCache = null; /** * a Prefix path caller wishes to remove from input files and archives. This helps shorten paths in the cache. */ private String stripPrefixPath = null; /** * inputRootName = the name of the input in the output cache and export cache. */ private String inputRootName = null; /** * Embedded mode */ private boolean saveConversionsWithOriginals = false; /** * saveExtractedChildrenWithOriginals - determines how embedded items are archived, e.g., Email attachments, or embedded images. * They are children to some parent container -- XText yields two things: the original child, and the conversion of the child. * * Example: a.doc (child) saved from A.eml (parent) * * saveExtractedChildrenWithOriginals = True; a is saved in same folder where A exists * saveExtractedChildrenWithOriginals = False; a is saved in a separate output archive. */ private boolean saveExtractedChildrenWithOriginals = false; private boolean saving = false; /** * The overall flag to save converted output or not. DEFAULT: true = save * it; provided caller specifies either saveWithInput or provides an * archiveRoot * * @param b true to allow saving/caching conversions */ public void enableSaving(boolean b) { saving = b; } public boolean isSaving() { return saving; } public String getConversionCache() { return conversionCache; } /** * Set the prefix that will be removed from the leading part of paths as conversions are cached. * Have a long, long file path? And want to shorten it in your cache... choose this prefix after thinking about it. * If you strip too much you may end up with name conflicts or not enough organization to the cached stuff. * * NOTE: an error/warning is printed only if the prefix does not exist. This is not an exception or error, as you * might get the paths to items from some other method and they may not actually exist physically on disk. * * @param p a prefix path that would be found in the absolute path of documents being converted. */ public void setStripPrefixPath(String p) { stripPrefixPath = p; if (p != null) { if (!new File(p).exists()) { log.error("Warning prefix Path does not exist: {}", p); } } } public String getStipPrefixPath() { return stripPrefixPath; } public boolean hasInputRoot() { return (inputRootName != null); } /** * From the provided caching parameters set ahead of time, infer the location * where this input should be located within the archive, relatively. This * should only be set once at the top level, that is * <ul><li>if the call is to convert a single file, set it once for the file. * <li>if the call is to convert a folder, set it once. * <li>... for archives, etc. set it once! * </ul> * Do not set it for each file when traversing a folder contents. * <br> * NOTA BENE: Set conversion cache location first. * * <pre> * cache = /output/converted/ * * input = /my/original/abc.zip ==> /output/converted/abc_zip * input = /my/original/abc.doc ==> /output/converted/abc.doc.txt * input = /my/original/abc/ ==> /output/converted/abc/ * * * prefix set, as prefix=/my * * input = /my/original/abc.zip ==> /output/converted/original/abc_zip * input = /my/original/abc.doc ==> /output/converted/original/abc.doc.txt * input = /my/original/abc/ ==> /output/converted/original/abc/ * * if saved-in-input, none of this applies. * </pre> * * If you are caching conversions in an archive folder, A, then * this generally sets your ouputNode to /A/name/ * * An items saved here will be of the form /A/name/relative/path * For an input that came from /some/input/name/relative/path * @param input root of your input which will be crawled * @throws IOException if path does not exist or is not a directory */ public void setInputRoot(File input) throws IOException { if (!saving) { return; } // Reset globals. // inputRootName = (input.isDirectory() ? input.getName() : input.getParentFile().getName()); String appendRoot = (input.isDirectory() ? "." : input.getParentFile().getName()); outputNode = null; if (conversionCache != null) { // DEFAULT: for files and when not using strip path, the cache folder is literally used. outputNode = conversionCache; if (stripPrefixPath != null) { File testDir = input.isDirectory() ? input : input.getParentFile(); outputNode = createPath(conversionCache, this.getStrippedInputPath(testDir)); } else if (input.isDirectory()) { outputNode = createPath(conversionCache, appendRoot); } } } /** * This enables saving in an archive and disables saving with input. * * @param root container where conversions will be cached. * @throws IOException on err * @see #enableSaveInCache(boolean) * @see #enableSaveWithInput(boolean) */ public void setConversionCache(String root) throws IOException { if (root == null) { throw new IOException("Archive cannot be null"); } // User tried setting a non-null archive... so implicitly they are not saving with input // this.enableSaveInCache(true); conversionCache = fixPath(root); File test = new File(conversionCache); if (!test.exists() || !test.isDirectory()) { throw new IOException("Archive root directory must exist. Non-existant DIR=" + test); } conversionCache = test.getAbsolutePath(); } /** * Save converted content with input. Xtext creates a new "xtext" folder in * the containing folder of the current file. This is disabled if a * non-null, pre-existing archive root is set. * * <pre> * input is: a/b/c.doc * saved as: a/b/xtext/c.doc.txt * * DEFAULT: do not save in input folder * </pre> * * @param b flag, true = save conversions close to given input * @see #setConversionCache(java.lang.String) */ public void enableSaveWithInput(boolean b) { saveConversionsWithOriginals = b; } public boolean isSaveWithInput() { return saveConversionsWithOriginals; } /** * Experimental. * * ON by default. If you have email, for example, folder/A.eml * then children will appear at folder/A_eml/child.doc for some child.doc attachment. * Behavior may differ in each case. But essentially, this flag directs XText to write back to inputRoot * * Embedded parent/child docs (email, compound docs, etc) are special cases, * @param b true if children objects should be extracted and save near input */ public void enableSaveChildrenWithInput(boolean b) { saveExtractedChildrenWithOriginals = b; } /** * Saving to an archive specified by the caller; This is inferred if a * non-null, pre-existing archive root is set; DEFAULT: do not save in * archive. * * <pre> * input is: a/b/c.doc * output is: archiveRoot/a/b/c.doc.txt * </pre> * * @param b true if save in cache, not with input. * @see #setConversionCache(java.lang.String) */ public void enableSaveInCache(boolean b) { saveConversionsWithOriginals = !b; } private String outputNode; /** * Caller is responsible for checking null. * * @param path a path * @return trimmed path */ public static String trimLeadingSlash(String path) { if (path.length() == 0) { return path; } if (path.charAt(0) == '/') { return path.substring(1); } return path; } /** * Prepares a relative path, stripped of the prefix if one is provided. * Otherwise, the input path is returned less a leading slash. * @param obj a file. * @return stripped path */ public String getStrippedInputPath(File obj) { String root = obj.getAbsolutePath(); if (stripPrefixPath != null && root.startsWith(stripPrefixPath)) { root = root.substring(stripPrefixPath.length()); } root = trimLeadingSlash(root); return root; } /** * Most of the path mechanics are string-based, rather than file-system based, * so path adjustments are best done to be sure all paths from configuration * or from inputs should conform to a common convention. paths will be more like URLs, using * "/" as the standard path separator. * * TODO: commons-io FilenameUtils.normalize() does not work quite right across platforms. Review, Retest. * * @param p path * @return fixed path */ protected static String fixPath(String p) { if (p == null) { return null; } String relPath = p.replace('\\', '/').replace("/./", "/"); return relPath.startsWith("./") ? relPath.substring(2) : relPath; } /** * NOTE: Use of File() or FilenameUtils.concat() are OS dependent, here * what we want is more like a URL string representation always using /a/b/c/ * Instead of potentially \ and/or / mixed. * @param dir containing dir * @param item item to save in subfolder * @return path resulting path * @throws IOException on err, e.g. permissions or disk full, etc. */ protected static String createPath(String dir, String item) throws IOException { File f = new File(String.format("%s/%s", dir, item)); return fixPath(f.getAbsolutePath()); } private String extractedChildrenCache = null; public void setExtractedChildrenCache(String folder) { extractedChildrenCache = folder; } public String getExtractedChildrenCache() { return extractedChildrenCache; } /** * Run by XText.setup() to verify path issues. * * @throws IOException on err */ public void configure() throws IOException { if (saving && !this.saveConversionsWithOriginals && this.conversionCache == null) { throw new IOException( "If not saving conversions with your input folders, you must provide an archive path"); } if (extractedChildrenCache != null) { if (!new File(extractedChildrenCache).exists()) { throw new IOException( "If saving child items from archives or PST files, you must create the parent folder first. Dir does not exist:" + extractedChildrenCache); } } } /** * Wrapper around logic to save a conversion. Save with input or save in other output folder. * * @param textDoc converted doc to save * @throws IOException on err */ public void saveConversion(ConvertedDocument textDoc) throws IOException { log.debug("FILE={}, cache-in={}", textDoc.getFile(), outputNode); if (this.saveConversionsWithOriginals) { // Saves close to original in ./text/ folder where // original resides. textDoc.saveEmbedded(); } else { String searchPath = String.format("/%s/", inputRootName); textDoc.setPathRelativeTo(searchPath, this.saveExtractedChildrenWithOriginals); textDoc.save(outputNode); } } public ConvertedDocument getCachedConversion(File input) throws IOException { if (this.saveConversionsWithOriginals) { // Uncache a file close to the original F <== ./xtext/F.txt return getEmbeddedConversion(input); } else if (this.inputRootName != null) { // Only if the caller is using the XText API extracText(), then // will this work. // If user is trying to call convertFile(path) directly all the // various optimizations here // will not necessarily make sense. // // // Uncache a file in some other tree of archives that aligns // with the tree of the original source. // .../mine/source/path/F <==== /archive/source/path/F.txt return getCachedConversion(this.outputNode, this.inputRootName, input); } // Either no cache set or item was not converted. // Item may have not been converted due to error or simply it was already plain text. return null; } public File getArchiveExportDir(File input) throws ConfigException, IOException { String aName = FilenameUtils.getBaseName(input.getName()); String aExt = FilenameUtils.getExtension(input.getName()); String outputName = String.format("%s_%s", aName, aExt.toLowerCase()); // Set output name to input name. That is, once we extract A.zip to ./(originals)/A_zip/ this de-archived folder will // Also exist in ./(converted)/A_zip/ or ./(originals)/A_zip/xtext/ embedded. // //setOutputNode(inputNode); String saveTo = null; // unpack, traverse, convert, save if (extractedChildrenCache != null) { // Save converted items in a parallel archive for this zip archive. saveTo = PathManager.createPath(extractedChildrenCache, outputName); } else if (this.saveExtractedChildrenWithOriginals) { saveTo = PathManager.createPath(input.getParentFile().getAbsolutePath(), outputName); } else { throw new ConfigException( "Archive Files cannot be dearchived without a target folder to store child binaries"); } File saveFolder = new File(saveTo); if (!saveFolder.exists()) { FileUtility.makeDirectory(saveFolder); } log.debug("ARCHIVE FILE={}, node-in={}, cache-in={}, export={}", input, outputName, outputNode, saveFolder); return saveFolder; } public boolean verifyArchiveExport(String input) { if (!this.saveConversionsWithOriginals && !this.saveExtractedChildrenWithOriginals && this.conversionCache == null) { log.error( "Sorry -- if not saving in input folder, you must provide a separate " + "archive to contain ZIP and other archives that are extracted. Ignoring FILE={}", input); return false; } return true; } public static String DEFAULT_EMBED_FOLDER = "xtext"; /** * This provides some means for retrieving previously converted files. .... * to avoid converted them. * * @param obj item to retrieve from cache * @return doc ConvertedDocument from cache, otherwise null * @throws IOException on err */ public static ConvertedDocument getEmbeddedConversion(File obj) throws IOException { String cacheFolder = makePath(PathManager.fixPath(obj.getParent()), DEFAULT_EMBED_FOLDER); // I now have a path name that was likely the one stored in cache. // Return the ConvertedDocument if exists at this path. // Otherwise it is not in cache.... so converter must convert and save. // // This instance finds file:./xtext/F.ext.txt for a file:./F.ext // return _uncacheConversion(cacheFolder, obj.getName()); } /** * Given file /a/b/c.txt find me just the relative part to some root. That * is, for example, if we care more about the b folder regardless of that it * is physically located in /a. Perform:<pre> * * getRelativePath( "/a", "/a/b/c.txt") ===> b/c.txt</pre> * @param root prefix path * @param p full path to an item. * @return relative path wrt root */ public static String getRelativePath(String root, String p) { String _path = PathManager.fixPath(p); int x = _path.indexOf(root); // Possibly a relative root. if (x < 0) { return p; // "root" not found in p. No relation between root and path given. } return trimLeadingSlash(_path.substring(x)); } /** * Pass in a folder. and the name of the object to uncache. * * @param path containing folder * @param fname original file name sought * @return previously converted document or null if not found. * @throws IOException on error, likely from getCachedDocument */ private static ConvertedDocument _uncacheConversion(String path, String fname) throws IOException { // Common String targetPath = null; if (fname.endsWith(".txt")) { String cachedFile = FilenameUtils.getBaseName(fname); targetPath = String.format("%s/%s-utf8.txt", path, cachedFile); } else { targetPath = String.format("%s/%s.txt", path, fname); } File target = new File(targetPath); if (target.exists()) { return getCachedDocument(target); } return null; } /** * This provides some means for retrieving previously converted files. .... * to avoid converted them. This method takes the arguments and tries to infer the * actual location of a cached item. * TODO: For compound documents this needs more work. * * @param cacheDir shadow dir or separate archive path * @param inputDir original input folder where this item came from * @param obj the requested file. * @return the cached version of the conversion; null if not found or if no conversion was made. * @throws IOException on err */ public static ConvertedDocument getCachedConversion(String cacheDir, String inputDir, File obj) throws IOException { String rel_path = getRelativePath(inputDir, obj.getParentFile().getAbsolutePath()); // This folder contains the cached Item. String cacheFolder = makePath(cacheDir, rel_path); // I now have a path name that was likely the one stored in cache. // Return the ConvertedDocument if exists at this path. // Otherwise it is not in cache.... so converter must convert and save. // // This instance finds file:/<output-path>/<input-dir-name>/<relative-path-to-file>.txt // (shorter: /O/D/relpath/file.ext.txt ) // // for binary /inputpath/D/relpath/file.ext // // you gave me: C:\data\source\ // you said output goes to // D:\archives\ // // I found file C:\data\source\something\file.doc // // Which is to be cached at: // D:\archives\source\something\file.doc.txt // ^^^^^^^^^^^|inputdir|relpath^^^^^^^^^^^^^^ // outputdir | | // // IFFF a conversion happened. // If no conversion was made, then the original file is either // unconvertable or it is already valid UTF-8 or ASCII-only text/plain. // return _uncacheConversion(cacheFolder, obj.getName()); } /** * Apache Commons file utils "concat(dir, file)" makes a mess of file names. * Java can support "/" equally well on all platforms. * there is no apparent need to use platform specific file separators in this context. * @param dir containing dir * @param fname file name * @return full path. */ protected static String makePath(File dir, String fname) { return makePath(dir.getAbsolutePath(), fname); } /** * Apache Commons file utils "concat(dir, file)" makes a mess of file names. * Java can support "/" equally well on all platforms. * there is no apparent need to use platform specific file separators in this context. * @param dir containing dir * @param fname file name * @return full path. */ protected static String makePath(String dir, String fname) { return String.format("%s%s%s", dir, ConvertedDocument.UNIVERSAL_PATH_SEP, fname); } public static String getEmbeddedPath(String container, String item) { StringBuilder path = new StringBuilder(); path.append(container); path.append(ConvertedDocument.UNIVERSAL_PATH_SEP); path.append(DEFAULT_EMBED_FOLDER); path.append(ConvertedDocument.UNIVERSAL_PATH_SEP); path.append(item); return path.toString(); } public final static String DEFAULT_EMBED_FOLDER_IN_PATH = String.format("/%s/", DEFAULT_EMBED_FOLDER); public final static String DEFAULT_EMBED_FOLDER_IN_WINPATH = String.format("\\%s\\", DEFAULT_EMBED_FOLDER); /** * Simple test to see if filepath contains "./xtext/" for windows path or unix path. * @param filepath path to test * @return true if file parent is "/xtext/" or "\xtext\, case sensitive is found anywhere in path. */ public final static boolean isXTextCache(String filepath) { if (filepath.contains(DEFAULT_EMBED_FOLDER_IN_PATH)) { return true; } // Less often used: if (filepath.contains(DEFAULT_EMBED_FOLDER_IN_WINPATH)) { return true; } return false; } /** * If a File is provided, this only checks the immediate parent folder. * * @param obj path to test. * @return true if file parent is "xtext", case sensitive. */ public final static boolean isXTextCache(File obj) { return DEFAULT_EMBED_FOLDER.equals(obj.getParentFile().getName()); } /** * Given a path, retrieve a document. * * @param filepath file to retireve. * @return the cached document * @throws IOException on err */ public static ConvertedDocument getCachedDocument(String filepath) throws IOException { return getCachedDocument(new File(filepath)); } /** * Given a path, retrieve a document parsing out the XText format. * * @param fconv file conversion path to check * @return the cached document, if exists * @throws IOException on err */ public static ConvertedDocument getCachedDocument(File fconv) throws IOException { String buf = FileUtility.readFile(fconv); int x = buf.lastIndexOf("\n\n"); // Get Base64 encoded header String header = buf.substring(x).trim(); if (!header.startsWith(ConvertedDocument.XT_LABEL)) { // NOT an XText cache return null; } // Decode JSON String json = new String(Base64.decodeBase64(header.substring(ConvertedDocument.XT_LABEL .length()))); JSONObject doc_meta = JSONObject.fromObject(json); String fpath = doc_meta.getString("filepath"); ConvertedDocument doc = new ConvertedDocument(new File(fpath)); doc.meta = doc_meta; // Set plain text buffer doc.buffer = buf.substring(0, x); // Retrieve values for useful attrs. doc.encoding = doc.getProperty("encoding"); doc.filepath = fpath; /* note: path should already have been normalized, using "/" */ doc.filesize = Long.parseLong(doc.getProperty("filesize")); doc.textpath = fconv.getAbsolutePath(); doc.is_cached = true; doc.is_converted = true; doc.filetime = new Date(Long.parseLong(doc.getProperty("filetime"))); doc.setCreateDate(); // DocInput requirement: provided id + file paths // If there is another Identifier to use,... caller will have an opportunity to set it // when the get the instance. // String idvalue = doc.meta.optString("xtext_id"); doc.setId(idvalue != null ? idvalue : doc.filepath); return doc; } }