/**
 *
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
package org.opensextant.xtext;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import javax.activation.MimeType;

import net.sf.json.JSONObject;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.opensextant.data.DocInput;

/**
 * A document together with the text extracted from it and a JSON sheet of
 * metadata describing both the original file and the conversion.
 * An instance may be a parent (the default) or a child of another
 * ConvertedDocument; only one level of nesting is supported.
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
public final class ConvertedDocument extends DocInput {

    /** Forward-slash path separator. */
    public final static char UNIVERSAL_PATH_SEP = '/';

    /** Formatter for the yyyy-MM-dd dates stored in the metadata ("pub_date", "conversion_date"). */
    private static final DateTimeFormatter dtfmt = DateTimeFormat.forPattern("yyyy-MM-dd");

    /**
     * Metadata key: the url where this document (html, image, doc download) was found.
     */
    public final static String URL_FIELD = "url";

    /**
     * Metadata key: the url-referrer, i.e., the page containing the url.
     */
    public final static String URL_REFERRER_FIELD = "url-referrer";

    /**
     * Core metadata field names. Only these keys are accepted by the
     * addProperty() methods; see {@link #valid_fields}.
     */
    public final static String[] fields = {
            // Dublin Core style metadata fields
            "title", "author", "creator_tool", "pub_date", "keywords", "subject", "filepath", "encoding",
            //
            // XText metadata.
            "filtered", "converter", "conversion_date", "encrypted", "filesize", "textsize",
            // Consideration for compound documents; if this instance is a child doc then what is the parent?
            "xtext_id", // REQUIRED -- the current document ID.
            "xtext_parent_id", "xtext_parent_path",
            // Additional metadata for web content.
            URL_FIELD, URL_REFERRER_FIELD };

    /**
     * Converters will populate metadata. If the entry is an object or a file, its name will reflect that.
     * Interpreting the entry name as a file name on a file system is up to the recipient. E.g., Mail attachments
     * might be file names; Embedded objects may be object IDs.
     */
    public final static String CHILD_ENTRY_KEY = "entry.name";

    /**
     * If this instance is a child document/object, then this is the document it came from.
     */
    public ConvertedDocument parent = null;
    private List<ConvertedDocument> children = null;
    private List<Content> childrenContent = null;

    /** Set view of {@link #fields}, used to validate keys passed to addProperty(). */
    public final static Set<String> valid_fields = new HashSet<String>(Arrays.asList(fields));

    public String filepath = null;
    public String filename = null;
    public String extension = null;
    public String basename = null;
    public Date filetime = null;
    public Date create_date = null;
    public String create_date_text = null;
    public String relative_path = null;
    public String textpath = null;
    public String encoding = null;
    private MimeType mimeType = null;

    /** Metadata sheet; saved Base64-encoded alongside the text by {@link #saveBuffer(File)}. */
    JSONObject meta = new JSONObject();

    /** When false, an existing cached conversion newer than the original is kept rather than rewritten. */
    protected static boolean overwrite = true;

    /**
     * Duration in Milliseconds to convert
     */
    protected int conversion_time = 0;
    public boolean is_plaintext = false;
    public boolean is_converted = false;

    /**
     * Mail messages are ridiculously complex compound documents.
     * The parent document and all its attachments are marked as is_RFC822_attachment = true.
     * HTML and text formats are most susceptible to encoding issues.
     */
    public boolean is_RFC822_attachment = false;
    public boolean is_webArchive = false;
    public boolean do_convert = true;

    /**
     * Represents if conversion was actually saved or not OR if file was
     * retrieved from cache successfully.
     */
    public boolean is_cached = false;

    /** When true, setText() replaces Windows line endings (\r\n) with \n. */
    public static boolean CONVERT_TO_UNIX_EOL = true;

    private File file = null;
    private File folder = null;
    public long filesize = -1;
    private boolean isChild = false;
    private boolean isParent = true; // Default

    /**
     * Creates an empty document with no backing file.
     * Used only for uncaching previously saved converted docs.
     */
    public ConvertedDocument() {
        super(null, null);
    }

    public boolean WINDOWS_OS = FileUtility.isWindowsSystem();

    /**
     * Instantiates a new converted document from a file, recording its path,
     * name parts, size and file time, and using the absolute path as the initial ID.
     * A null item leaves all file-related fields unset.
     *
     * @param item file on disk
     */
    public ConvertedDocument(File item) {
        super(null, null);

        if (item != null) {
            this.file = item;
            this.filepath = item.getAbsolutePath();
            if (WINDOWS_OS) {
                // An effort to normalize paths. This should have no effect on existing
                // Caches of data on existing *nix deployments. TODO: Look at how URL could be used
                //   e.g., file:/xyz/file.txt
                //         file:/C:/xyz/file.txt  are OS-independent and use "/" always.
                //
                this.filepath = PathManager.fixPath(filepath);
            }

            this.folder = item.getParentFile();
            this.filename = item.getName();
            this.filetime = getFiletime();
            this.is_plaintext = FileUtility.isPlainText(filename);
            this.filesize = file.length();
            this.extension = FilenameUtils.getExtension(filename);
            this.basename = FilenameUtils.getBaseName(filename);

            addProperty("filesize", this.filesize);
            addProperty("filepath", this.filepath);
            addProperty("conversion_date", dtfmt.print(new Date().getTime()));

            // Fill out TextInput basics:
            setId(this.filepath);
        }
    }

    /**
     * Record a URL that represents the source of the document.
     *
     * @param url the url to the item, e.g., http:/a.b.com/folder/my.doc
     * @param referringURL the url where the doc was found, e.g., http:/a.b.com/folder/
     */
    public void addSourceURL(String url, String referringURL) {
        this.addProperty(URL_FIELD, url);
        this.addProperty(URL_REFERRER_FIELD, referringURL);
    }

    /**
     * Not that helpful: isChild or not is more meaningful.
     * @return true if this instance is a Parent item.
     */
    public boolean isParent() {
        return isParent;
    }

    /**
     * @return true if instance is a child document/object that was contained in or attached to some other document.
     */
    public boolean isChild() {
        return isChild;
    }

    /**
     * Marks this instance as a child or not; the parent flag is always set to the opposite.
     *
     * @param b true if this instance is a child
     */
    public void setIsChild(boolean b) {
        isChild = b;
        isParent = !isChild;
    }

    /** Folder that houses child objects and their conversions; children inherit it from their parent. */
    protected File parentContainer = null;

    /**
     * Representation of the parent containing document. This instance is marked
     * as a child even when the given parent is null; the parent metadata and
     * container are only recorded for a non-null parent.
     *
     * @param par parent obj
     */
    public void setParent(ConvertedDocument par) {
        // supporting only one level of nesting here. Parents have children.
        // Parents do not have parents, etc. Children do not have children.
        //
        isChild = true;
        isParent = false;
        parent = par;
        if (parent != null) {
            meta.put("xtext_parent_id", this.parent.id);
            meta.put("xtext_parent_path", this.parent.filepath);

            // Currently, Parent must be already converted; and if any text output exists
            // and was cached, then park children in the same parent folder.
            //
            parentContainer = parent.parentContainer;
        }
    }

    /**
     * If this doc is a Parent doc, then evaluate what its "container" should be, that is to house child objects and their conversions.
     * If it is a child, ignore -- ensure child.parentContainer = child.parent.parentContainer
     * Children do not get to choose.
     * Does nothing if the container was already chosen. Relies on {@link #textpath} being set.
     *
     * @deprecated -- prefer to have children archived with originals always. If you are pulling off binary data from originals (email, compound docs, etc) you will go nuts tracking it all.
     * @param saveEmbedded if embedded children should be saved to disk
     */
    @Deprecated
    public void evalParentContainer(boolean saveEmbedded) {
        if (!isParent) {
            return;
        }
        if (parentContainer != null) {
            return;
        }

        // This is the parent now.
        String parName = String.format("%s_%s", basename, extension);
        String parPath;
        if (saveEmbedded) {
            // parent obj is at Parent.xyz
            // parent textpath is at xtext/Parent.xyz.txt
            // create ./xtext/../Parent
            // child is at
            // parent textpath ../../Parent/
            parPath = new File(textpath).getParentFile().getParent();
        } else {
            parPath = new File(textpath).getParent();
        }

        parentContainer = new File(PathManager.makePath(parPath, parName));
    }

    /**
     * Evaluate a folder for archiving children close to where the parent original resides.
     * The container is named basename_extension inside the original's folder.
     * This differs from evalParentContainer(boolean), which tries to choose if children are
     * archived embedded with parents in ./xtext or in a parallel archive.
     * Does nothing for child documents or if the container was already chosen.
     */
    public void evalParentChildContainer() {
        if (!isParent) {
            return;
        }
        if (parentContainer != null) {
            return;
        }

        String parName = String.format("%s_%s", basename, extension);
        String parPath = folder.getAbsolutePath();
        parentContainer = new File(PathManager.makePath(parPath, parName));
    }

    /**
     * Add a child converted doc.
     * @param ch child doc
     */
    public void addChild(ConvertedDocument ch) {
        if (children == null) {
            children = new ArrayList<ConvertedDocument>();
        }

        /* You are adding a child item to a parent that is marked as an RFC822 document, so naturally the
         * child is now an RFC822 attachment.
         */
        if (is_RFC822_attachment) {
            ch.is_RFC822_attachment = true;
        }
        children.add(ch);
    }

    /**
     *
     * @return true if this is a parent and has ConvertedDocument children.
     */
    public boolean hasChildren() {
        return (children != null && !children.isEmpty());
    }

    /**
     * Add a raw (not yet converted) child object.
     *
     * @param child raw content found inside this document
     */
    public void addRawChild(Content child) {
        if (childrenContent == null) {
            childrenContent = new ArrayList<Content>();
        }
        childrenContent.add(child);
    }

    /**
     * true if this is a parent and has raw Content children, e.g., raw bytes + metadata
     * which can in turn be saved as Files and then Converted to children
     *
     * @return true if instance is a parent and it has non-trivial children
     */
    public boolean hasRawChildren() {
        return (childrenContent != null && !childrenContent.isEmpty());
    }

    /**
     * @return the internal list of raw children; null if none were ever added.
     */
    public List<Content> getRawChildren() {
        return childrenContent;
    }

    /**
     * @return the internal list of converted children; null if none were ever added.
     */
    public List<ConvertedDocument> getChildren() {
        return children;
    }

    /**
     * All properties are added as a string
     * @return new Map of properties; Copy of the internal JSON properties
     */
    public Map<String, String> getProperties() {
        Map<String, String> props = new HashMap<String, String>();
        for (Object fld : meta.keySet()) {
            props.put(fld.toString(), meta.getString(fld.toString()));
        }
        return props;
    }

    /**
     * DocInput abstraction. "Identity" of a document is subjective. By default it is the
     * filepath, but could easily be set to MD5 digest, UUID, or some external record ID for this item.
     * @param ident id of this instance.
     */
    public void setId(String ident) {
        this.id = ident;
    }

    /**
     * Sets the text encoding and records it in the metadata.
     *
     * @param enc text encoding
     */
    public void setEncoding(String enc) {
        this.encoding = enc;
        addProperty("encoding", enc);
    }

    /**
     * get the charset encoding.
     *
     * @return the character set encoding set by metadata discovery or by the setEncoding() method.
     */
    public String getEncoding() {
        return this.encoding;
    }

    /**
     * Get the mime type of the document, may be {@code null}.
     *
     * @return The mime type of the document, if available.
     */
    public MimeType getMimeType() {
        return mimeType;
    }

    /**
     * Set the mime type of the document, may be {@code null}.
     *
     * @param mimeType the mime type of the document.
     */
    public void setMimeType(MimeType mimeType) {
        this.mimeType = mimeType;
    }

    /**
     * get Filetime from original file.
     *
     * @return the filetime date obj; the stored value if set, otherwise the
     *         last-modified time of the file, or null if there is no file.
     */
    public Date getFiletime() {
        if (filetime != null) {
            return filetime;
        }
        if (this.file != null) {
            return new Date(this.file.lastModified());
        }
        return null;
    }

    /**
     * DocInput interface: getText
     *
     * @return buffer - the text
     */
    @Override
    public String getText() {
        return this.buffer;
    }

    /**
     * DocInput interface: getFilepath
     *
     * @return path to file
     */
    @Override
    public String getFilepath() {
        return this.filepath;
    }

    public File getFolder() {
        return this.folder;
    }

    public File getFile() {
        return this.file;
    }

    /**
     * DocInput interface: getTextpath
     *
     * @return path to text file conversion. Null if original is either ASCII or
     *         Unicode text.
     */
    @Override
    public String getTextpath() {
        return this.textpath;
    }

    /**
     * Reports if the doc has text available, after it was converted.
     * NOTE: this is false if you ask before it is converted.
     *
     * @return true if there is text available. false if the converters have not tried to set text or they tried and found no text.
     */
    public boolean hasText() {
        // The buffer stays null until setText() is called, so guard before measuring it.
        return buffer != null && buffer.length() > 0;
    }

    /**
     * Set default ID only after all conversion and all metadata has been acquired.
     * MD5 hash of text, if text is available, or of the filepath if file is empty.
     *
     * @throws IOException on err
     * @throws NoSuchAlgorithmException on err
     */
    public void setDefaultID() throws IOException, NoSuchAlgorithmException {
        if (hasText()) {
            id = TextUtils.text_id(getText());
        } else {
            id = TextUtils.text_id(filepath);
        }
    }

    /**
     * The whole point of this mess: get the text from the original. When conversion is
     * enabled, Windows line endings (\r\n) are normalized to unix line endings (\n) and
     * the text is trimmed. Blank text is stored as-is and no "textsize" is recorded.
     *
     * @param buf textual data for this document object
     * @throws UnsupportedEncodingException on err
     */
    public void setText(String buf) throws UnsupportedEncodingException {
        this.buffer = buf;
        if (StringUtils.isBlank(buffer)) {
            return;
        }

        // Now figure out if we have a converted document or not.
        if (do_convert) {
            if (CONVERT_TO_UNIX_EOL) {
                buffer = buffer.replace("\r\n", "\n");
            }
            buffer = buffer.trim();
            is_converted = true;
        } else if (is_plaintext) {
            // Plain text that is not converted: the original file is the text file.
            is_converted = false;
            textpath = this.filepath;
        }
        meta.put("textsize", buffer.length());
    }

    /**
     * @param k key for property
     * @return metadata value for k, or null if there is no such property
     */
    public String getProperty(String k) {
        return meta.optString(k, null);
    }

    /**
     * @param k property key
     * @return true if k is one of the core {@link #fields}
     */
    private boolean checkField(String k) {
        return valid_fields.contains(k);
    }

    /**
     * Add a core string property. Keys that are not in {@link #fields} are silently ignored.
     *
     * @param k key
     * @param v value
     */
    public void addProperty(String k, String v) {
        if (!checkField(k)) {
            return;
        }
        meta.put(k, v);
    }

    /**
     * Add a custom property of your own. No validation here.
     * Use addProperty to add only valid core fields.
     *
     * @param k key
     * @param v value
     */
    public void addUserProperty(String k, String v) {
        meta.put(k, v);
    }

    /**
     * Add a core numeric property. Keys that are not in {@link #fields} are silently ignored.
     *
     * @param k key
     * @param i value
     */
    public void addProperty(String k, long i) {
        if (!checkField(k)) {
            return;
        }
        meta.put(k, i);
    }

    /**
     * Add a core boolean property. Keys that are not in {@link #fields} are silently ignored.
     *
     * @param k key
     * @param b value
     */
    public void addProperty(String k, boolean b) {
        if (!checkField(k)) {
            return;
        }
        meta.put(k, b);
    }

    /**
     * Create date text is added only on conversion. If doc conversion is
     * retrieved from cache, caller should rely more on the "pub_date" property.
     * The text is stored verbatim; it is not parsed or validated here.
     *
     * @param d date string
     */
    public void addCreateDate(String d) {
        this.create_date_text = d;
        meta.put("pub_date", d);
    }

    /**
     * Create date obj is added only on conversion. If doc conversion is
     * retrieved from cache, caller should rely more on the "pub_date" property.
     * A null argument is ignored.
     *
     * @param d date object
     */
    public void addCreateDate(java.util.Calendar d) {
        if (d == null) {
            return;
        }
        create_date = d.getTime();
        meta.put("pub_date", dtfmt.print(create_date.getTime()));
    }

    /**
     * For convenience,... add using Date obj. A null argument is ignored.
     * @param d publication or creation date
     */
    public void addCreateDate(java.util.Date d) {
        if (d == null) {
            return;
        }
        create_date = d;
        meta.put("pub_date", dtfmt.print(create_date.getTime()));
    }

    /**
     * Sets the create date object from the "pub_date" property, if that property is present.
     */
    public void setCreateDate() {
        if (getProperty("pub_date") != null) {
            setCreateDate(getProperty("pub_date"));
        }
    }

    /**
     * string should be valid yyyy-mm-dd; blank input is ignored.
     * @param ymd date string
     */
    public void setCreateDate(String ymd) {
        if (StringUtils.isBlank(ymd)) {
            return;
        }
        DateTime joda = dtfmt.parseDateTime(ymd);
        if (joda != null) {
            create_date = joda.toDate();
        }
    }

    /**
     * Records the class name of the converter in the "converter" property.
     *
     * @param c converter class
     */
    public void addConverter(Class<?> c) {
        meta.put("converter", c.getName());
    }

    public void addTitle(String a) {
        meta.put("title", a);
    }

    public void addAuthor(String a) {
        meta.put("author", a);
    }

    /**
     * Find the relative path where this item should reside.
     * <pre>
     *   Given file /source/a/b/c.xyz
     *   where will reside in archive? /archive/.../source/a/b/c.xyz
     * </pre>
     * Parent/Child relationship is complicated still. For a child that has a
     * parent container, the path is computed relative to that container unless
     * childrenWithParent is true.
     *
     * @param container folder that represents the parent document
     * @param childrenWithParent true if you want to save child near parent.
     */
    public void setPathRelativeTo(String container, boolean childrenWithParent) {
        String relPath = container;
        if (isChild && parentContainer != null && !childrenWithParent) {
            relPath = parentContainer.getAbsolutePath();
        }
        this.relative_path = PathManager.getRelativePath(relPath, this.filepath);
    }

    public final static String OUTPUT_ENCODING = "UTF-8";
    public final static String CONVERTED_TEXT_EXT = "-utf8.txt";

    /**
     * Derives the output path for the saved text.
     * relative_path is original relative to input folder TODO: cleanup
     *
     * @param relpath path of the original
     * @return relpath + ".txt" for non-plaintext originals; relpath unchanged for
     *         plain text already in the output encoding; otherwise relpath with its
     *         last four characters replaced by {@link #CONVERTED_TEXT_EXT}.
     */
    private String getNewPath(String relpath) {
        if (!this.is_plaintext) {
            return relpath + ".txt";
        }

        if (OUTPUT_ENCODING.equalsIgnoreCase(encoding)) {
            return relpath;
        } else {
            // Remove ".txt" at end of file replacing it with something to denote
            // It is a transcoded text file.
            return relpath.substring(0, relpath.length() - 4) + CONVERTED_TEXT_EXT;
        }
    }

    /**
     * Save buffer to a folder, outputDir. The saved file has the layout
     * written by {@link #saveBuffer(File)}: the text buffer, a blank line, then
     * the metadata trailer.
     *
     * Text File is saved to your destination output folder. Nothing is saved
     * unless the document was converted.
     *
     * @param outputDir the output dir
     * @throws IOException on err
     */
    public void save(String outputDir) throws IOException {
        if (outputDir == null) {
            throw new NullPointerException("outputDir was null");
        }

        if (is_converted) {
            File target = new File(PathManager.makePath(outputDir, getNewPath(this.relative_path)));
            this._saveConversion(target);
        }
    }

    /**
     * Similar to save(), but forces the output folder to be ./xtext/ in the
     * same folder as the input archive. a/b/c/file.xxx to
     * a/b/c/xtext/file.xxx.txt
     * If a parent container is set, it is used in place of the original's folder.
     *
     * @throws IOException on err
     */
    public void saveEmbedded() throws IOException {
        if (is_converted) {
            String container = (parentContainer != null ? parentContainer.getAbsolutePath()
                    : new File(this.filepath).getParent());
            String targetPath = PathManager.getEmbeddedPath(container, getNewPath(this.filename));
            File target = new File(targetPath);
            this._saveConversion(target);
        }
    }

    /**
     * Internal function for saving buffer in the XText format.
     * IF the converted original file has a date/time later than that of the cached conversion,
     * this conversion cache will be overwritten. When overwriting is disabled and the
     * cached conversion is newer than the original, nothing is written and the
     * document is simply marked as cached.
     *
     * @param target cache path
     * @throws IOException on err
     */
    protected void _saveConversion(File target) throws IOException {
        if (!ConvertedDocument.overwrite && target.exists()) {
            // Don't save, Not overwriting.
            if (file.lastModified() < target.lastModified()) {
                this.is_cached = true;
                return;
            }
        }

        if (this.filetime == null) {
            this.filetime = new Date(target.lastModified());
        }
        meta.put("filetime", this.filetime.getTime());

        // Tracking Parent/Child objects.
        meta.put("xtext_id", this.id);

        FileUtility.makeDirectory(target.getParentFile());
        saveBuffer(target);
        textpath = PathManager.fixPath(target.getAbsolutePath());
        this.is_cached = true;
    }

    /**
     *
     * @return id of parent document.
     */
    public String getParentID() {
        return getProperty("xtext_parent_id");
    }

    /**
     *
     * @return path of parent document
     */
    public String getParentPath() {
        return getProperty("xtext_parent_path");
    }

    /** Label that prefixes the Base64-encoded metadata trailer in a saved conversion. */
    public static String XT_LABEL = "XT:";

    /**
     * Internally save Buffer with its metadata to a given filepath
     * Expert mode: use this only if you know what you are doing.
     * You can add additional metadata to the meta sheet using addProperty()
     * Then overwrite existing doc conversions.
     * <pre>
     *   TEXT BUFFER
     *   (blank line)
     *   XT:BASE64-ENCODED-JSON-METADATA
     * </pre>
     * The metadata is encoded with {@link #OUTPUT_ENCODING} before it is
     * Base64-encoded, so the trailer does not depend on the platform charset.
     *
     * @param target cached file to save a conversion.
     * @throws IOException on error saving content
     */
    public void saveBuffer(File target) throws IOException {
        StringBuilder buf = new StringBuilder();

        // SAVE conversions with a minimal Base64-encoded trailer
        // which when decoded is a JSON structure of metadata properties.
        buf.append(buffer);
        buf.append("\n\n");
        buf.append(XT_LABEL);
        // Explicit charset: String.getBytes() without arguments uses the platform default.
        buf.append(Base64.encodeBase64String(meta.toString().getBytes(OUTPUT_ENCODING)));
        buf.append("\n");
        FileUtility.writeFile(buf.toString(), target.getAbsolutePath(), OUTPUT_ENCODING, false);
    }
}