package org.opensextant.xtext.collectors.mailbox; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Vector; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.opensextant.ConfigException; import org.opensextant.util.FileUtility; import org.opensextant.util.TextUtils; import org.opensextant.xtext.XText; import org.opensextant.xtext.collectors.CollectionListener; import org.opensextant.xtext.collectors.Collector; import org.opensextant.xtext.converters.MessageConverter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.pff.PSTAppointment; import com.pff.PSTAttachment; import com.pff.PSTContact; import com.pff.PSTDistList; import com.pff.PSTException; import com.pff.PSTFile; import com.pff.PSTFolder; import com.pff.PSTMessage; import com.pff.PSTObject; import com.pff.PSTRecipient; /** * OutlookPSTCrawler traverses a PST file and pulls out: E-Mail files + attachments, Contacts, Appointments, etc. * saving originals in a reasonable folder structure. * * Mail in particular will be foldered by date-line, then short subject name folder. * * Given SomeFile.PST, the resultant output folder created will look like the example below. * <pre> * SomeFile_PST/ * + Contacts * + Appointments * + Mail-Inbox * +--2014-04-05 * +-- RE__Shipping_Invoice/ * +-- RE__Shipping_Invoice.eml * +-- Invoice.PDF * </pre> * * Supposedly on April 5th, 2014, an email was received (Mail-Inbox); titled "RE: Shipping Invoice". * To make the archival of such content obvious, the subject line is converted to a safe folder/filename. * * Usage: * * <code> * crawler = OutlookPSTCrawler( path ) * //crawer... set modes, set listener, converter, etc. * crawler.configure() * crawler.collect() * * </code> * @author ubaldino * */ public class OutlookPSTCrawler implements Collector { /** * A collection listener to consult as far as how to record the found & converted content * as well as to determine what is worth saving. * */ protected CollectionListener listener = null; private final Logger log = LoggerFactory.getLogger(getClass()); private static final DateTimeFormatter FOLDER_DATE = DateTimeFormat.forPattern("yyyy-MM-dd"); private File pst = null; private String defaultOutputName = null; private File outputDir = null; // The parent folder that will contain the output. /tmp/ private File outputPSTDir = null; // The output folder. /tmp/My_pst /** * Incremental mode simply allows you to reuse the same folder to contain the output interleaving new items */ public boolean incrementalMode = true; /** * Overwite mode allows the crawler to overwrite existing folders, attachments, objects etc by the same name. */ public boolean overwriteMode = false; /** * * @param pstFilepath input PST * @throws IOException if file fails to load */ public OutlookPSTCrawler(String pstFilepath) throws IOException { this(new File(pstFilepath)); } /** * * @param pstFile input PST * @throws IOException if file fails to load */ public OutlookPSTCrawler(File pstFile) throws IOException { pst = pstFile; if (!pst.exists()) { throw new IOException("PST file does not exist: " + pstFile.getAbsolutePath()); } defaultOutputName = FilenameUtils.getBaseName(pst.getName()) + "_pst"; } /** * Beware -- you can set the path for the PST output (outputPSTDir) or you can set the path its parent path (outputDir). * Outside apps may want to control the path setup. To use the default, setOutputDir(); configure(); * @throws ConfigException if output folder could not be set */ public void configure() throws ConfigException { if (outputPSTDir == null) { if (outputDir == null) { throw new ConfigException("Output Dir is not configured"); } if (outputDir.exists()) { outputPSTDir = new File(String.format("%s/%s", outputDir.getAbsolutePath(), defaultOutputName)); if (!incrementalMode && outputPSTDir.exists()) { throw new ConfigException( "Output Dir contains target, but you are not in overwrite mode"); } if (!outputPSTDir.exists()) { try { FileUtility.makeDirectory(outputPSTDir); } catch (IOException err) { throw new ConfigException("Unable to create target", err); } } } else { throw new ConfigException( "Please create containing output directory. DIR does not exist:" + outputDir.getAbsolutePath()); } } log.info(" Input: PST = " + pst.getAbsolutePath()); log.info(" Modes: Incremental =" + incrementalMode); log.info(" Modes: Overwrite =" + overwriteMode); log.info(" Output: Target " + outputPSTDir); } @Override public String getName() { return "OutlookPSTCrawler by XText"; } @Override public void collect() throws IOException, ConfigException { // // Logic: Traverse PST file. // it contains mail, contacts, tasks, notes, other stuff? // // Replicate folder structure discovered. // Mail and date-oriented items should be filed by date. For now, YYYY-MM-DD is fine. // // For mail messages, review DefaultMailCralwer: // - for each message // save message to disk; create parent folder to contain message contents // run text conversion individually on attachments. // // - structure: // ./Mail/ // 2014-04-09/messageABC.eml // 2014-04-09/messageABC/attachment1.doc log.info("Traversing PST Folders for FILE={}", pst); try { PSTFile pstStore = new PSTFile(pst); processFolder(pstStore.getRootFolder()); } catch (PSTException err) { throw new ConfigException("Failure with PST traversal", err); } } private XText converter = null; /** * If a converter is provided, it will be used to convert attachments. * PST API does not provide access to full email stream, so most objects - tasks, mail, contacts, etc. * are retrieved as text already. * * Caller is responsible for mananging the XText caching options. * * @param conversionManager XText instance */ public void setConverter(XText conversionManager) { converter = conversionManager; } private int depth = 0; private final int maxDepth = 10; /** * * @param folder found folder from PST * @throws PSTException PST API error * @throws IOException I/O failure * @throws ConfigException XText configuration error */ protected void processFolder(PSTFolder folder) throws PSTException, IOException, ConfigException { log.info("Folder:" + folder.getDisplayName()); ++depth; if (depth >= maxDepth) { --depth; log.error("MAX DEPTH reached. Avoid infinite recursion"); return; } if (folder.hasSubfolders()) { Vector<PSTFolder> children = folder.getSubFolders(); for (PSTFolder child : children) { processFolder(child); } } log.info("\t\tProcessing content items"); int count = folder.getContentCount(); if (count > 0) { PSTObject msg = null; while ((msg = folder.getNextChild()) != null) { // As libPST is organized with PSTMessage (email) being a base class, it must only be used as a default. // Try all other subclasses first. // String savedItem = null; if (msg instanceof PSTContact) { savedItem = processContact("Contacts", folder.getDisplayName(), (PSTContact) msg); } else if (msg instanceof PSTDistList) { savedItem = processDistList("Lists", folder.getDisplayName(), (PSTDistList) msg); } else if (msg instanceof PSTAppointment) { savedItem = processAppointment("Appointments", folder.getDisplayName(), (PSTAppointment) msg); } else if (msg instanceof PSTMessage) { processMessage(folder.getDisplayName(), (PSTMessage) msg); } else { log.info("\tItem: {}; Type:{} created at {}", msg.getDisplayName(), msg.getMessageClass(), msg.getCreationTime()); } if (savedItem != null && listener != null) { listener.collected(new File(savedItem)); } } } --depth; } /** * * @param grp A major group of Outlook objects * @param sub a subgroup * @return resulting grouping string that represents an output subfolder * @throws IOException on I/O failure */ protected File createGroupFolder(String grp, String sub) throws IOException { String groupFolder = null; if (sub.equalsIgnoreCase(grp)) { groupFolder = grp; } else { groupFolder = String.format("%s/%s", grp, sub); } File grpFile = new File(String.format("%s/%s", this.outputPSTDir.getAbsolutePath(), groupFolder)); if (!grpFile.exists()) { FileUtility.makeDirectory(grpFile); } return grpFile; } /** * * @param groupName group name * @param folderName folder name * @param appt PST API object * @return file name of processed object * @throws IOException err * @throws PSTException err * @throws ConfigException err */ public String processAppointment(String groupName, String folderName, PSTAppointment appt) throws IOException, PSTException, ConfigException { File appts = createGroupFolder(groupName, folderName); String fname = MessageConverter.createSafeFilename(appt.getSubject()); StringBuilder buf = new StringBuilder(); buf.append(appt.getSubject()); buf.append(" appointment"); buf.append(ITEM_SEP); List<String> rList = getRecipients(appt); if (rList != null) { addHeaderText(buf, "X-recipients", StringUtils.join(rList, "; ")); } else { addHeaderText(buf, "X-recipients", "No Recipients"); } // Get a list of attachments. List<String> attFiles = processAttachments(appt, appts); if (attFiles != null && !attFiles.isEmpty()) { addHeaderText(buf, "X-attchments", StringUtils.join(attFiles, "; ")); } buf.append("\n\n"); buf.append(appt.getBody().trim()); //formatFields(parseValidEntries(appt.toString()), buf); String savedPath = makePath(appts, fname + ".txt"); FileUtility.writeFile(buf.toString(), savedPath); return savedPath; } /** * * @param appt PST API object * @return list of recipients * @throws PSTException err * @throws IOException err */ public List<String> getRecipients(PSTAppointment appt) throws PSTException, IOException { int recipients = appt.getNumberOfRecipients(); if (recipients > 0) { List<String> rList = new ArrayList<String>(); for (int r = 0; r < recipients; ++r) { PSTRecipient R = appt.getRecipient(r); rList.add(String.format("%s <%s>", R.getDisplayName(), R.getEmailAddress())); } return rList; } return null; } private static String ITEM_SEP = "\n=====================\n"; /** * Save contact to a file. This uses the less elegant "toString()" method, which prints to buffer all fields * even if they are empty or null. * * @param groupName string * @param folderName string * @param contact PST API object * @return saved path * @throws IOException on err */ public String processContact(String groupName, String folderName, PSTContact contact) throws IOException { File contacts = createGroupFolder(groupName, folderName); String fname = MessageConverter.createSafeFilename(contact.getDisplayName()); StringBuilder buf = new StringBuilder(); buf.append(contact.getDisplayName()); buf.append(" contact"); buf.append(ITEM_SEP); formatFields(parseValidEntries(contact.toString()), buf); String savedPath = makePath(contacts, fname + ".txt"); FileUtility.writeFile(buf.toString(), savedPath); return savedPath; } /** * Distribution Lists take from Contacts * * @param groupName string * @param folderName string * @param list PST API list * @return saved path * * @throws IOException on err * @throws PSTException on err */ public String processDistList(String groupName, String folderName, PSTDistList list) throws IOException, PSTException { File contacts = createGroupFolder(groupName, folderName); String fname = MessageConverter.createSafeFilename(list.getDisplayName()); StringBuilder buf = new StringBuilder(); buf.append(list.getDisplayName()); buf.append(" distribution list"); buf.append(ITEM_SEP); buf.append("Address:\t" + list.getEmailAddress() + "\n"); buf.append("Comment:\t" + list.getComment() + "\n"); buf.append("\n"); Object[] members = list.getDistributionListMembers(); for (Object member : members) { formatFields(parseValidEntries(member.toString()), buf); buf.append("\n\t-------\n"); } String savedPath = makePath(contacts, fname + ".txt"); FileUtility.writeFile(buf.toString(), savedPath); return savedPath; } protected static void formatFields(List<OutlookPSTCrawler.Field> fields, StringBuilder buf) { for (Field f : fields) { buf.append(f.label); buf.append(":\t"); buf.append(f.value); buf.append("\n"); } } protected static String formatField(Field f) { return String.format("%s:\t%s\n", f.label, f.value); } /** * Retrieve a variety of fields from a PSTObject.toString() -- most of them will be empty. * This method will try to find in the string the non-null entries. * * @param pff_string string. * @return list of field values. */ protected static List<OutlookPSTCrawler.Field> parseValidEntries(String pff_string) { if (StringUtils.isBlank(pff_string)) { return null; } List<OutlookPSTCrawler.Field> result = new ArrayList<OutlookPSTCrawler.Field>(); String[] kvPairs = pff_string.split("\n"); for (String kv : kvPairs) { String[] fieldVal = kv.split(":\\s*", 2); if (fieldVal.length == 2) { if (StringUtils.isNotBlank(fieldVal[1])) { Field f = new OutlookPSTCrawler.Field(fieldVal[0], fieldVal[1]); result.add(f); } } } return (result.isEmpty() ? null : result); } /** * Apache Commons file utils "concat(dir, file)" makes a mess of file names. * Java can support "/" equally well on all platforms. * there is no apparent need to use platform specific file separators in this context. * @param dir contianing dir * @param fname sub folder * @return full path. */ private static String makePath(File dir, String fname) { return String.format("%s/%s", dir.getAbsolutePath(), fname); } public void processMessage(String folderName, PSTMessage msg) throws PSTException, IOException, ConfigException { String dateKey = FOLDER_DATE.print(msg.getCreationTime().getTime()); log.info("\tItem: {}; created at {}", msg.getSubject(), dateKey); File dateFolder = new File(String.format("%s/%s/%s", this.outputPSTDir.getAbsolutePath(), folderName, dateKey)); if (!dateFolder.exists()) { FileUtility.makeDirectory(dateFolder); } String msgSubj = msg.getSubject(); if (StringUtils.isBlank(msg.getSubject())) { msgSubj = "NO_SUBJECT"; } // Create a folder to contain all the message content. File msgFolder = createFolder(dateFolder, msgSubj, msg.getInternetMessageId()); // Get a list of attachments. List<String> attFiles = processAttachments(msg, msgFolder); // Create the final text version of the email message. TODO: RFC822 dump would be ideal. String msgFile = saveMailMessage(msg, msgFolder, attFiles, msgSubj); /* TODO: Send msg to listener after attachments are saved, as you might want to report * attachment metadata along with msg file. */ if (listener != null) { listener.collected(new File(msgFile)); } } /** * Archive a PST Mail Message as a Text file. * * @param msg PST API message * @param msgFolder output folder * @param attFiles attachments * @param subj subject * @return final saved path * @throws IOException err saving message */ public String saveMailMessage(PSTMessage msg, File msgFolder, List<String> attFiles, String subj) throws IOException { String msgName = MessageConverter.createSafeFilename(subj); String msgText = msg.getBody(); StringBuilder buf = new StringBuilder(); // Replicating some of SMTP Header / RFC822 metadata here. // addHeaderText(buf, "From", getSender(msg)); addHeaderText(buf, "To", getRecipients(msg)); addHeaderText(buf, "Subject", subj); // msg.getSubject() addHeaderText(buf, "Date", msg.getCreationTime()); addHeaderText(buf, "MessageId", msg.getInternetMessageId()); addHeaderText(buf, "X-container-file", this.pst.getName()); //addHeaderText(buf, "X-saved-on", ); if (attFiles != null && !attFiles.isEmpty()) { addHeaderText(buf, "X-attchments", StringUtils.join(attFiles, "; ")); } buf.append("\n\n"); buf.append(msgText); String savedPath = makePath(msgFolder, msgName + ".txt"); FileUtility.writeFile(buf.toString(), savedPath); return savedPath; } private static String getSender(PSTMessage msg) { String sentBy = msg.getSenderName(); String sentByEmail = msg.getSenderEmailAddress(); if (StringUtils.isNotBlank(sentBy) && StringUtils.isNotBlank(sentByEmail)) { return String.format("%s <%s>", sentBy, sentByEmail); } if (StringUtils.isNotBlank(sentByEmail)) { return String.format("<%s>", sentByEmail); } if (StringUtils.isNotBlank(sentBy)) { return sentByEmail; } return "Unknown Sender"; } private static String getRecipients(PSTMessage msg) { return msg.getRecipientsString(); } private static void addHeaderText(StringBuilder buf, String field, Object value) { buf.append(field); buf.append(":\t"); if (value != null) { buf.append(value.toString()); } else { buf.append("(empty)"); } buf.append("\n"); } /** * length of message folder name that contains a complete email. */ public static int MESSAGE_FOLDER_LEN = 40; /** * An arbitrary method of finding a unique path for a MIME message, without opening up content on disk. * This creates the folder if needed. * * TODO: given we want the folder structure to be intuitive and readable, the file names may not reflect uniqueness * where subject lines for email may be repetitive. By contrast, message IDs are not duplicative. The length of file names * from using both message ID and subject line is an issue. This routine attempts to get a relatively unique path using both. * * @param container output container relative path * @param msgSubj message subject * @param msgId message ID * @return absolute path to file that will contain all related info for a given message. * @throws IOException err */ protected File createFolder(File container, String msgSubj, String msgId) throws IOException { if (msgId == null) { throw new IOException("RFC Error - MessageId is null."); } // Get the message String msgName = MessageConverter.createSafeFilename(msgSubj); if (msgName.length() > MESSAGE_FOLDER_LEN) { msgName = msgName.substring(0, MESSAGE_FOLDER_LEN); } // Message Ids for SMTP can be quite generic. No good way to find unique value, unless you try a hash/digest of some sort. // try { msgId = TextUtils.text_id(msgId); } catch (Exception err) { log.error("Hashing err - Message ID left as-is.", err); } // Get first 2 chars of message ID + last 2 chars. int l = msgId.length(); String uniqueness = msgId.substring(0, 2) + msgId.substring(l - 3, l - 1); File msgFolder = new File(String.format("%s/%s_%s", container, msgName, uniqueness)); if (!msgFolder.exists()) { FileUtility.makeDirectory(msgFolder); } return msgFolder; } /** * REFERENCE: libpst home page, https://code.google.com/p/java-libpst/ * @author com.pff * @author ubaldino -- cleaned up name checks. * @param msg PST API message * @param msgFolder output target * @return list of attachment filenames * @throws PSTException err * @throws IOException err * @throws ConfigException err */ public List<String> processAttachments(PSTMessage msg, File msgFolder) throws PSTException, IOException, ConfigException { int numberOfAttachments = msg.getNumberOfAttachments(); List<String> attachmentFilenames = new ArrayList<String>(); for (int x = 0; x < numberOfAttachments; x++) { PSTAttachment attach = msg.getAttachment(x); // both long and short filenames can be used for attachments String filename = attach.getLongFilename(); if (StringUtils.isBlank(filename)) { filename = attach.getFilename(); if (StringUtils.isBlank(filename)) { filename = String.format("attachment%d", x + 1); } } File attPath = new File(String.format("%s/%s", msgFolder.getAbsolutePath(), filename)); savePSTFile(attach.getFileInputStream(), attPath.getAbsolutePath()); attachmentFilenames.add(filename); if (listener != null) { listener.collected(attPath); } if (converter != null) { converter.convert(attPath); } } return attachmentFilenames; } public static final int PST_INTERNAL_BLOCK_SIZE = 8176; /** * Guidance on I/O from PFF library authors, regarding getting data from PST format. * * @param stream input * @param savePath ouput * @throws IOException err */ private static void savePSTFile(InputStream stream, String savePath) throws IOException { FileOutputStream out = new FileOutputStream(savePath); //InputStream attachmentStream = attach.getFileInputStream(); // 8176 is the block size used internally and should give the best performance byte[] buffer = new byte[PST_INTERNAL_BLOCK_SIZE]; int count = stream.read(buffer); while (count == PST_INTERNAL_BLOCK_SIZE) { out.write(buffer); count = stream.read(buffer); } byte[] endBuffer = new byte[count]; System.arraycopy(buffer, 0, endBuffer, 0, count); out.write(endBuffer); out.close(); stream.close(); } public File getOutputDir() { return outputDir; } /** * Set the parent container of PST output. * @param outputDir path to output */ public void setOutputDir(File outputDir) { this.outputDir = outputDir; } /** * set the location of the output. * @param pstDir path of PST output */ public void setOutputPSTDir(File pstDir) { this.outputPSTDir = pstDir; } /** * Map string buffer output to fielded items, by parsing line items. * */ static class Field { public String label = null; public String value = null; public Field(String l, String v) { label = l; value = v; } } }