package org.opensextant.xtext.converters; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.mail.Address; import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Multipart; import javax.mail.Part; import javax.mail.Session; import javax.mail.internet.MimeBodyPart; import javax.mail.internet.MimeMessage; //import com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility; import javax.mail.internet.MimeUtility; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.opensextant.util.FileUtility; import org.opensextant.util.TextUtils; import org.opensextant.xtext.Content; import org.opensextant.xtext.ConvertedDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /* * This Mail Message parser/converter should do its work on *.msg or *.eml files saved to disk as standard RFC822 * documents. A single message doc may have attachments, nested emails, etc. The input here is a single message file * * The organization of such files is determined by the caller app. If content is retrieved from an email account, * it could be organized to reflect the account's email folders or not. One thing is certain: document count multiplies * when we try to convert multimedia message into its individual artifacts. * * File.msg * +attached.doc * +imagery.jpg * * This one file (with two attachments) then becomes in "XText" speak: * * xtext/File.msg.txt text of message, here File.msg.txt backs the original based on file name alone. It contains the parent metadata of the mail msg. * File/attached.doc attachment one * File/imagery.jpg attachment two * * File/xtext/attached.doc.txt text of one, * File/xtext/imagery.jpg.txt text of two. * * ... 1 file becomes 5 additional files, in this example. * * Ho hum... * https://issues.apache.org/jira/browse/TIKA-1222 -- Attachments are not parsed. */ //import org.apache.tika.parser.mail.RFC822Parser; public class MessageConverter extends ConverterAdapter { protected Logger logger = LoggerFactory.getLogger(getClass()); private final Session noSession = Session.getDefaultInstance(new Properties()); private int attachmentNumber = 0; private final List<String> textEncodings = new LinkedList<String>(); /** * @param in stream * @param doc original file * @throws IOException on err */ @Override protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException { attachmentNumber = 0; textEncodings.clear(); try { // Connect to the message file // MimeMessage msg = new MimeMessage(noSession, in); return convertMimeMessage(msg, doc); } catch (Exception xerr) { throw new IOException("Unable to parse content", xerr); } finally { in.close(); } } /** * Convert the MIME Message with or without the File doc. * -- live email capture from a mailbox: you have the MimeMessage; there is no File object * -- email capture from a filesystem: you retrieved the MimeMessage from a File object * * @param msg javamail Message obj * @param doc converted doc for given message * @return doc conversion, likely a parent document with 1 or more child attachments * @throws MessagingException on err * @throws IOException on err */ public ConvertedDocument convertMimeMessage(Message msg, File doc) throws MessagingException, IOException { ConvertedDocument parentMsgDoc = new ConvertedDocument(doc); parentMsgDoc.is_RFC822_attachment = true; //parentMsgDoc.setEncoding(parseCharset(msg.getContentType())); setMailAttributes(parentMsgDoc, msg); StringBuilder rawText = new StringBuilder(); // Since content is taken from file system, use file name String messageFilePrefix = (doc != null ? FilenameUtils.getBaseName(doc.getName()) : parentMsgDoc.id); // Find all attachments and plain text. parseMessage(msg, parentMsgDoc, rawText, messageFilePrefix); parentMsgDoc.setText(rawText.toString()); return parentMsgDoc; } /** * Copy innate Message metadata into the ConvertedDocument properties to save that metadata in the normal place. * This metadata will also be replicated down through children items to reflect the fact the attachment was sent via this message. * * @param msgdoc doc conversion * @param message original mail message * @throws MessagingException on err */ private void setMailAttributes(ConvertedDocument msgdoc, Message message) throws MessagingException { String msg_id = getMessageID(message); if (msg_id == null) { return; } msgdoc.id = getShorterMessageID(msg_id); String mailSubj = message.getSubject(); msgdoc.addTitle(mailSubj); Address[] sender = message.getFrom(); String sender0 = null; if (sender != null && sender.length > 0) { sender0 = sender[0].toString(); msgdoc.addAuthor(sender0); } Date d = message.getSentDate(); String dt = (d != null ? d.toString() : ""); msgdoc.addCreateDate(d != null ? d : msgdoc.filetime); msgdoc.addUserProperty(MAIL_KEY_PREFIX + "msgid", msg_id); msgdoc.addUserProperty(MAIL_KEY_PREFIX + "sender", sender0); msgdoc.addUserProperty(MAIL_KEY_PREFIX + "date", dt); msgdoc.addUserProperty(MAIL_KEY_PREFIX + "subject", mailSubj); } /** * Retrieve the Identifier part of a message, that is <id@server> we want the "id" part. * * @param message mail message * @return ID for message * @throws MessagingException on err */ public static String getMessageID(Message message) throws MessagingException { String[] msgIds = message.getHeader("Message-Id"); if (msgIds == null || msgIds.length == 0) { //logger.error("No Message ID!"); return null; } String msgId = null; String msgLocalId = null; //String msgIdFilename = null; msgId = extractAngleValue(msgIds[0]); String[] msgid_parts = msgId.split("@"); msgLocalId = msgId; if (msgid_parts.length > 1) { msgLocalId = msgid_parts[0]; } return msgLocalId; } /** * Given a global msg ID, create an ID that should be relatively unique. * * @param globalId the full SMTP/MIME message ID * @return a shorter version of the ID cleaned of special chars */ public static String getShorterMessageID(String globalId) { String msgId = extractAngleValue(globalId); String[] msgid_parts = msgId.split("@"); String shorter = msgId; if (msgid_parts.length > 1) { shorter = msgid_parts[0]; } // Clean up MSG ID // The same ID that is used to archive will be used to record in DB. // shorter = TextUtils.replaceAny(shorter, "#$.%~", "_"); return shorter; } public static String MAIL_KEY_PREFIX = "mail:"; /** * Whacky... each child attachment will have some knowledge about the containing mail messsage which carried it. * * @param parent parent doc * @param child raw content */ private void copyMailAttrs(ConvertedDocument parent, Content child) { if (child.encoding != null) { child.meta.setProperty("encoding", child.encoding); } for (String key : parent.getProperties().keySet()) { if (key.startsWith(MAIL_KEY_PREFIX)) { String val = parent.getProperty(key); if (val != null) { child.meta.setProperty(key, val); } } } } /** * This is a recursive parser that pulls off attachments into Child content or saves plain text as main message text. * Calendar invites are ignored. * * @param bodyPart individual sub-part to append to buffer * @param parent parent doc * @param buf text to append * @param msgPrefixId msgId prefix * @throws IOException on error */ public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder buf, String msgPrefixId) throws IOException { InputStream partIO = null; ++attachmentNumber; try { PartMetadata meta = new PartMetadata(bodyPart); //String charset = (meta.charset == null ? "UTF-8" : meta.charset); textEncodings.add(meta.charset); String filename = bodyPart.getFileName(); String fileext = meta.getPossibleFileExtension(); if (filename != null) { fileext = FilenameUtils.getExtension(filename); logger.debug("original filename: " + filename); } boolean hasExtension = StringUtils.isNotBlank(fileext); if (!hasExtension) { logger.debug("Unknown message part"); fileext = "dat"; } if (filename == null && attachmentNumber > 1) { filename = String.format("%s-Att%d.%s", msgPrefixId, attachmentNumber, fileext); } logger.debug("Charset for part is {}", meta.charset); /* * Using isMimeType to determine the content type avoids fetching * the actual content data until we need it. */ // IGNORE types: calendar. if (meta.isCalendar()) { logger.debug("{}# Ignore item", msgPrefixId); return; } if (meta.isHTML()) { // logger.debug("{}# Save HTML part as its own file", msgPrefixId); } else if (bodyPart.isMimeType("multipart/*")) { Multipart mp = (Multipart) bodyPart.getContent(); int count = mp.getCount(); for (int i = 0; i < count; i++) { // This step does not actually save any content, it calls // itself to continue to break down the parts into the // finest grained elements, at which point parseMessage(mp.getBodyPart(i), parent, buf, msgPrefixId); } // Exit point return; } else if (bodyPart.isMimeType("message/rfc822")) { /* normal mail message body */ parseMessage((Part) bodyPart.getContent(), parent, buf, msgPrefixId); // Exit point return; } else { Object part = bodyPart.getContent(); boolean isTextPlain = bodyPart.isMimeType("text/plain"); if (part instanceof String) { /* We will take the first charset encoding found for the body text of hte message. * If there are HTML views of the data, those individual documents will be child documents with their own encodings. */ if (meta.charset != null && parent.getEncoding() == null) { parent.setEncoding(meta.charset); } String text = (String) part; if (!isTextPlain) { // Decode TEXT from MIME base64 or QP encoded data. // TODO: Is this necessary? The mime libraries seem to handle base64 unencoding automatically // (at least for text/plain attachments). -jgibson logger.debug("{}# Save String MIME part", msgPrefixId); if (meta.isQP() || meta.isBase64()) { try { partIO = IOUtils.toInputStream(text); byte[] textBytes = decodeMIMEText(partIO, meta.transferEncoding); if (meta.charset != null) { text = new String(textBytes, meta.charset); } else { text = new String(textBytes); } } catch (Exception decodeErr) { logger.error("Decoding error with bare text in body of message"); } } else { logger.debug("Other encoding is unaccounted: {}", meta.transferEncoding); } } if (meta.isAttachment()) { Content child = createBaseChildContent(filename, meta); if (child.encoding == null) { child.encoding = "UTF-8"; } child.content = text.getBytes(child.encoding); copyMailAttrs(parent, child); parent.addRawChild(child); } else { // Note, before trying any of these decoding trick buf.append(TextUtils.delete_controls(text)); buf.append("\n*******************\n"); // Note, the "=XX" sequence is reserved for RFC822 encoding of special chars and non-ASCII. // So I avoid using "=====".... as a separator. } // Exit point return; } else if (part instanceof InputStream) { // Retrieve byte stream. partIO = (InputStream) part; Content child = createChildContent(filename, partIO, meta); copyMailAttrs(parent, child); parent.addRawChild(child); // Exit point. return; } else { /* MCU: identify unknown MIME parts */ logger.debug("Skipping this an unknown bodyPart type: " + part.getClass().getName()); //return; } } if (bodyPart instanceof MimeBodyPart && !bodyPart.isMimeType("multipart/*")) { logger.debug("{}# Saving {} ", msgPrefixId, filename); if (meta.disposition == null || meta.isAttachment) { partIO = ((MimeBodyPart) bodyPart).getRawInputStream(); Content child = createChildContent(filename, partIO, meta); copyMailAttrs(parent, child); if (meta.isHTML() && (meta.isInline() || (!meta.isAttachment()))) { child.meta.setProperty(MAIL_KEY_PREFIX + "html-body", "true"); } parent.addRawChild(child); return; } } } catch (MessagingException e2) { logger.error("Extraction Failed on Messaging Exception", e2); } finally { if (partIO != null) { partIO.close(); } } } /** * Abstract the encoding issue. * @param stm raw stream * @param enc a transfer encoding named in the multipart header, see MimeUtility.decode() for more detail * @return byte data for the stream. Caller still has to decode to proper charset. * @throws Exception on error */ private static byte[] decodeMIMEText(InputStream stm, String enc) throws Exception { InputStream decodedContent = null; try { decodedContent = MimeUtility.decode(stm, enc); return IOUtils.toByteArray(decodedContent); } finally { if (decodedContent != null) { decodedContent.close(); } } } /** * More conveniently create Child item. This will attempt to decode the multipart encoding, mainly "quoted-printable" data should be decoded prior to saving. * Lastly, the content bytes are always left as their native charset encoding. * Versus, text strings, which will be automatically in parseMessage() and saved as UTF-8 * * @param file_id file ID * @param input stream * @return content raw child object * @throws IOException on err */ private Content createChildContent(String file_id, InputStream input, PartMetadata meta) throws IOException { Content child = createBaseChildContent(file_id, meta); // Plain text is likely handled up above as (String)part are encountered in-line. // Here HTML attachments need to be decoded. if (meta.isHTML() && (meta.isQP() || meta.isBase64())) { try { child.content = decodeMIMEText(input, meta.transferEncoding); } catch (Exception decoderErr) { logger.error("MIME Decoding failed with parameters: {}", meta.mimeType); } } else { logger.debug("Other encoding is unaccounted: {}", meta.transferEncoding); } // Default or last resort. if (child.content == null) { child.content = IOUtils.toByteArray(input); } return child; } /** * Create a Child item with all of the metadata populated correctly. * * @param file_id file ID, if Tika found one, or a custom one. * @param meta metadata pulled from the MIME part * @return content abstraction for the child */ private Content createBaseChildContent(String file_id, PartMetadata meta) { Content child = new Content(); child.id = file_id; child.encoding = meta.charset; child.meta.setProperty(ConvertedDocument.CHILD_ENTRY_KEY, file_id); child.meta.setProperty(MAIL_KEY_PREFIX + "disposition", (meta.disposition == null ? "none" : meta.disposition)); if (meta.contentId != null) { child.meta.setProperty(MAIL_KEY_PREFIX + "content-id", meta.contentId); } child.mimeType = meta.mimeType; return child; } /** Parse out charset encoding spec from MIME content-type header */ private final static Pattern CHARSET_EXTRACTOR = Pattern.compile("charset=['\"]?([-_\\w]+)['\"]?"); /** * Help determine charset, object type, filename if any, and file extension * Mainly to guide how to parse, filter and employ the text content of this Part. * * @author ubaldino * */ class PartMetadata { public String mimeType = null; public String charset = null; public String transferEncoding = null; public String disposition = null; public String contentId = null; private boolean istext = false; private boolean ishtml = false; private boolean iscal = false; private boolean isImage = false; private boolean isAttachment = false; private boolean isInline = false; public String desc = "data"; @Override public String toString() { return mimeType + " charset=" + charset + " desc=" + desc; } public PartMetadata(Part bodyPart) throws MessagingException { mimeType = bodyPart.getContentType(); if (bodyPart.isMimeType("text/plain")) { istext = true; desc = "text"; } else if (bodyPart.isMimeType("text/html")) { ishtml = true; desc = "HTML"; } else if (bodyPart.isMimeType("text/calendar")) { iscal = true; desc = "Calendar-Invite"; } String filename = bodyPart.getFileName(); if (filename != null) { String ext = FilenameUtils.getExtension(filename); iscal = (iscal || (ext.equalsIgnoreCase("ics") || ext.equalsIgnoreCase("ical"))); isImage = (FileUtility.getFileDescription(filename) == FileUtility.IMAGE_MIMETYPE); desc = "Image"; } if (istext || ishtml) { String header = bodyPart.getContentType(); charset = parseCharset(header); if (charset == null) { String[] x = bodyPart.getHeader("Content-Type"); if (x.length > 0) { charset = parseCharset(x[0]); } } String[] headers = bodyPart.getHeader("Content-Transfer-Encoding"); if (headers != null && headers.length > 0) { transferEncoding = headers[0]; } } disposition = bodyPart.getDisposition(); if (Part.ATTACHMENT.equals(disposition)) { isAttachment = true; } else if (Part.INLINE.equals(disposition)) { isInline = true; } String[] contentIds = bodyPart.getHeader("Content-Id"); if (contentIds != null && contentIds.length > 0 && (!StringUtils.isBlank(contentIds[0]))) { contentId = extractAngleValue(contentIds[0]); } } /** is QP encoding * @return true if is quoted-printable */ public boolean isQP() { if (transferEncoding == null) { return false; } return "quoted-printable".equalsIgnoreCase(transferEncoding); } /** * @return true if is Base64 encoded */ public boolean isBase64() { if (transferEncoding == null) { return false; } return "base64".equalsIgnoreCase(transferEncoding); } public boolean isImage() { return isImage; } public boolean isCalendar() { return iscal; } public boolean isHTML() { return ishtml; } public boolean isText() { return istext; } public boolean isAttachment() { return isAttachment; } public boolean isInline() { return isInline; } /** * @return "html", if item is HTML, "txt" if item is plain text */ public String getPossibleFileExtension() { if (isHTML()) { return "html"; } if (isText()) { return "txt"; } return null; } } /** * * @param mimespec encoding spec * @return charset name */ public static String parseCharset(String mimespec) { //String cs = MimeUtility.javaCharset(mimespec); //if (cs != null) { return cs;} // Ah, thanks for nothing, JavaMail. MIME content-type given is "a/b; charset='c'" the response should be "c". // JavaMail cannot pull out the char set from content-type header. Matcher m = CHARSET_EXTRACTOR.matcher(mimespec); if (m.find()) { return m.group(1); } return null; } /** * Get File Extension for known types. Otherwise MIME part should provide a * file name for such things. TODO: possibly switch to MIME4J and Apache * James * * @param mimetype just the mime type, w/out charset * @return file extension to map to given MIME */ public static String getFileExtension(String mimetype) { if ("text/plain".equalsIgnoreCase(mimetype)) { return "txt"; } if ("text/html".equalsIgnoreCase(mimetype)) { return "html"; } return null; } /** * Create a safe filename from arbitrary text. That is no special shell * operators $, #, ?, >, <, *, ' ', etc. * * @param text text of a filename * @return file name constructed from input text and underscores in place of * special chars. */ public static String createSafeFilename(String text) { String tmp = TextUtils.squeeze_whitespace(text).replaceAll( "[\"'&;.“”)(%$?:<>\\*#~!@\\\\/ ]", "_"); // Trim trailing "__" from resulting file name. for (int x = tmp.length() - 1; x > 0; --x) { char ch = tmp.charAt(x); if (ch != '_') { tmp = tmp.substring(0, x + 1); break; } } return tmp; } private final static Pattern ANGLE_EXTRACTOR = Pattern.compile("<(.+)>"); /** * Parse 'value' from '<value>' Used for pulling emailaddress or msgId value from headers. * * @param value any text * @return value stripped of <, gt&; */ private static String extractAngleValue(String value) { Matcher regex = ANGLE_EXTRACTOR.matcher(value); if (regex.matches()) { String msgId = regex.group(1); return msgId; } return value; } }