/* * JBoss, Home of Professional Open Source * Copyright 2013 Red Hat Inc. and/or its affiliates and other contributors * as indicated by the @authors tag. All rights reserved. */ package org.searchisko.mbox.parser; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; import com.sun.xml.messaging.saaj.packaging.mime.MessagingException; import com.sun.xml.messaging.saaj.packaging.mime.internet.MimeUtility; import org.apache.commons.io.IOUtils; import org.apache.james.mime4j.dom.*; import org.apache.james.mime4j.message.BodyPart; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.searchisko.mbox.dto.MailAttachment; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.util.ArrayList; import java.util.List; /** * Represents the content of parsed mail body (no headers). * * @author Lukáš Vlček (lvlcek@redhat.com) */ public class MessageBodyParser { private static Logger log = LoggerFactory.getLogger(MessageBodyParser.class); /** * Supported message body subtypes. */ public enum SupportedMultiPartType { ALTERNATIVE("alternative"), MIXED("mixed"), RELATED("related"), SIGNED("signed"), /*,TODO: APPLEDOUBLE("appledouble")*/ UNKNOWN(""); private final String value; private SupportedMultiPartType(String value) { this.value = value; } @Override public String toString() { return this.value; } public static SupportedMultiPartType getValue(String value) { try { return valueOf(value.replaceAll("-", "_").toUpperCase()); } catch (Exception e) { return UNKNOWN; } } } /** * Represents parsed message body content. */ public static class MailBodyContent { private String messageId; private String firstTextContent; private String firstTextContentWithoutQuotes; private String firstHtmlContent; private List<String> textMessages = new ArrayList<>(); private List<String> htmlMessages = new ArrayList<>(); private List<MailAttachment> attachments = new ArrayList<>(); public void setMessageId(String id) { this.messageId = id; } public String getMessageId() { return this.messageId; } public void setFirstTextContent(String content) { this.firstTextContent = content; } public String getFirstTextContent() { return this.firstTextContent; } public void setFirstTextContentWithoutQuotes(String content) { this.firstTextContentWithoutQuotes = content; } public String getFirstTextContentWithoutQuotes() { return this.firstTextContentWithoutQuotes; } public void setFirstHtmlContent(String content) { this.firstHtmlContent = content; } public String getFirstHtmlContent() { return this.firstHtmlContent; } public void setTextMessages(List<String> textMessages) { this.textMessages = textMessages; } public List<String> getTextMessages() { return this.textMessages; } public void setHtmlMessages(List<String> htmlMessages) { this.htmlMessages = htmlMessages; } public List<String> getHtmlMessages() { return this.htmlMessages; } public List<MailAttachment> getAttachments() { return this.attachments; } } private static Tika tika; /** * Lazy initialize Tika instance. * @return Tika instance */ public static Tika getTika() { Tika t = tika; if (t == null) { synchronized (MessageParser.class) { t = tika; if (t == null) { tika = new Tika(); t = tika; } } } return t; } /** * * @param message * @return */ public static MailBodyContent parse(Entity message) throws MessageParseException, IOException { MailBodyContent content = new MailBodyContent(); return parse(content, message); } private static MailBodyContent parse(MailBodyContent content, Entity message) throws MessageParseException, IOException { Body body = message.getBody(); String mimeType = message.getMimeType().toLowerCase(); String contentTransferEncoding = message.getContentTransferEncoding(); String charset = message.getCharset(); String filename = message.getFilename(); if ("x-gbk".equalsIgnoreCase(charset)) { // hardcoded fix for java.io.UnsupportedEncodingException: x-gbk log.warn("Unsupported encoding found: 'x-gbk', using 'gbk' instead."); charset = "gbk"; } if (log.isTraceEnabled()) { log.trace("parsing Entity, mimeType: '{}', filename: '{}'", new Object[]{mimeType, filename}); log.trace("contentTransferEncoding: '{}'", contentTransferEncoding); log.trace("charset: '{}'", charset); } if (body instanceof Multipart) { parseMultipartBody(content, (Multipart)body); } else if (body instanceof TextBody) { parseTextBody(content, (TextBody)body, mimeType, contentTransferEncoding, charset, filename); } else if (body instanceof BinaryBody) { parseBinaryBody(content, (BinaryBody)body, mimeType, contentTransferEncoding, charset, filename); } else if (body instanceof Message) { parseMessage(content, (Message)body); } else { throw new MessageParseException("Message body of type [" + body.getClass().getSimpleName() + "] is not supported."); } return content; } private static MailBodyContent parseMultipartBody(MailBodyContent content, Multipart body) throws MessageParseException, IOException { String subType = body.getSubType().toLowerCase(); switch(SupportedMultiPartType.getValue(subType)) { case UNKNOWN: throw new MessageParseException(subType + " is unsupported body multipart subtype."); case ALTERNATIVE: BodyPart thePart = null; for (Entity part : body.getBodyParts()) { if (part.getMimeType().toLowerCase().equals("text/plain")) { thePart = (BodyPart)part; } } if (thePart == null) { for (Entity part : body.getBodyParts()) { if (part.getMimeType().toLowerCase().equals("text/html")) { thePart = (BodyPart)part; } } } if (thePart != null) return parseTextBody(content, (TextBody) thePart.getBody(), thePart.getMimeType(), thePart.getContentTransferEncoding(), thePart.getCharset(), thePart.getFilename()); else { for (Entity part : body.getBodyParts()) { if (part.getBody() instanceof Entity) { parse(content, (Entity)part.getBody()); } else if (part.getBody() instanceof Multipart) { parseMultipartBody(content, (Multipart)part.getBody()); } else { log.warn("Body of type [{}] not supported! Ignoring.", part.getBody().getClass().getCanonicalName()); } } } break; default: for (Entity part : body.getBodyParts()) { parse(content, part); } break; } return content; } private static MailBodyContent parseMessage(MailBodyContent content, Message message) throws MessageParseException, IOException { String mimeType = message.getMimeType().toLowerCase(); String contentTransferEncoding = message.getContentTransferEncoding(); String charset = message.getCharset(); String filename = message.getFilename(); Body body = message.getBody(); if (body instanceof Multipart) { parseMultipartBody(content, (Multipart)body); } else if (body instanceof TextBody) { parseTextBody(content, (TextBody)body, mimeType, contentTransferEncoding, charset, filename); } else if (body instanceof BinaryBody) { parseBinaryBody(content, (BinaryBody)body, mimeType, contentTransferEncoding, charset, filename); } else if (body instanceof Message) { parseMessage(content, (Message)body); } else { throw new MessageParseException("Body of type [" + body.getClass().getSimpleName() + "] is not supported."); } return content; } private static MailBodyContent parseTextBody(MailBodyContent bodyContent, TextBody body, String mimeType, String contentTransferEncoding, String charset, String filename) throws IOException { if (log.isTraceEnabled()) { log.trace("parsing text body, mimeType: '{}', contentTransferEncoding: '{}', charset: '{}', filename: '{}'", new Object[]{mimeType, contentTransferEncoding, charset, filename}); } if (filename != null) { addAttachment(bodyContent, body, mimeType, filename); } else { String content = null; InputStream output = null; if (contentTransferEncoding != null && contentTransferEncoding.length() > 0) { if (log.isTraceEnabled()) { log.trace("decoding: '{}'", contentTransferEncoding); log.trace("charset: '{}'", charset); } try { // com.sun.xml.messaging.saaj.packaging.mime.util.BASE64DecoderStream.decode() seems to be buggy if ("base64".equalsIgnoreCase(contentTransferEncoding)) { output = body.getInputStream(); } else { output = MimeUtility.decode(body.getInputStream(), contentTransferEncoding.toLowerCase()); } // check for 'ISO-8859'* and aliases if (charset.toUpperCase().startsWith("ISO-8859") || charset.toUpperCase().startsWith("ISO8859")) { CharsetMatch detectedCharset = detectCharset(output); if (detectedCharset != null) { int conf = detectedCharset.getConfidence(); if (conf >= 80) { log.trace("Heuristics: overriding charset from '{}' to '{}' with confidence: {}", new Object[]{detectedCharset.getName(), charset, conf}); charset = detectedCharset.getName(); } } } StringWriter writer = new StringWriter(); IOUtils.copy(output, writer, charset); content = writer.toString(); } catch (MessagingException e) { log.trace("Error decoding transfer coding.", e); content = getTextBodyContent(body.getReader()).replaceAll("=\n",""); } } else { content = getTextBodyContent(body.getReader()).replaceAll("=\n", ""); } if (mimeType.equals("text/plain")) { content = content // .replaceAll(">",">") // .replaceAll("<", "<") .replaceAll("^>From","From"); if (bodyContent.getFirstTextContent() == null && bodyContent.getFirstHtmlContent() == null) { bodyContent.setFirstTextContentWithoutQuotes(filterOutQuotedContent(content)); // if (bodyContent.getFirstTextContentWithoutQuotes().length() > 0) { // bodyContent.setFirstTextContentWithoutQuotes(bodyContent.getFirstTextContentWithoutQuotes().replaceAll(">",">")); // } bodyContent.setFirstTextContent(content/*.replaceAll(">",">")*/); } else { bodyContent.getTextMessages().add(content/*.replaceAll(">",">")*/); } } else if (mimeType.equals("text/html")) { // TODO clean possible html tags? if (bodyContent.getFirstTextContent() == null && bodyContent.getFirstHtmlContent() == null) { bodyContent.setFirstHtmlContent(content); } else { bodyContent.getHtmlMessages().add(content); } } else { // TODO just in case we are missing something (?) // text/richtext, text/xml, text/x-vhdl, text/x-vcard, text/x-patch, text/x-log, text/css, text/java, text/rtf, text/x-diff, text/x-java bodyContent.getTextMessages().add(content); } } return bodyContent; } private static CharsetMatch detectCharset(InputStream inputStream) throws IOException { CharsetDetector cd = new CharsetDetector(); // CharDetector requires support of mark/reset inputStream = inputStream.markSupported() ? inputStream : new BufferedInputStream(inputStream); cd.setText(inputStream); cd.enableInputFilter(true); return cd.detect(); } private static MailBodyContent parseBinaryBody(MailBodyContent content, BinaryBody body, String mimeType, String contentTransferEncoding, String charset, String filename) throws IOException { log.trace("parsing binary body, mimeType: '{}', contentTransferEncoding: '{}', charset: '{}', filename: '{}'", new Object[]{mimeType, contentTransferEncoding, charset, filename}); if (mimeType != null && !mimeType.equals("application/pgp-signature") && !mimeType.equals("application/ms-tnef") && !mimeType.startsWith("image/")) { // (!mimeType.startsWith("image/") || mimeType.equalsIgnoreCase("image/svg+xml"))) { if (filename != null) { addAttachment(content, body, mimeType, filename); // } else // if (mimeType.equalsIgnoreCase("application/pdf") /*|| mimeType.equalsIgnoreCase("image/svg+xml")*/) { // fix for malformed filename > see "filename*0=" in msgs // image/svg+xml commented out because binary body content needs to be first decoded using transfer encoding. // addAttachment(body, mimeType, null); // log.info("adding binary body, mimeType: {}, contentTransferEncoding: {}, charset: {}, filename: {}", new Object[]{mimeType, contentTransferEncoding, charset, filename}); } else { log.trace("Ignoring binary mimeType: '{}', contentTransferEncoding: '{}', charset: '{}', filename: '{}'", new Object[]{mimeType, contentTransferEncoding, charset, filename}); } } else { log.trace("Ignoring binary mimeType: '{}', contentTransferEncoding: '{}', charset: '{}', filename: '{}'", new Object[]{mimeType, contentTransferEncoding, charset, filename}); } return null; } /** * Silently fail if Tika fails parsing the content. * @param bodyContent * @param content * @param mimeType * @param filename * @throws IOException */ private static void addAttachment(MailBodyContent bodyContent, SingleBody content, String mimeType, String filename) throws IOException { log.trace("processing attachment: Mime-Type='{}', filename='{}'", new Object[]{mimeType, filename}); MailAttachment attachment = new MailAttachment(); attachment.setContentType(mimeType); attachment.setFileName(filename); if (content instanceof BinaryBody) { Metadata metadata = new Metadata(); // TODO: add length limit String fileContent = null; try { fileContent = removeWhiteSpaces(getTika().parseToString(content.getInputStream(), metadata, 100000)); attachment.setContent(fileContent); bodyContent.getAttachments().add(attachment); } catch (TikaException e) { log.warn("ignoring attachment: parsing error", e); } } else if (content instanceof TextBody) { Metadata metadata = new Metadata(); // TODO: add length limit String fileContent = null; try { fileContent = removeWhiteSpaces(getTika().parseToString(content.getInputStream(), metadata, 100000)); attachment.setContent(fileContent); bodyContent.getAttachments().add(attachment); } catch (TikaException e) { log.warn("ignoring attachment: parsing error", e); } } else { log.warn("ignoring attachment: unsupported attachment type: '{}'", content.getClass().getCanonicalName()); } } private static String removeWhiteSpaces(String input) { return input.replaceAll("\r\n"," ").replaceAll("\n"," ").replaceAll("\\s+"," ").trim(); } private static String filterOutQuotedContent(String content) { StringBuilder noQuotes = new StringBuilder(); for (String line : content.split("\n")) { line = line.trim(); if (line.length() > 0 && !line.startsWith(">")) { noQuotes.append(line).append(" "); } } String result = noQuotes.toString().trim(); return result; } private static String getTextBodyContent(Reader reader) throws IOException { StringBuilder sb = new StringBuilder(); int c; while ((c = reader.read()) != -1) { sb.append((char) c); } reader.close(); return sb.toString(); } }