/*
 * Copyright (c) 2011-2012 Lockheed Martin Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.eurekastreams.server.service.email;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.mail.BodyPart;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Part;
import javax.mail.internet.ContentType;

import org.eurekastreams.commons.logging.LogFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts the user-provided content from an email. This implementation is somewhat simplistic. It walks the message
 * depth-first and uses the first plain text part that carries no Content-Disposition header and still contains text
 * after trimming. Parts with any disposition (attachment or inline) are skipped. It returns the text as a simple
 * string (no support for attaching links to activity posts). It removes any forwarded or replied-to content, and does
 * so assuming that the user's entry is above the prior message: everything from the earliest reply marker onward is
 * discarded.
 */
public class MessageContentExtractor
{
    /** Log. */
    private final Logger log = LoggerFactory.getLogger(LogFactory.getClassName());

    /** List of literal strings that mark the beginning of a reply message (matched case-sensitively). */
    private final Collection<String> replyMarkersLiteral;

    /** List of regex patterns that mark the beginning of a reply message (compiled case-insensitive). */
    private final Collection<Pattern> replyMarkersRegex;

    /**
     * Constructor.
     *
     * @param inReplyMarkersLiteral
     *            List of literal strings that mark the beginning of a reply message. May be null, which is treated as
     *            an empty list. The collection is used as passed in (it is not copied).
     * @param inReplyMarkersRegex
     *            List of regex patterns that mark the beginning of a reply message. May be null, which is treated as
     *            an empty list. Each entry is compiled once here with {@link Pattern#CASE_INSENSITIVE}.
     */
    public MessageContentExtractor(final Collection<String> inReplyMarkersLiteral,
            final Collection<String> inReplyMarkersRegex)
    {
        // typed empty list instead of the raw Collections.EMPTY_LIST to avoid an unchecked assignment
        replyMarkersLiteral = inReplyMarkersLiteral == null ? Collections.<String> emptyList()
                : inReplyMarkersLiteral;

        replyMarkersRegex = new ArrayList<Pattern>();
        if (inReplyMarkersRegex != null)
        {
            for (String regex : inReplyMarkersRegex)
            {
                replyMarkersRegex.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
            }
        }
    }

    /**
     * Extracts the user-provided content from an email.
     *
     * @param message
     *            The email message.
     * @return The content text, or null if no usable plain text part was found.
     * @throws MessagingException
     *             On error.
     * @throws IOException
     *             On error.
     */
    public String extract(final Message message) throws MessagingException, IOException
    {
        return findAndExtract(message);
    }

    /**
     * Recursive method to find content in a message part. A plain text part without a disposition is tried first; if
     * the part is a multipart container its children are searched in order and the first hit wins.
     *
     * @param part
     *            Part to check.
     * @return The content text if found in the part or a subpart, else null.
     * @throws MessagingException
     *             On error.
     * @throws IOException
     *             On error.
     */
    private String findAndExtract(final Part part) throws MessagingException, IOException
    {
        // Part.getContentType() is documented to return null when the type cannot be determined
        String rawContentType = part.getContentType();
        if (rawContentType == null)
        {
            log.debug("Skipping message part whose content type could not be determined.");
            return null;
        }
        ContentType contentType = new ContentType(rawContentType);

        // check if usable plain text content; MIME type names are case-insensitive and ContentType does not
        // normalize them, so compare ignoring case
        if (part.getDisposition() == null && "text/plain".equalsIgnoreCase(contentType.getBaseType()))
        {
            String text = extractUserText(part);
            if (text != null)
            {
                log.debug("Extracted plain text content (length {}).", text.length());
                return text;
            }
            log.debug("Found plain text part with no suitable content (null/empty/blank or entirely a forward).");
        }

        // recurse if multipart content
        if ("multipart".equalsIgnoreCase(contentType.getPrimaryType()))
        {
            Object content = part.getContent();
            if (content instanceof Multipart)
            {
                Multipart mp = (Multipart) content;
                int count = mp.getCount();
                for (int i = 0; i < count; i++)
                {
                    BodyPart childPart = mp.getBodyPart(i);
                    String text = findAndExtract(childPart);
                    if (text != null)
                    {
                        return text;
                    }
                }
            }
        }

        return null;
    }

    /**
     * Extracts the content text from the current part, removing any forwarded or replied-to message text and unwanted
     * whitespace.
     *
     * @param part
     *            Part containing content.
     * @return Content text, or null if the part has no String content or nothing but whitespace remains once the
     *         replied-to/forwarded text is cut off.
     * @throws IOException
     *             On error.
     * @throws MessagingException
     *             On error.
     */
    private String extractUserText(final Part part) throws IOException, MessagingException
    {
        Object rawContent = part.getContent();
        if (rawContent == null)
        {
            log.warn("Text part of message had unexpected null content.");
            return null;
        }
        // the content object type depends on the installed data handlers, so verify instead of casting blindly
        if (!(rawContent instanceof String))
        {
            log.warn("Text part of message had unexpected non-string content (type {}).", rawContent.getClass()
                    .getName());
            return null;
        }
        String content = (String) rawContent;

        // look for the beginning of a forwarded or replied message
        int end = findReplyStart(content);

        // remove trailing newlines and whitespace
        while (end > 0 && Character.isWhitespace(content.charAt(end - 1)))
        {
            end--;
        }

        // remove leading newlines and whitespace
        int start = 0;
        while (start < end && Character.isWhitespace(content.charAt(start)))
        {
            start++;
        }

        return start < end ? content.substring(start, end) : null;
    }

    /**
     * Finds where the replied-to or forwarded message begins.
     *
     * @param content
     *            Full text of the part.
     * @return Index of the earliest match of any literal or regex reply marker, or the content length if no marker
     *         matches.
     */
    private int findReplyStart(final String content)
    {
        int end = content.length();

        for (String marker : replyMarkersLiteral)
        {
            int pos = content.indexOf(marker);
            if (pos >= 0 && pos < end)
            {
                end = pos;
            }
        }

        for (Pattern regex : replyMarkersRegex)
        {
            Matcher matcher = regex.matcher(content);
            // find() yields the first match of this pattern, which is the only one that can lower the cut-off
            if (matcher.find() && matcher.start() < end)
            {
                end = matcher.start();
            }
        }

        return end;
    }
}