/*
 * JBoss, Home of Professional Open Source
 * Copyright 2013 Red Hat Inc. and/or its affiliates and other contributors
 * as indicated by the @authors tag. All rights reserved.
 */
package org.searchisko.mbox.parser;

import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.FieldParser;
import org.apache.james.mime4j.dom.Message;
import org.apache.james.mime4j.dom.MessageBuilder;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.field.*;
import org.apache.james.mime4j.field.AddressListFieldImpl;
import org.apache.james.mime4j.field.DateTimeFieldLenientImpl;
import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.field.MailboxListFieldImpl;
import org.apache.james.mime4j.message.DefaultMessageBuilder;
import org.apache.james.mime4j.stream.Field;
import org.apache.james.mime4j.stream.MimeConfig;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.searchisko.mbox.dto.Mail;
import org.searchisko.mbox.dto.MailAttachment;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;

/**
 * Provides various static methods for parsing Messages.
 *
 * @author Lukáš Vlček (lvlcek@redhat.com)
 */
public class MessageParser {

    private static final Logger log = LoggerFactory.getLogger(MessageParser.class);

    /** Maximum number of characters taken from the message body to build the snippet. */
    private static final int SNIPPET_MAX_LENGTH = 250;

    /**
     * We are interested in parsing only the following message header fields.
     */
    public enum MessageHeader {
        FROM("from"),
        TO("to"),
        SUBJECT("subject"),
        DATE("date"),
        MESSAGE_ID("message-id"),
        REFERENCES("references"),
        IN_REPLY_TO("in-reply-to"),
        IGNORE("");

        private final String value;

        private MessageHeader(String value) {
            this.value = value;
        }

        /**
         * @return the lower-case header name this constant stands for (empty for {@link #IGNORE})
         */
        @Override
        public String toString() {
            return this.value;
        }

        /**
         * Maps a header name (any letter case, dashes allowed) to the matching constant.
         *
         * @param value header name as found in the message, may be {@code null}
         * @return matching constant, or {@link #IGNORE} when the name is {@code null} or not
         *         one of the headers listed in this enum
         */
        public static MessageHeader getValue(String value) {
            if (value == null) {
                return IGNORE;
            }
            try {
                // Locale.ROOT: the default locale must not influence the mapping
                // (e.g. under a Turkish locale "message-id" would upper-case to "MESSAGE_İD").
                return valueOf(value.replace('-', '_').toUpperCase(Locale.ROOT));
            } catch (IllegalArgumentException e) {
                return IGNORE;
            }
        }
    }

    public final static DateTimeFormatter defaultDatePrinter = ISODateTimeFormat.dateTime().withZone(DateTimeZone.UTC);

    // volatile is required for the double-checked locking in getMessageBuilder() to be safe
    private static volatile MessageBuilder messageBuilder;

    private MessageParser() {
    }

    /**
     * Lazy initialize MessageBuilder instance. The instance is created once (double-checked
     * locking on the {@code MessageParser} class) and shared by all subsequent callers.
     *
     * @return MessageBuilder instance
     * @throws MimeException declared by the original signature; kept so that existing callers
     *                       compile unchanged
     */
    public static MessageBuilder getMessageBuilder() throws MimeException {
        MessageBuilder mb = messageBuilder;
        if (mb == null) {
            synchronized (MessageParser.class) {
                mb = messageBuilder;
                if (mb == null) {
                    mb = createMessageBuilder();
                    messageBuilder = mb;
                }
            }
        }
        return mb;
    }

    /**
     * Creates a message builder with a 10000 character line limit and explicit field parsers
     * for the address and date headers.
     */
    private static MessageBuilder createMessageBuilder() {
        MimeConfig config = new MimeConfig();
        config.setMaxLineLen(10000);

        FieldParser<MailboxListField> fromMailboxListParser = MailboxListFieldImpl.PARSER;
        FieldParser<AddressListField> toAddressListParser = AddressListFieldImpl.PARSER;
        FieldParser<DateTimeField> dateParser = DateTimeFieldLenientImpl.PARSER;

        LenientFieldParser fieldParser = new LenientFieldParser();
        fieldParser.setFieldParser(FieldName.TO, toAddressListParser);
        fieldParser.setFieldParser(FieldName.REPLY_TO, toAddressListParser);
        fieldParser.setFieldParser(FieldName.FROM, fromMailboxListParser);
        fieldParser.setFieldParser(FieldName.RESENT_FROM, fromMailboxListParser);
        fieldParser.setFieldParser(FieldName.DATE, dateParser);
        fieldParser.setFieldParser(FieldName.RESENT_DATE, dateParser);

        DefaultMessageBuilder builder = new DefaultMessageBuilder();
        builder.setMimeEntityConfig(config);
        builder.setFieldParser(fieldParser);
        return builder;
    }

    /**
     * It is the same as calling {@code parse(message, null)}.
     *
     * @param message message to parse
     * @return parsed mail
     * @throws MessageParseException see {@link #parse(Message, String)}
     */
    public static Mail parse(Message message) throws MessageParseException {
        return parse(message, null);
    }

    /**
     * Parse given Message into Mail.
     *
     * @param message  message to parse
     * @param idsuffix This value gets appended to the message-id (ignored when null or blank).
     * @return parsed mail
     * @throws MessageParseException if the Message-ID header is missing or empty, if the Date
     *                               header can not be parsed, or if reading the message body fails
     */
    public static Mail parse(Message message, String idsuffix) throws MessageParseException {

        String authorName = null;
        String authorEmail = null;
        String[] to = null;
        String subjectOriginal = null;
        String subject = null;
        String date = null;
        String messageIdOriginal = null;
        String messageId = null;
        String[] references = null;
        String inReplyTo = null;

        Map<String, Field> headers = getMessageHeaders(message);
        boolean messageIdPresent = false;

        for (Field f : headers.values()) {
            switch (MessageHeader.getValue(f.getName())) {
                case FROM: {
                    Author author = extractValue((MailboxListField) f);
                    authorName = author.name;
                    authorEmail = author.email;
                    break;
                }
                case TO: {
                    List<String> recipients = extractValue((AddressListField) f);
                    to = recipients.toArray(new String[recipients.size()]);
                    break;
                }
                case SUBJECT: {
                    subjectOriginal = extractValue((UnstructuredField) f);
                    subject = normalizeSubject(subjectOriginal);
                    break;
                }
                case DATE: {
                    Date parsed = extractValue((DateTimeField) f);
                    if (parsed == null) {
                        // The Message-ID header may be missing as well; do not let that hide
                        // the date problem behind a NullPointerException.
                        Field midField = headers.get(MessageHeader.MESSAGE_ID.toString());
                        String mid = (midField != null) ? midField.getBody() : null;
                        log.warn("Unable to parse header field '{}' for message-id: '{}'", f, mid);
                        throw new MessageParseException("Unable to parsed a date field. Skipping message ["+mid+"]");
                    }
                    date = defaultDatePrinter.print(parsed.getTime());
                    break;
                }
                case MESSAGE_ID: {
                    String id = extractValue((UnstructuredField) f);
                    if (isNullOrEmpty(id)) {
                        throw new MessageParseException("Message-ID header is null or empty.");
                    }
                    messageIdOriginal = id;
                    messageId = isNullOrEmpty(idsuffix) ? id : id + idsuffix;
                    messageIdPresent = true;
                    break;
                }
                case REFERENCES: {
                    references = splitOnWhitespace(extractValue((UnstructuredField) f));
                    break;
                }
                case IN_REPLY_TO: {
                    inReplyTo = extractValue((UnstructuredField) f);
                    break;
                }
                default:
                    // getMessageHeaders() already filtered out everything else
                    break;
            }
        }

        if (!messageIdPresent) {
            throw new MessageParseException("Message-ID header not found.");
        }

        MessageBodyParser.MailBodyContent content;
        try {
            content = MessageBodyParser.parse(message);
        } catch (IOException e) {
            throw new MessageParseException(e);
        }

        String messageSnippet = buildSnippet(content);

        String firstTextMessage = content.getFirstTextContent();
        String firstTextMessageWithoutQuotes = content.getFirstTextContentWithoutQuotes();
        String firstHtmlMessage = content.getFirstHtmlContent();

        String[] textMessages = toStringArray(content.getTextMessages());
        Integer textMessagesCnt = content.getTextMessages().size();

        String[] htmlMessages = toStringArray(content.getHtmlMessages());
        Integer htmlMessagesCnt = content.getHtmlMessages().size();

        // The attachments array intentionally stays null when there are no attachments.
        MailAttachment[] messageAttachments = null;
        Integer messageAttachmentsCnt = content.getAttachments().size();
        if (messageAttachmentsCnt > 0) {
            messageAttachments = content.getAttachments().toArray(new MailAttachment[messageAttachmentsCnt]);
        }

        return new Mail(
                messageId,
                messageIdOriginal,
                to,
                subjectOriginal,
                subject,
                authorName,
                authorEmail,
                date,
                inReplyTo,
                references,
                messageSnippet,
                firstTextMessage,
                firstTextMessageWithoutQuotes,
                firstHtmlMessage,
                textMessages,
                textMessagesCnt,
                htmlMessages,
                htmlMessagesCnt,
                messageAttachments,
                messageAttachmentsCnt
        );
    }

    /**
     * Builds a short single-line snippet. The source is, in order of preference: the first text
     * part without quotes, the first text part, the text extracted from the first HTML part.
     * When none of them is available the snippet is an empty string.
     */
    private static String buildSnippet(MessageBodyParser.MailBodyContent content) {
        String source = "";
        if (content.getFirstTextContentWithoutQuotes() != null) {
            source = content.getFirstTextContentWithoutQuotes();
        } else if (content.getFirstTextContent() != null) {
            source = content.getFirstTextContent();
        } else if (content.getFirstHtmlContent() != null) {
            source = Jsoup.parse(Jsoup.clean(content.getFirstHtmlContent(), Whitelist.relaxed())).text();
        }
        // TODO get text snippet from other fields

        // substring's end index is exclusive, so min(length, max) keeps the complete text of
        // short snippets (the former "length - 1" silently dropped their last character).
        int end = Math.min(source.length(), SNIPPET_MAX_LENGTH);
        return source.substring(0, end)
                .replaceAll("^>From", "From")
                .replaceAll("\\s+", " ")
                .trim();
    }

    /**
     * Splits the value on runs of white space.
     *
     * @return the tokens; an empty array when the value is null or blank
     */
    private static String[] splitOnWhitespace(String value) {
        if (isNullOrEmpty(value)) {
            return new String[0];
        }
        return value.trim().split("\\s+");
    }

    /** Copies the items into a new String array, preserving iteration order. */
    private static String[] toStringArray(Iterable<? extends String> parts) {
        List<String> copy = new ArrayList<>();
        for (String part : parts) {
            copy.add(part);
        }
        return copy.toArray(new String[copy.size()]);
    }

    /**
     * Extract only those header fields that are listed in MessageHeader.
     * All keys are lower-cased. If a header occurs more than once the last occurrence wins.
     *
     * @param message message whose header is read
     * @return map of lower-cased header name to header field
     */
    public static Map<String, Field> getMessageHeaders(Message message) {
        Map<String, Field> selected = new HashMap<>();
        for (Field f : message.getHeader().getFields()) {
            if (MessageHeader.getValue(f.getName()) == MessageHeader.IGNORE) {
                continue;
            }
            // Locale.ROOT keeps the keys equal to MessageHeader#toString() in every default locale
            selected.put(f.getName().toLowerCase(Locale.ROOT), f);
        }
        return selected;
    }

    /** Simple holder of the name and email parts of the message author. */
    private static class Author {
        protected final String name;
        protected final String email;

        protected Author(String name, String email) {
            this.name = name;
            this.email = email;
        }
    }

    /**
     * Parse field and return #Author instance. If it is not possible to split (or identify) name and email parts
     * in the incoming field (the mailbox list is missing or empty) then the returned Author instance has
     * a null name and the email contains the raw field body.
     *
     * @param field parsed From-like field
     * @return author taken from the first mailbox of the field
     */
    public static Author extractValue(MailboxListField field) {
        if (field.getMailboxList() == null || field.getMailboxList().isEmpty()) {
            return new Author(null, field.getBody());
        }
        Mailbox first = field.getMailboxList().get(0);
        return new Author(first.getName(), first.getAddress());
    }

    /**
     * Converts every mailbox of the address list into {@code "name <address>"}, or just the
     * address when the mailbox has no name.
     *
     * @param field parsed To-like field
     * @return formatted recipients; empty when the field has no address list
     */
    public static List<String> extractValue(AddressListField field) {
        List<String> items = new ArrayList<>();
        if (field.getAddressList() == null) {
            log.warn("The address list field [{}] can not be parsed: {}", field, field.getParseException());
            return items;
        }
        for (Mailbox mb : field.getAddressList().flatten()) {
            items.add(prepareAddress(mb.getName(), mb.getAddress()));
        }
        return items;
    }

    /**
     * @param field parsed date field
     * @return the date, or null when the field is not valid (a warning is logged when the
     *         field carries a parse exception)
     */
    public static Date extractValue(DateTimeField field) {
        if (field.isValidField()) {
            return field.getDate();
        }
        if (field.getParseException() != null) {
            log.warn("The date field [{}] can not be parsed: {}", field, field.getParseException());
        }
        return null;
    }

    /**
     * @param field unstructured field
     * @return value of the field as returned by {@code field.getValue()}
     */
    public static String extractValue(UnstructuredField field) {
        return field.getValue();
    }

    /**
     * Normalize Subject value: drops bracketed tags (except one closing the subject), reply and
     * forward prefixes such as "Re:" or "Fw:", leading dashes, and collapses white space.
     *
     * @param subject original subject, may be null
     * @return normalized subject, or null when the input is null
     */
    public static String normalizeSubject(String subject) {
        if (subject == null) {
            return null;
        }
        return subject
                // Remove bracketed tags unless the closing "]" ends the subject. One following
                // white space character is removed together with the tag; any other following
                // character is only looked at, never consumed.
                .replaceAll("\\[.*?\\](?:\\s|(?=\\S))", "")
                // remove all Re: Fw: Aw: ... etc including leading dash
                .replaceAll("^\\s*(-*\\s*[a-zA-Z]{2,3}:\\s*)*", "")
                // remove any additional Re: having white space prefix
                .replaceAll("\\s+[a-zA-Z]{2,3}:", "")
                // remove any left dashes at the beginning
                .replaceAll("^\\s*-\\s*", "")
                // finally replace multi-white space with one white space
                .replaceAll("\\s+", " ")
                .trim();
    }

    /** Formats a mailbox as {@code "name <address>"}, or as the bare address when name is null. */
    private static String prepareAddress(String name, String address) {
        return name == null ? address : name + " <" + address + ">";
    }

    /** @return true when the value is null or contains only white space */
    private static boolean isNullOrEmpty(String value) {
        return value == null || value.trim().length() == 0;
    }
}