/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.dataimport;
import com.sun.mail.imap.IMAPMessage;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.utils.ParseUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.mail.*;
import javax.mail.internet.AddressException;
import javax.mail.internet.ContentType;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import javax.mail.search.AndTerm;
import javax.mail.search.ComparisonTerm;
import javax.mail.search.ReceivedDateTerm;
import javax.mail.search.SearchTerm;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* An EntityProcessor instance which can index emails along with their attachments from POP3 or IMAP sources. Refer to
* <a href="http://wiki.apache.org/solr/DataImportHandler">http://wiki.apache.org/solr/DataImportHandler</a> for more
* details. <b>This API is experimental and subject to change</b>
*
* @version $Id: MailEntityProcessor.java 945245 2010-05-17 17:18:10Z rmuir $
* @since solr 1.4
*/
public class MailEntityProcessor extends EntityProcessorBase {
/**
 * Extension point for contributing an additional JavaMail search criterion.
 * The class named by the <code>customFilter</code> entity attribute is instantiated
 * reflectively and, when it implements this interface, its search term is applied
 * to every folder that is processed.
 */
public interface CustomFilter {
  /**
   * Builds the search term to apply to the given folder.
   *
   * @param folder the folder that is about to be searched
   * @return the criterion used to select messages from the folder
   */
  SearchTerm getCustomSearch(Folder folder);
}
/**
 * Reads the entity configuration and validates it.
 * <p>
 * Required attributes: <code>user</code>, <code>password</code>, <code>host</code>,
 * <code>protocol</code> and <code>folders</code>. Optional attributes and their defaults:
 * <code>recurse</code> (true), <code>exclude</code> / <code>include</code> (comma separated, empty),
 * <code>batchSize</code> (20), <code>customFilter</code> (none), <code>fetchMailsSince</code>
 * (none, format <code>yyyy-MM-dd HH:mm:ss</code>), <code>fetchSize</code> (32 * 1024),
 * <code>connectTimeout</code> (30 * 1000), <code>readTimeout</code> (60 * 1000) and
 * <code>processAttachment</code> (true).
 *
 * @param context the entity context the attributes are read from
 * @throws DataImportHandlerException if a required attribute is missing or
 *         <code>fetchMailsSince</code> cannot be parsed
 */
@Override
public void init(Context context) {
  super.init(context);
  // Attributes are read through getXXXFromContext(attribute, defaultValue), which applies the
  // variable resolver and falls back to the default when the attribute is absent.

  // REQUIRED : connection and folder info
  user = getStringFromContext("user", null);
  password = getStringFromContext("password", null);
  host = getStringFromContext("host", null);
  protocol = getStringFromContext("protocol", null);
  folderNames = getStringFromContext("folders", null);
  if (host == null || protocol == null || user == null || password == null
      || folderNames == null) {
    throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
        "'user|password|protocol|host|folders' are required attributes");
  }

  // OPTIONAL : all of these have defaults
  recurse = getBoolFromContext("recurse", true);
  String excludes = getStringFromContext("exclude", "");
  if (excludes != null && !excludes.trim().equals("")) {
    exclude = Arrays.asList(excludes.split(","));
  }
  String includes = getStringFromContext("include", "");
  if (includes != null && !includes.trim().equals("")) {
    include = Arrays.asList(includes.split(","));
  }
  batchSize = getIntFromContext("batchSize", 20);
  customFilter = getStringFromContext("customFilter", "");

  // fetchMailsSince is optional: only parse it when a non-blank value was configured.
  // (Parsing the empty default would always fail and abort initialization.)
  String s = getStringFromContext("fetchMailsSince", null);
  if (s != null && s.trim().length() > 0) {
    try {
      fetchMailsSince = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(s.trim());
    } catch (ParseException e) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Invalid value for fetchMailsSince: " + s, e);
    }
  }

  fetchSize = getIntFromContext("fetchSize", 32 * 1024);
  cTimeout = getIntFromContext("connectTimeout", 30 * 1000);
  rTimeout = getIntFromContext("readTimeout", 60 * 1000);
  // The attribute was historically read under the misspelled name "processAttachement".
  // Honor both spellings; the correctly spelled one wins when both are present.
  processAttachment = getBoolFromContext("processAttachment",
      getBoolFromContext("processAttachement", true));
  logConfig();
}
/**
 * Produces the next row, i.e. the document built from the next mail that can be converted.
 * Mails that fail to convert are skipped.
 *
 * @return the next document, or <code>null</code> once there are no more mails
 */
public Map<String, Object> nextRow() {
  // A null mail signals the end of processing; a null document only means
  // "this mail could not be converted, try the following one".
  for (Message candidate = getNextMail(); candidate != null; candidate = getNextMail()) {
    Map<String, Object> document = getDocumentFromMail(candidate);
    if (document != null) {
      return document;
    }
  }
  return null;
}
/**
 * Returns the next message to index, connecting to the mailbox and creating the
 * folder iterator on first use and moving on to the next folder whenever the
 * current one is exhausted.
 *
 * @return the next message, or <code>null</code> when no further message is available
 */
private Message getNextMail() {
  if (!connected) {
    if (!connectToMailBox()) {
      return null;
    }
    connected = true;
  }
  if (folderIter == null) {
    createFilters();
    folderIter = new FolderIterator(mailbox);
  }
  // Advance folder by folder until one yields a message or all folders are exhausted.
  while (msgIter == null || !msgIter.hasNext()) {
    // Call next() even when no folder is queued any more: with an empty queue it closes
    // the previously opened folder and the store before returning null. Guarding the call
    // with hasNext() would skip that cleanup and leave the connection open.
    Folder next = folderIter.next();
    if (next == null) {
      return null;
    }
    msgIter = new MessageIterator(next, batchSize);
  }
  return msgIter.next();
}
/**
 * Converts a mail into a row by walking its parts.
 *
 * @param mail the message to convert
 * @return the populated row, or <code>null</code> if the message could not be processed
 */
private Map<String, Object> getDocumentFromMail(Message mail) {
  Map<String, Object> row = new HashMap<String, Object>();
  try {
    addPartToDocument(mail, row, true);
    return row;
  } catch (Exception e) {
    // Best effort: one unreadable mail must not abort the import, but the failure is
    // logged so that skipped messages can be diagnosed.
    LOG.warn("Could not build a document from a mail; the message is skipped", e);
    return null;
  }
}
/**
 * Adds the given part, and recursively its nested parts, to the row.
 * <ul>
 * <li>If the part is a {@link Message}, its envelope fields are added first.</li>
 * <li><code>multipart/*</code>: every body part is processed; for
 * <code>multipart/alternative</code> only the first one.</li>
 * <li><code>message/rfc822</code>: the embedded part is processed.</li>
 * <li>Any other part: its text is extracted and appended to the attachment fields when the
 * disposition is "attachment", otherwise to the content field. Attachments are skipped
 * when attachment processing is disabled.</li>
 * </ul>
 *
 * @param part      the part to add
 * @param row       the row being populated
 * @param outerMost not used by this implementation
 * @throws Exception if the part cannot be read or parsed
 */
public void addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
  if (part instanceof Message) {
    addEnvelopToDocument(part, row);
  }
  String ct = part.getContentType();
  ContentType ctype = new ContentType(ct);
  if (part.isMimeType("multipart/*")) {
    Multipart mp = (Multipart) part.getContent();
    int count = mp.getCount();
    if (part.isMimeType("multipart/alternative")) {
      // Alternatives carry the same content in different formats: index the first only.
      // Math.min keeps an empty multipart from triggering an out-of-range access.
      count = Math.min(count, 1);
    }
    for (int i = 0; i < count; i++) {
      addPartToDocument(mp.getBodyPart(i), row, false);
    }
  } else if (part.isMimeType("message/rfc822")) {
    addPartToDocument((Part) part.getContent(), row, false);
  } else {
    String disp = part.getDisposition();
    boolean isAttachment = disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT);
    // Only attachments are subject to the processAttachment switch; body parts are
    // always indexed.
    if (isAttachment && !processAttachment) {
      return;
    }
    InputStream is = part.getInputStream();
    String fileName = part.getFileName();
    String content;
    try {
      content = ParseUtils.getStringContent(is, TikaConfig.getDefaultConfig(),
          ctype.getBaseType().toLowerCase(Locale.ENGLISH));
    } finally {
      is.close();
    }
    if (isAttachment) {
      appendToMultiValuedField(row, ATTACHMENT, content);
      appendToMultiValuedField(row, ATTACHMENT_NAMES, fileName);
    } else {
      appendToMultiValuedField(row, CONTENT, content);
    }
  }
}

/** Appends a value to the list stored under the given key, creating the list on first use. */
@SuppressWarnings("unchecked")
private void appendToMultiValuedField(Map<String, Object> row, String field, String value) {
  // Safe cast: the multi-valued fields are only ever populated through this method.
  List<String> values = (List<String>) row.get(field);
  if (values == null) {
    values = new ArrayList<String>();
    row.put(field, values);
  }
  values.add(value);
}
/**
 * Copies the envelope information of a mail into the row: first sender, all TO/CC/BCC
 * recipients, message id, subject, sent date (when present), system and user flags and
 * the first X-Mailer header (when present).
 *
 * @param part the mail; it is cast to {@link MimeMessage}
 * @param row  the row being populated
 * @throws MessagingException if the envelope cannot be read
 */
private void addEnvelopToDocument(Part part, Map<String, Object> row) throws MessagingException {
  MimeMessage message = (MimeMessage) part;

  Address[] senders = message.getFrom();
  if (senders != null && senders.length > 0) {
    row.put(FROM, senders[0].toString());
  }

  // TO, CC and BCC recipients all end up in the same multi-valued field.
  List<String> recipients = new ArrayList<String>();
  Message.RecipientType[] kinds = {
      Message.RecipientType.TO, Message.RecipientType.CC, Message.RecipientType.BCC
  };
  for (Message.RecipientType kind : kinds) {
    Address[] found = message.getRecipients(kind);
    if (found != null) {
      addAddressToList(found, recipients);
    }
  }
  if (!recipients.isEmpty()) {
    row.put(TO_CC_BCC, recipients);
  }

  row.put(MESSAGE_ID, message.getMessageID());
  row.put(SUBJECT, message.getSubject());

  Date sent = message.getSentDate();
  if (sent != null) {
    row.put(SENT_DATE, sent);
  }

  Flags mailFlags = message.getFlags();
  List<String> flagNames = new ArrayList<String>();
  for (Flags.Flag systemFlag : mailFlags.getSystemFlags()) {
    String flagName = null;
    if (systemFlag == Flags.Flag.ANSWERED) {
      flagName = FLAG_ANSWERED;
    } else if (systemFlag == Flags.Flag.DELETED) {
      flagName = FLAG_DELETED;
    } else if (systemFlag == Flags.Flag.DRAFT) {
      flagName = FLAG_DRAFT;
    } else if (systemFlag == Flags.Flag.FLAGGED) {
      flagName = FLAG_FLAGGED;
    } else if (systemFlag == Flags.Flag.RECENT) {
      flagName = FLAG_RECENT;
    } else if (systemFlag == Flags.Flag.SEEN) {
      flagName = FLAG_SEEN;
    }
    // System flags without a mapping above are not indexed.
    if (flagName != null) {
      flagNames.add(flagName);
    }
  }
  flagNames.addAll(Arrays.asList(mailFlags.getUserFlags()));
  row.put(FLAGS, flagNames);

  String[] mailers = message.getHeader("X-Mailer");
  if (mailers != null) {
    row.put(XMAILER, mailers[0]);
  }
}
/**
 * Appends the string form of every address to the list; for group addresses the
 * group members are appended as well, right after the group itself.
 *
 * @param adresses the addresses to add; each one is cast to {@link InternetAddress}
 * @param to       the list receiving the address strings
 * @throws AddressException if the members of a group address cannot be parsed
 */
private void addAddressToList(Address[] adresses, List<String> to) throws AddressException {
  for (int i = 0; i < adresses.length; i++) {
    Address current = adresses[i];
    to.add(current.toString());
    InternetAddress internet = (InternetAddress) current;
    if (!internet.isGroup()) {
      continue;
    }
    InternetAddress[] members = internet.getGroup(false);
    for (int m = 0; m < members.length; m++) {
      to.add(members[m].toString());
    }
  }
}
/**
 * Opens the connection to the mail store using the configured protocol, host and
 * credentials, and stores it in {@link #mailbox}.
 *
 * @return <code>true</code> when the connection was established
 * @throws DataImportHandlerException if the connection fails
 */
private boolean connectToMailBox() {
  try {
    Properties props = new Properties();
    props.setProperty("mail.store.protocol", protocol);
    props.setProperty("mail.imap.fetchsize", "" + fetchSize);
    props.setProperty("mail.imap.timeout", "" + rTimeout);
    props.setProperty("mail.imap.connectiontimeout", "" + cTimeout);
    // JavaMail reads provider settings under "mail.<protocol>.*", so repeat them for the
    // configured protocol (e.g. imaps, pop3); otherwise they only apply to plain imap.
    String prefix = "mail." + protocol + ".";
    props.setProperty(prefix + "fetchsize", "" + fetchSize);
    props.setProperty(prefix + "timeout", "" + rTimeout);
    props.setProperty(prefix + "connectiontimeout", "" + cTimeout);
    // getInstance always creates a session from these properties; getDefaultInstance may
    // return a JVM-wide session created earlier by other code and ignore them.
    Session session = Session.getInstance(props, null);
    mailbox = session.getStore(protocol);
    mailbox.connect(host, user, password);
    LOG.info("Connected to mailbox");
    return true;
  } catch (MessagingException e) {
    throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
        "Connection failed", e);
  }
}
/**
 * Populates {@link #filters}: a received-date filter when <code>fetchMailsSince</code> is
 * set, followed by the configured custom filter, if any.
 *
 * @throws DataImportHandlerException if the custom filter class cannot be loaded or instantiated
 */
private void createFilters() {
  if (fetchMailsSince != null) {
    filters.add(new MailsSinceLastCheckFilter(fetchMailsSince));
  }
  if (customFilter != null && !customFilter.equals("")) {
    try {
      Class<?> cf = Class.forName(customFilter);
      Object obj = cf.newInstance();
      if (obj instanceof CustomFilter) {
        filters.add((CustomFilter) obj);
      } else {
        // Keep ignoring such classes, but make the misconfiguration visible.
        LOG.warn("Custom filter " + customFilter + " does not implement "
            + CustomFilter.class.getName() + " and is ignored");
      }
    } catch (Exception e) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Custom filter could not be created", e);
    }
  }
}
/**
 * Logs the effective configuration at INFO level. The password is masked so that
 * credentials never reach the log.
 */
private void logConfig() {
  if (!LOG.isInfoEnabled()) return;
  String eol = System.getProperty("line.separator");
  StringBuilder config = new StringBuilder();
  config.append("user : ").append(user).append(eol);
  // Never write the real password to the log.
  config.append("pwd : ").append("********").append(eol);
  config.append("protocol : ").append(protocol).append(eol);
  config.append("host : ").append(host).append(eol);
  config.append("folders : ").append(folderNames).append(eol);
  config.append("recurse : ").append(recurse).append(eol);
  config.append("exclude : ").append(exclude.toString()).append(eol);
  config.append("include : ").append(include.toString()).append(eol);
  config.append("batchSize : ").append(batchSize).append(eol);
  config.append("fetchSize : ").append(fetchSize).append(eol);
  config.append("read timeout : ").append(rTimeout).append(eol);
  config.append("conection timeout : ").append(cTimeout).append(eol);
  config.append("custom filter : ").append(customFilter).append(eol);
  config.append("fetch mail since : ").append(fetchMailsSince).append(eol);
  LOG.info(config.toString());
}
/**
 * Walks the configured top level folders (and, when <code>recurse</code> is set, their
 * children) and hands out, already opened read-only, those folders that hold messages and
 * are not excluded by the include/exclude patterns.
 */
class FolderIterator implements Iterator<Folder> {
  private Store mailbox;
  private List<String> topLevelFolders;
  // Folders still to be visited; children are inserted at the front (depth first).
  private List<Folder> folders = null;
  // The folder handed out by the previous call to next(); closed on the following call.
  private Folder lastFolder = null;

  /**
   * @param mailBox the connected store whose folders are traversed
   */
  public FolderIterator(Store mailBox) {
    this.mailbox = mailBox;
    folders = new ArrayList<Folder>();
    getTopLevelFolders(mailBox);
  }

  /**
   * @return <code>true</code> while folders are queued; a following {@link #next()} can
   *         still return <code>null</code> when none of them qualifies
   */
  public boolean hasNext() {
    return !folders.isEmpty();
  }

  /**
   * Closes the folder returned previously and returns the next folder that holds messages
   * and is not excluded, opened read-only. When the queue is empty the store is closed.
   *
   * @return the next folder, or <code>null</code> when there is none or the traversal failed
   */
  public Folder next() {
    try {
      boolean hasMessages = false;
      Folder next;
      do {
        if (lastFolder != null) {
          lastFolder.close(false);
          lastFolder = null;
        }
        if (folders.isEmpty()) {
          mailbox.close();
          return null;
        }
        next = folders.remove(0);
        if (next != null) {
          String fullName = next.getFullName();
          if (!excludeFolder(fullName)) {
            hasMessages = (next.getType() & Folder.HOLDS_MESSAGES) != 0;
            // Only folders that can contain messages may be opened; opening a pure
            // container folder fails and would end the traversal.
            if (hasMessages) {
              next.open(Folder.READ_ONLY);
              lastFolder = next;
              LOG.info("Opened folder : " + fullName);
            }
          }
          if (recurse && ((next.getType() & Folder.HOLDS_FOLDERS) != 0)) {
            Folder[] children = next.list();
            LOG.info("Added its children to list : ");
            // Insert in reverse so the children are visited in their listed order.
            for (int i = children.length - 1; i >= 0; i--) {
              folders.add(0, children[i]);
              LOG.info("child name : " + children[i].getFullName());
            }
            if (children.length == 0)
              LOG.info("NO children : ");
          }
        }
      }
      while (!hasMessages);
      return next;
    } catch (MessagingException e) {
      // Best effort as before (the traversal ends), but no longer silently.
      LOG.warn("Folder traversal failed; remaining folders are not processed", e);
    }
    return null;
  }

  public void remove() {
    throw new UnsupportedOperationException("Its read only mode...");
  }

  /**
   * Queues the folders named in the <code>folders</code> attribute, or the store's default
   * folder when no name is available.
   */
  private void getTopLevelFolders(Store mailBox) {
    if (folderNames != null)
      topLevelFolders = Arrays.asList(folderNames.split(","));
    for (int i = 0; topLevelFolders != null && i < topLevelFolders.size(); i++) {
      try {
        folders.add(mailbox.getFolder(topLevelFolders.get(i)));
      } catch (MessagingException e) {
        // skip bad ones unless its the last one and still no good folder
        if (folders.size() == 0 && i == topLevelFolders.size() - 1)
          throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
              "Folder retreival failed", e);
      }
    }
    if (topLevelFolders == null || topLevelFolders.size() == 0) {
      try {
        folders.add(mailBox.getDefaultFolder());
      } catch (MessagingException e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
            "Folder retreival failed", e);
      }
    }
  }

  /**
   * Decides whether a folder is skipped. Exclude patterns win over include patterns; when
   * include patterns exist, a folder matching none of them is skipped as well. Patterns are
   * regular expressions matched against the whole folder name.
   */
  private boolean excludeFolder(String name) {
    for (String s : exclude) {
      if (name.matches(s))
        return true;
    }
    for (String s : include) {
      if (name.matches(s))
        return false;
    }
    return include.size() > 0;
  }
}
/**
 * Iterates over the messages of one folder. Without filters the messages are fetched in
 * batches of <code>batchSize</code>; when a search term is in effect all matching messages
 * are retrieved at once and batching is disabled.
 */
class MessageIterator implements Iterator<Message> {
  private Folder folder;
  private Message[] messagesInCurBatch;
  // Index of the next message to hand out within messagesInCurBatch.
  private int current = 0;
  // Number of batches fetched so far.
  private int currentBatch = 0;
  private int batchSize = 0;
  private int totalInFolder = 0;
  private boolean doBatching = true;

  /**
   * @param folder    the folder to read messages from
   * @param batchSize number of messages fetched per batch when no search term applies
   * @throws DataImportHandlerException if the messages cannot be retrieved
   */
  public MessageIterator(Folder folder, int batchSize) {
    try {
      this.folder = folder;
      this.batchSize = batchSize;
      SearchTerm st = getSearchTerm();
      if (st != null) {
        doBatching = false;
        messagesInCurBatch = folder.search(st);
        totalInFolder = messagesInCurBatch.length;
        folder.fetch(messagesInCurBatch, fp);
        current = 0;
        LOG.info("Total messages : " + totalInFolder);
        LOG.info("Search criteria applied. Batching disabled");
      } else {
        totalInFolder = folder.getMessageCount();
        LOG.info("Total messages : " + totalInFolder);
        getNextBatch(batchSize, folder);
      }
    } catch (MessagingException e) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Message retreival failed", e);
    }
  }

  /** Loads the next batch of messages and prefetches the items of the fetch profile. */
  private void getNextBatch(int batchSize, Folder folder) throws MessagingException {
    // after each batch invalidate cache
    if (messagesInCurBatch != null) {
      for (Message m : messagesInCurBatch) {
        if (m instanceof IMAPMessage)
          ((IMAPMessage) m).invalidateHeaders();
      }
    }
    // Message numbers are 1-based; clamp the upper bound to the folder size.
    int lastMsg = (currentBatch + 1) * batchSize;
    lastMsg = lastMsg > totalInFolder ? totalInFolder : lastMsg;
    messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, lastMsg);
    folder.fetch(messagesInCurBatch, fp);
    current = 0;
    currentBatch++;
    LOG.info("Current Batch : " + currentBatch);
    LOG.info("Messages in this batch : " + messagesInCurBatch.length);
  }

  /**
   * @return <code>true</code> if another message is available; loads the next batch when
   *         the current one is used up
   * @throws DataImportHandlerException if the next batch cannot be retrieved
   */
  public boolean hasNext() {
    boolean hasMore = current < messagesInCurBatch.length;
    if (!hasMore && doBatching
        && currentBatch * batchSize < totalInFolder) {
      // try next batch
      try {
        getNextBatch(batchSize, folder);
        hasMore = current < messagesInCurBatch.length;
      } catch (MessagingException e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
            "Message retreival failed", e);
      }
    }
    return hasMore;
  }

  /**
   * @return the next message, or <code>null</code> when there is none
   */
  public Message next() {
    return hasNext() ? messagesInCurBatch[current++] : null;
  }

  public void remove() {
    throw new UnsupportedOperationException("Its read only mode...");
  }

  /**
   * Combines the search terms of all filters with AND. Filters that return no term are
   * skipped, so a null term never ends up inside an {@link AndTerm}.
   *
   * @return the combined term, or <code>null</code> when no filter contributes one
   */
  private SearchTerm getSearchTerm() {
    SearchTerm combined = null;
    for (CustomFilter filter : filters) {
      SearchTerm st = filter.getCustomSearch(folder);
      if (st == null) {
        continue;
      }
      combined = (combined == null) ? st : new AndTerm(combined, st);
    }
    return combined;
  }
}
/**
 * Filter selecting the mails whose received date is on or after a given date.
 */
class MailsSinceLastCheckFilter implements CustomFilter {
  private final Date threshold;

  /**
   * @param date the earliest received date to accept
   */
  public MailsSinceLastCheckFilter(Date date) {
    this.threshold = date;
  }

  /**
   * @param folder not used by this filter
   * @return a received-date term using the greater-or-equal comparison
   */
  public SearchTerm getCustomSearch(Folder folder) {
    SearchTerm receivedSince = new ReceivedDateTerm(ComparisonTerm.GE, threshold);
    return receivedSince;
  }
}
// user settings stored in member variables
// (populated by init() from the entity attributes)
private String user;
private String password;
private String host;
private String protocol;
// comma separated names of the top level folders to process
private String folderNames;
// regular expressions matched against the full folder name
private List<String> exclude = new ArrayList<String>();
private List<String> include = new ArrayList<String>();
// whether child folders are traversed as well
private boolean recurse;
// number of messages retrieved per batch when no search term applies
private int batchSize;
// passed to JavaMail as mail.imap.fetchsize
private int fetchSize;
// passed to JavaMail as mail.imap.connectiontimeout
private int cTimeout;
// passed to JavaMail as mail.imap.timeout
private int rTimeout;
// when set, only mails received on or after this date are selected
private Date fetchMailsSince;
// class name of an optional CustomFilter implementation
private String customFilter;
private boolean processAttachment = true;
// holds the current state
private Store mailbox;
private boolean connected = false;
private FolderIterator folderIter;
private MessageIterator msgIter;
private List<CustomFilter> filters = new ArrayList<CustomFilter>();
// items prefetched for each set of messages; filled in the static initializer below
private static FetchProfile fp = new FetchProfile();
// NOTE(review): the logger is registered under DataImporter, not this class - confirm this is intended
private static final Logger LOG = LoggerFactory.getLogger(DataImporter.class);
// diagnostics
// NOTE(review): rowCount is not read or updated anywhere in the visible source
private int rowCount = 0;
static {
// prefetch envelope, flags and the X-Mailer header, which addEnvelopToDocument reads
fp.add(FetchProfile.Item.ENVELOPE);
fp.add(FetchProfile.Item.FLAGS);
fp.add("X-Mailer");
}
// Fields To Index
// single valued
private static final String MESSAGE_ID = "messageId";
private static final String SUBJECT = "subject";
private static final String FROM = "from";
private static final String SENT_DATE = "sentDate";
private static final String XMAILER = "xMailer";
// multi valued
private static final String TO_CC_BCC = "allTo";
private static final String FLAGS = "flags";
private static final String CONTENT = "content";
private static final String ATTACHMENT = "attachment";
private static final String ATTACHMENT_NAMES = "attachmentNames";
// flag values
private static final String FLAG_ANSWERED = "answered";
private static final String FLAG_DELETED = "deleted";
private static final String FLAG_DRAFT = "draft";
private static final String FLAG_FLAGGED = "flagged";
private static final String FLAG_RECENT = "recent";
private static final String FLAG_SEEN = "seen";
/**
 * Reads an integer entity attribute, resolving variables in its value.
 *
 * @param prop   the attribute name
 * @param ifNull the value to use when the attribute is absent or not a valid integer
 * @return the parsed value, or <code>ifNull</code>
 */
private int getIntFromContext(String prop, int ifNull) {
  String val = context.getEntityAttribute(prop);
  if (val == null) {
    return ifNull;
  }
  val = context.replaceTokens(val);
  if (val == null) {
    return ifNull;
  }
  try {
    // Tolerate surrounding whitespace, which is easy to introduce in XML attributes.
    return Integer.parseInt(val.trim());
  } catch (NumberFormatException e) {
    // Keep falling back to the default, but make the bad configuration visible.
    LOG.warn("Invalid integer value '" + val + "' for attribute '" + prop
        + "'; using default " + ifNull);
    return ifNull;
  }
}
/**
 * Reads a boolean entity attribute, resolving variables in its value.
 *
 * @param prop   the attribute name
 * @param ifNull the value to use when the attribute is absent
 * @return <code>true</code> only if the resolved value equals "true" ignoring case;
 *         <code>ifNull</code> when the attribute is absent
 */
private boolean getBoolFromContext(String prop, boolean ifNull) {
  String raw = context.getEntityAttribute(prop);
  if (raw == null) {
    return ifNull;
  }
  return Boolean.parseBoolean(context.replaceTokens(raw));
}
/**
 * Reads a string entity attribute, resolving variables in its value.
 *
 * @param prop   the attribute name
 * @param ifNull the value to use when the attribute is absent
 * @return the resolved attribute value, or <code>ifNull</code> when the attribute is absent
 */
private String getStringFromContext(String prop, String ifNull) {
  String raw = context.getEntityAttribute(prop);
  return raw == null ? ifNull : context.replaceTokens(raw);
}
}