//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.collectionreaders;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.StringJoiner;
import javax.mail.Address;
import javax.mail.Authenticator;
import javax.mail.Flags;
import javax.mail.Folder;
import javax.mail.Header;
import javax.mail.Message;
import javax.mail.Message.RecipientType;
import javax.mail.MessagingException;
import javax.mail.MethodNotSupportedException;
import javax.mail.Multipart;
import javax.mail.NoSuchProviderException;
import javax.mail.Part;
import javax.mail.PasswordAuthentication;
import javax.mail.Session;
import javax.mail.Store;
import javax.mail.internet.InternetAddress;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import uk.gov.dstl.baleen.core.utils.BaleenDefaults;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.uima.BaleenCollectionReader;
import uk.gov.dstl.baleen.uima.IContentExtractor;
/**
* Connects to a specified mail server, treating each message and/or its attachments as documents.
*
* Optionally, messages can be deleted or marked as read after they have been retrieved.
* Be aware that this happens at the point the message is retrieved, and as such if the pipeline
* fails at a later point then the message will still have been deleted/marked read.
*/
public class EmailReader extends BaleenCollectionReader {
/**
* The mail server host name, passed to JavaMail as the <code>mail.host</code> property
*
* @baleen.config localhost
*/
public static final String PARAM_SERVER = "server";
@ConfigurationParameter(name = PARAM_SERVER, defaultValue = "localhost")
private String server;
/**
* The mail server port, passed to JavaMail as the <code>mail.&lt;protocol&gt;.port</code> property
*
* @baleen.config 110
*/
public static final String PARAM_PORT = "port";
@ConfigurationParameter(name = PARAM_PORT, defaultValue = "110")
private Integer port;
/**
* The e-mail address (user) to check.
*
* Used both as the <code>mail.user</code> property and as the user name for password authentication.
*
* @baleen.config baleen@localhost
*/
public static final String PARAM_USER = "username";
@ConfigurationParameter(name = PARAM_USER, defaultValue = "baleen@localhost")
private String user;
/**
* The password for the account, supplied to the mail session together with the user name
*
* @baleen.config
*/
public static final String PARAM_PASS = "password";
@ConfigurationParameter(name = PARAM_PASS, defaultValue = "")
private String pass;
/**
* Connection protocol (e.g. pop3 or imap), passed to JavaMail as the <code>mail.store.protocol</code> property
*
* Must be a valid JavaMail protocol - pop3 and imap are supported by default,
* other protocols will require the relevant libraries to be on the classpath.
*
* @baleen.config pop3
*/
public static final String PARAM_PROTOCOL = "protocol";
@ConfigurationParameter(name = PARAM_PROTOCOL, defaultValue = "pop3")
private String protocol;
/**
* The name of the inbox on the server which we will monitor.
*
* The folder is opened in read/write mode.
*
* @baleen.config INBOX
*/
public static final String PARAM_INBOX = "inbox";
@ConfigurationParameter(name = PARAM_INBOX, defaultValue = "INBOX")
private String inbox;
/**
* The minimum wait, in seconds, between checks of the server for new messages
*
* @baleen.config 120
*/
public static final String PARAM_WAIT = "wait";
@ConfigurationParameter(name = PARAM_WAIT, defaultValue = "120")
private Integer wait;
/**
* Should a message be deleted after it has been processed.
*
* A message is flagged for deletion when its content is processed, so this setting
* has no effect when only attachments are being processed.
*
* @baleen.config false
*/
public static final String PARAM_DELETE_EMAIL = "deleteEmails";
@ConfigurationParameter(name = PARAM_DELETE_EMAIL, defaultValue = "false")
private Boolean deleteEmailsAfterProcessing;
/**
* Should the saved copy of an attachment be deleted from disk after it has been processed
*
* @baleen.config false
*/
public static final String PARAM_DELETE_ATTACHMENT = "deleteAttachments";
@ConfigurationParameter(name = PARAM_DELETE_ATTACHMENT, defaultValue = "false")
private Boolean deleteAttachmentsAfterProcessing;
/**
* The folder in which to save attachments.
*
* If left empty, the directory named by the <code>user.dir</code> system property is used.
*
* @baleen.config <i>Current directory</i>
*/
public static final String PARAM_FOLDER = "attachmentFolder";
@ConfigurationParameter(name = PARAM_FOLDER, defaultValue = "")
private String folder;
/**
* Choose whether to process just message content, just attachments, or both.
*
* Valid options are: content, attachments, both.
* The value is matched case-insensitively, and any other value is treated as both.
*
* @baleen.config both
*/
public static final String PARAM_PROCESS = "process";
@ConfigurationParameter(name = PARAM_PROCESS, defaultValue = BOTH)
private String process;
/**
* The content extractor to use to extract content from message bodies and attachment files
*
* @baleen.config Value of BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR
*/
public static final String PARAM_CONTENT_EXTRACTOR = "contentExtractor";
@ConfigurationParameter(name = PARAM_CONTENT_EXTRACTOR, defaultValue=BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR)
private String contentExtractor;
/** Value of the process parameter meaning message content and attachments are both processed */
public static final String BOTH = "both";
/** Value of the process parameter meaning only message content is processed */
public static final String CONTENT = "content";
/** Value of the process parameter meaning only attachments are processed */
public static final String ATTACHMENTS = "attachments";
/** Content extractor instance, created from the contentExtractor parameter during initialisation */
private IContentExtractor extractor;
/** Time (epoch milliseconds) at which the inbox was last listed; starts at 0 so the first check is not delayed */
private Long lastCheck = 0L;
/** Supplies the configured user name and password to the mail session */
private Authenticator authenticator;
/** The JavaMail session built from the configured parameters */
private Session session;
/** The mail store obtained from the session */
private Store store;
/** The monitored folder within the store */
private Folder inboxFolder;
/** Messages found on the server that are waiting to be processed */
private List<Message> messageQueue = new ArrayList<>();
/** Attachment files saved to disk that are waiting to be processed */
private List<File> attachmentQueue = new ArrayList<>();
/** Unique IDs of messages that have already been seen, so that they are not queued again on later checks */
private Set<String> alreadyProcessed = new HashSet<>();
/**
 * Validates the configuration, creates the content extractor, then connects to the
 * mail server and opens the configured inbox folder.
 *
 * @param context the UIMA context, which is also handed to the content extractor
 * @throws ResourceInitializationException if the content extractor cannot be created,
 *         no JavaMail provider exists for the configured protocol, or the server
 *         connection or inbox folder cannot be opened
 */
@Override
protected void doInitialize(UimaContext context) throws ResourceInitializationException {
    validateParams();

    try{
        extractor = getContentExtractor(contentExtractor);
    }catch(InvalidParameterException ipe){
        throw new ResourceInitializationException(ipe);
    }
    extractor.initialize(context, getConfigParameters(context));

    authenticator = new Authenticator() {
        @Override
        protected PasswordAuthentication getPasswordAuthentication(){
            return new PasswordAuthentication(user, pass);
        }
    };

    // Properties values must be Strings: Properties.getProperty() returns null for any
    // other value type, so a port stored as an Integer may be silently ignored.
    Properties prop = new Properties();
    prop.setProperty("mail.store.protocol", protocol);
    prop.setProperty("mail.host", server);
    prop.setProperty("mail."+protocol+".port", String.valueOf(port));
    prop.setProperty("mail.user", user);

    session = Session.getInstance(prop, authenticator);

    try {
        store = session.getStore();
    } catch (NoSuchProviderException e) {
        throw new ResourceInitializationException(e);
    }

    try{
        store.connect();
        inboxFolder = store.getFolder(inbox);
        reopenConnection();
    }catch(MessagingException me){
        // Don't leave a connected store behind if the folder could not be opened
        try{
            store.close();
        }catch(MessagingException closeFailure){
            me.addSuppressed(closeFailure);
        }
        throw new ResourceInitializationException(me);
    }
}
/**
 * Populates the supplied JCas with the next document, which is either the content of
 * the next queued message or the next queued attachment, depending on the process mode.
 *
 * @param jCas the JCas to populate
 * @throws IOException if the mail server cannot be reached, the message or attachment
 *         cannot be processed, or (in attachments mode) none of the queued messages
 *         yields an attachment that can be saved
 * @throws CollectionException declared by the overridden method
 */
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
    try{
        reopenConnection();
    }catch(MessagingException me){
        throw new IOException("Unable to reconnect to mail server", me);
    }

    try{
        if(CONTENT.equalsIgnoreCase(process)){
            processMessage(jCas, messageQueue.remove(0), Collections.emptyList());
        }else if(ATTACHMENTS.equalsIgnoreCase(process)){
            // A queued message may yield no saveable attachment, so keep taking messages
            // until one does rather than removing from an empty attachment queue.
            while(attachmentQueue.isEmpty() && !messageQueue.isEmpty()){
                attachmentQueue.addAll(saveAttachments(messageQueue.remove(0)));
            }
            if(attachmentQueue.isEmpty()){
                throw new IOException("No attachments could be saved from the queued messages");
            }

            processAttachment(jCas, attachmentQueue.remove(0));
        }else{	//Both
            if(!attachmentQueue.isEmpty()){
                // Finish the attachments of the previous message before moving on
                processAttachment(jCas, attachmentQueue.remove(0));
            }else{
                Message msg = messageQueue.remove(0);
                List<File> attachments = saveAttachments(msg);
                attachmentQueue.addAll(attachments);

                processMessage(jCas, msg, attachments);
            }
        }
    }catch(MessagingException me){
        throw new IOException("Unable to process message or attachment", me);
    }
}
/**
 * Closes the inbox folder (expunging messages flagged as deleted) and then the store.
 *
 * The store is closed even if closing the folder fails, and a folder that was never
 * opened or is already closed is skipped.
 *
 * @throws IOException if closing the folder or the store fails; when both fail, the
 *         store failure is attached to the folder failure as a suppressed exception
 */
@Override
protected void doClose() throws IOException {
    MessagingException failure = null;

    try{
        // Folder.close() throws IllegalStateException on a folder that is not open
        if(inboxFolder != null && inboxFolder.isOpen()){
            inboxFolder.close(true);
        }
    }catch(MessagingException me){
        failure = me;
    }

    try{
        if(store != null){
            store.close();
        }
    }catch(MessagingException me){
        if(failure == null){
            failure = me;
        }else{
            failure.addSuppressed(me);
        }
    }

    if(failure != null){
        throw new IOException(failure);
    }
}
/**
 * Reports whether there is a message or attachment waiting to be processed, checking
 * the server for new messages if the queues are empty and the configured wait has elapsed.
 *
 * @return true if a message or attachment is queued, false otherwise
 * @throws IOException if the server cannot be reached, deleted messages cannot be
 *         expunged, or the inbox cannot be listed
 * @throws CollectionException declared by the overridden method
 */
@Override
public boolean doHasNext() throws IOException, CollectionException {
    try{
        reopenConnection();
    }catch(MessagingException me){
        throw new IOException("Unable to reconnect to mail server", me);
    }

    if(!attachmentQueue.isEmpty() || !messageQueue.isEmpty()){
        return true;
    }

    // Only expunge once the queues are drained, so no queued message is affected
    try{
        tryExpunge();
    }catch(MessagingException me){
        throw new IOException("Unable to expunge (delete) messages", me);
    }

    // Multiply as a long: wait * 1000 in int arithmetic overflows for large wait values
    if(lastCheck + wait * 1000L > System.currentTimeMillis()){
        return false;
    }

    lastCheck = System.currentTimeMillis();

    try{
        for(Message msg : inboxFolder.getMessages()){
            String uid = generateUniqueId(msg);
            if(!alreadyProcessed.add(uid)){
                // Seen on an earlier check
                continue;
            }

            if(!ATTACHMENTS.equalsIgnoreCase(process) || hasAttachments(msg)){
                messageQueue.add(msg);
            }
        }
    }catch(MessagingException me){
        throw new IOException("Unable to check for messages", me);
    }

    return !messageQueue.isEmpty();
}
/**
 * Builds an identifier for a message from its subject, first sender, sent date and
 * received date, so that a message seen on an earlier check can be recognised.
 *
 * Missing dates are replaced by the placeholders NOSD / NORD; a missing subject or
 * sender is simply left out of the identifier.
 *
 * @param msg the message to identify
 * @return the parts that are present, joined with underscores
 * @throws MessagingException if the message headers cannot be read
 */
private String generateUniqueId(Message msg) throws MessagingException{
    String sentDate = "NOSD";
    String receivedDate = "NORD";

    if(msg.getSentDate() != null){
        sentDate = String.valueOf(msg.getSentDate().toInstant().toEpochMilli());
    }
    if(msg.getReceivedDate() != null){
        receivedDate = String.valueOf(msg.getReceivedDate().toInstant().toEpochMilli());
    }

    // getFrom() returns null when the From header is absent and may return an empty array
    Address[] from = msg.getFrom();
    String sender = (from == null || from.length == 0) ? null : getAddress(from[0]);

    return joinStrings(msg.getSubject(), sender, sentDate, receivedDate);
}
/**
 * Concatenates the non-null arguments, separated by underscores.
 *
 * @param strings the values to join; null entries are skipped
 * @return the joined string, which is empty if there is nothing to join
 */
private String joinStrings(String... strings){
    StringBuilder joined = new StringBuilder();
    boolean first = true;

    for(String value : strings){
        if(value == null){
            continue;
        }
        if(!first){
            joined.append("_");
        }
        joined.append(value);
        first = false;
    }

    return joined.toString();
}
/**
 * Extracts the content of a message into the JCas and records the sender, subject,
 * recipients, attachment names, saved attachment locations and all message headers
 * as metadata. If configured, the message is then flagged for deletion.
 *
 * @param jCas the JCas to populate
 * @param msg the message to process
 * @param attachments the files to which this message's attachments have been saved (may be empty)
 * @throws MessagingException if the message cannot be read
 * @throws IOException if the message content cannot be read or extracted
 */
private void processMessage(JCas jCas, Message msg, List<File> attachments) throws MessagingException, IOException{
    String content = getContent(msg);
    String subject = msg.getSubject();

    // getFrom() returns null when the From header is absent and may return an empty array
    Address[] from = msg.getFrom();
    String sender = (from == null || from.length == 0) ? null : getAddress(from[0]);

    try(InputStream is = IOUtils.toInputStream(content, Charset.defaultCharset())){
        extractor.processStream(is, "mailto:"+sender + "#" + subject, jCas);
    }

    if(sender != null){
        addMetadata(jCas, "sender", sender);
    }
    addMetadata(jCas, "subject", subject);
    addAddressesMetadata(msg.getRecipients(RecipientType.TO), jCas, "toRecipient");
    addAddressesMetadata(msg.getRecipients(RecipientType.CC), jCas, "ccRecipient");

    // Take the names from the message rather than from the saved files, so that
    // attachments are listed in the metadata whether or not they were saved
    for(String attachment : getAttachments(msg)){
        addMetadata(jCas, "attachment", attachment);
    }

    for(File attachment : attachments){
        addMetadata(jCas, "attachmentSaveLocation", attachment.getAbsolutePath());
    }

    @SuppressWarnings("unchecked")
    Enumeration<Header> headers = msg.getAllHeaders();
    while(headers.hasMoreElements()){
        Header header = headers.nextElement();
        addMetadata(jCas, header.getName(), header.getValue());
    }

    //Delete message?
    if(deleteEmailsAfterProcessing){
        boolean flaggedForDeletion = false;
        try {
            msg.setFlag(Flags.Flag.DELETED, true);
            flaggedForDeletion = true;
        } catch (MessagingException me) {
            getMonitor().error("Unable to delete message", me);
        }

        // Only forget the message once it has actually been flagged for deletion.
        // If it stays on the server, its ID must be kept so it is not queued again.
        if(flaggedForDeletion){
            try{
                alreadyProcessed.remove(generateUniqueId(msg));	//We can save memory by removing messages we've deleted on the server
            } catch (MessagingException me) {
                getMonitor().warn("Unable to re-generate unique ID for message to remove from memory", me);
            }
        }
    }
}
/**
 * Extracts the content of a saved attachment file into the JCas, then removes the
 * file from disk if attachments are configured to be deleted after processing.
 *
 * A failure to delete the file is logged rather than thrown.
 *
 * @param jCas the JCas to populate
 * @param attachment the saved attachment file
 * @throws IOException if the file cannot be opened or its content cannot be extracted
 */
private void processAttachment(JCas jCas, File attachment) throws IOException{
    try(InputStream stream = new FileInputStream(attachment)){
        extractor.processStream(stream, attachment.getAbsolutePath(), jCas);
    }

    if(!deleteAttachmentsAfterProcessing){
        return;
    }

    try{
        Files.delete(attachment.toPath());
    }catch(IOException deleteFailure){
        getMonitor().error("Unable to delete attachment", deleteFailure);
    }
}
/**
 * Returns the body of a message as a string.
 *
 * For a multipart message this is the content of the first part that is not an
 * attachment, descending into nested multiparts (such as a multipart/alternative
 * body inside a multipart/mixed message). For any other message it is the trimmed
 * string form of the content.
 *
 * @param msg the message to read
 * @return the body content, or an empty string if a multipart message has no body part
 * @throws IOException if the content cannot be read
 * @throws MessagingException if the message cannot be accessed
 */
private String getContent(Message msg) throws IOException, MessagingException{
    // Fetch the content once; each getContent() call may re-read the message
    Object messageContentObject = msg.getContent();

    if(messageContentObject instanceof Multipart){
        String body = getBodyText((Multipart) messageContentObject);
        return body == null ? "" : body;
    }

    return messageContentObject.toString().trim();
}

/**
 * Finds the content of the first non-attachment part of a multipart, searching nested multiparts.
 *
 * @return the content as a string, or null if no body part was found
 */
private String getBodyText(Multipart multipart) throws IOException, MessagingException{
    for(int i = 0; i < multipart.getCount(); i++) {
        Part part = multipart.getBodyPart(i);

        // Parts marked as attachments, or carrying a file name, are not body text
        if(Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition()) || !StringUtils.isBlank(part.getFileName())){
            continue;
        }

        Object partContent = part.getContent();
        if(partContent instanceof Multipart){
            // Calling toString() on a nested multipart would give the object identity, not text
            String nested = getBodyText((Multipart) partContent);
            if(nested != null){
                return nested;
            }
        }else{
            return partContent.toString();
        }
    }

    return null;
}
/**
 * Reports whether a message has at least one attachment.
 *
 * This uses the same test as the attachment listing, so a message is only reported
 * as having attachments if at least one part is marked as an attachment or carries
 * a file name.
 *
 * @param msg the message to inspect
 * @return true if the message has one or more attachments, false otherwise
 * @throws MessagingException if the message cannot be accessed
 * @throws IOException if the message content cannot be read
 */
private Boolean hasAttachments(Message msg) throws MessagingException, IOException{
    return !getAttachments(msg).isEmpty();
}
/**
 * Lists the file names of the attachments of a message.
 *
 * A part of a multipart message counts as an attachment if it is marked with the
 * attachment disposition or carries a non-blank file name. Messages that are not
 * multipart have no attachments.
 *
 * @param msg the message to inspect
 * @return the attachment file names as reported by each part (may be empty)
 * @throws MessagingException if the message cannot be accessed
 * @throws IOException if the message content cannot be read
 */
private List<String> getAttachments(Message msg) throws MessagingException, IOException{
    List<String> attachments = new ArrayList<>();

    // Fetch the content once; each getContent() call may re-read the message
    Object messageContentObject = msg.getContent();
    if (!(messageContentObject instanceof Multipart)) {
        return attachments;
    }

    Multipart multipart = (Multipart) messageContentObject;
    for(int i = 0; i < multipart.getCount(); i++) {
        Part part = multipart.getBodyPart(i);
        String fileName = part.getFileName();

        if(!Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition()) && StringUtils.isBlank(fileName)){
            // Not an attachment
            continue;
        }

        attachments.add(fileName);
    }

    return attachments;
}
/**
 * Saves the attachments of a message to the attachment folder.
 *
 * A part of a multipart message counts as an attachment if it is marked with the
 * attachment disposition or carries a non-blank file name. The file name supplied by
 * the message is reduced to its final path component before use, and a numeric suffix
 * is appended if a file of that name already exists.
 *
 * @param msg the message whose attachments should be saved
 * @return the files that were written (may be empty)
 * @throws MessagingException if the message cannot be accessed
 * @throws IOException if the content cannot be read or a file cannot be written
 */
private List<File> saveAttachments(Message msg) throws MessagingException, IOException{
    List<File> attachmentLocations = new ArrayList<>();

    // Fetch the content once; each getContent() call may re-read the message
    Object messageContentObject = msg.getContent();
    if (!(messageContentObject instanceof Multipart)) {
        return attachmentLocations;
    }

    Multipart multipart = (Multipart) messageContentObject;
    for(int i = 0; i < multipart.getCount(); i++) {
        Part part = multipart.getBodyPart(i);
        String suppliedName = part.getFileName();

        if(!Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition()) && StringUtils.isBlank(suppliedName)){
            // Not an attachment
            continue;
        }

        String fileName = toSafeFileName(suppliedName);

        File destFile = new File(folder, fileName);
        int append = 0;
        while(destFile.exists()){
            append++;
            destFile = new File(folder, fileName + "." + append);
        }

        if(append != 0){
            getMonitor().info("File with the same name already exists in {} - attachment will be saved as {}", folder, destFile.getName());
        }

        // Save the file to disk
        writeFileToDisk(destFile, part.getInputStream());
        attachmentLocations.add(destFile);
    }

    return attachmentLocations;
}

/**
 * Reduces a file name taken from a message to a plain name that cannot point outside the attachment folder.
 *
 * @param fileName the name supplied by the message, which may be null or contain path separators
 * @return the final path component, or "attachment" if no usable name remains
 */
private String toSafeFileName(String fileName){
    if(fileName == null){
        return "attachment";
    }

    // The name is chosen by the sender, so strip any directory part (either separator style)
    int lastSeparator = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
    String name = fileName.substring(lastSeparator + 1).trim();

    if(name.isEmpty() || ".".equals(name) || "..".equals(name)){
        return "attachment";
    }

    return name;
}
/**
 * Copies everything from an input stream into a file, then closes both the stream and the file.
 *
 * Failures while closing are logged at debug level and otherwise ignored.
 *
 * @param destFile the file to write to
 * @param inputStream the stream to copy from
 * @throws IOException if the file cannot be opened or the copy fails
 */
private void writeFileToDisk(File destFile, InputStream inputStream) throws IOException{
    FileOutputStream sink = null;

    try{
        sink = new FileOutputStream(destFile);

        byte[] chunk = new byte[4096];
        for(int count = inputStream.read(chunk); count != -1; count = inputStream.read(chunk)){
            sink.write(chunk, 0, count);
        }
    }catch(IOException ex){
        throw new IOException("Unable to save attachment", ex);
    }finally{
        closeAndLog(inputStream, "Unable to close InputStream, or already closed");
        closeAndLog(sink, "Unable to close FileOutputStream, or already closed");
    }
}

/**
 * Closes a resource if it is not null, logging any failure at debug level with the given message.
 */
private void closeAndLog(AutoCloseable resource, String failureMessage){
    if(resource == null){
        return;
    }

    try{
        resource.close();
    }catch(Exception e){
        getMonitor().debug(failureMessage, e);
    }
}
/**
 * Replaces unusable configuration values with defaults: a missing attachment folder
 * becomes the directory named by the user.dir system property, and an unrecognised
 * process mode becomes "both".
 */
private void validateParams(){
    boolean folderMissing = folder == null || folder.isEmpty();
    if(folderMissing){
        folder = System.getProperty("user.dir");
    }

    boolean recognisedMode = !process.isEmpty()
            && (BOTH.equalsIgnoreCase(process)
                || CONTENT.equalsIgnoreCase(process)
                || ATTACHMENTS.equalsIgnoreCase(process));
    if(!recognisedMode){
        process = BOTH;
    }
}
/**
 * Creates a Metadata annotation holding the given key and value and adds it via the support object.
 *
 * @param jCas the JCas in which the annotation is created
 * @param key the metadata key
 * @param value the metadata value
 */
private void addMetadata(JCas jCas, String key, String value){
    Metadata entry = new Metadata(jCas);

    entry.setKey(key);
    entry.setValue(value);

    getSupport().add(entry);
}
/**
 * Expunges the inbox folder. If the folder does not support expunging, it is closed
 * with the expunge flag set and then reopened instead.
 *
 * @throws MessagingException if expunging, closing or reopening the folder fails
 */
private void tryExpunge() throws MessagingException{
    try{
        inboxFolder.expunge();
        return;
    }catch(MethodNotSupportedException unsupported){
        getMonitor().debug("Expunge method not supported (e.g. POP3) - closing and reopening folder", unsupported);
    }

    // Fallback for folders without expunge support
    inboxFolder.close(true);
    reopenConnection();
}
/**
 * Opens the inbox folder in read/write mode unless it is already open.
 *
 * @throws MessagingException if the folder cannot be opened
 */
private void reopenConnection() throws MessagingException{
    if(inboxFolder.isOpen()){
        return;
    }

    inboxFolder.open(Folder.READ_WRITE);
}
/**
 * Returns a display string for an address: the personal name if there is one,
 * otherwise the e-mail address.
 *
 * Addresses that are not internet addresses are returned in their string form.
 *
 * @param addr the address to describe, which may be null
 * @return the personal name, the address, or null if addr is null
 */
private String getAddress(Address addr){
    if(addr == null){
        return null;
    }

    // Not every Address is an InternetAddress, so check before casting
    if(!(addr instanceof InternetAddress)){
        return addr.toString();
    }

    InternetAddress internetAddress = (InternetAddress) addr;
    String personal = internetAddress.getPersonal();

    return personal != null ? personal : internetAddress.getAddress();
}
/**
 * Adds one metadata entry per address, all under the same key.
 *
 * @param addresses the addresses to record; nothing is added if this is null
 * @param jCas the JCas to add the metadata to
 * @param key the metadata key to use for every address
 */
private void addAddressesMetadata(Address[] addresses, JCas jCas, String key){
    if(addresses == null){
        return;
    }

    for(int i = 0; i < addresses.length; i++){
        addMetadata(jCas, key, getAddress(addresses[i]));
    }
}
}