package org.myrobotlab.service;

import java.io.IOException;
import java.text.ParseException;
import java.util.Date;
import java.util.Enumeration;
import java.util.Locale;
import java.util.Properties;
import java.util.UUID;

import javax.mail.Address;
import javax.mail.BodyPart;
import javax.mail.Folder;
import javax.mail.Header;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.NoSuchProviderException;
import javax.mail.Part;
import javax.mail.Session;
import javax.mail.Store;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MailDateFormat;
import javax.mail.internet.MimeMultipart;

import org.myrobotlab.document.Document;
import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.document.transformer.ConnectorConfig;
import org.myrobotlab.framework.ServiceType;

/**
 * ImapEmailConnector - This connector can crawl the folders on an IMAP email
 * server. You can provide the user/pass/email server hostname. It publishes
 * documents that represent the email messages that were crawled.
 */
public class ImapEmailConnector extends AbstractConnector {

  private static final long serialVersionUID = 1L;

  /** Cleaned (see {@link #cleanFieldName(String)}) field name treated as the message id. */
  private static final String MESSAGE_ID_HEADER = "message_id";

  /**
   * Raw header name carrying the message id. It is matched separately because
   * {@link #cleanFieldName(String)} only rewrites spaces, so this header cleans
   * to "message-id" and never equals {@link #MESSAGE_ID_HEADER}.
   */
  private static final String RAW_MESSAGE_ID_HEADER = "Message-ID";

  private String emailServer;
  private String username;
  private String password;
  private String folderName = "INBOX";
  private String docIdPrefix = "email_";

  /** The store opened by {@link #connect()}; null while disconnected. */
  private transient Store store;

  public ImapEmailConnector(String name) {
    super(name);
  }

  @Override
  public void setConfig(ConnectorConfig config) {
    // TODO Auto-generated method stub
    log.info("Set Config not yet implemented");
  }

  /**
   * Connects to the configured server, feeds one document per message found in
   * the configured folder and in its direct subfolders, then disconnects.
   * The connection is closed on every exit path once it has been opened.
   */
  public void startCrawling() {
    log.info("Sarting IMAP Email connector.");
    // connect to the email store
    Store connected = connect();
    if (connected == null) {
      log.warn("Email Store was null. Check credentials and server name");
      return;
    }
    log.info("connected to store");

    int count = 0;
    try {
      // Get INBOX folder typically.
      Folder folder;
      try {
        folder = connected.getFolder(getFolderName());
      } catch (MessagingException e) {
        log.warn("Folder {} not found.", getFolderName(), e);
        return;
      }
      folder = openFolder(folder);
      if (folder == null) {
        // openFolder already logged the cause; nothing to crawl.
        log.warn("Folder {} could not be opened.", getFolderName());
        return;
      }

      count = processFolder(folder);
      try {
        // process all sub folders.
        // TODO: check the recursion here and do it properly.
        for (Folder child : folder.list()) {
          Folder opened = openFolder(child);
          if (opened == null) {
            // Could not be opened (already logged); skip it rather than fail
            // the whole crawl.
            continue;
          }
          count = count + processFolder(opened);
        }
      } catch (MessagingException e) {
        log.warn("Message Exception processing subfolders : {}", e.getLocalizedMessage(), e);
      }
    } finally {
      disconnect();
    }
    log.info("Fetched " + count + " messages");
  }

  /**
   * Opens the given folder read-only.
   *
   * @param folder
   *          the folder to open
   * @return the same folder once opened, or null if opening it failed
   */
  private Folder openFolder(Folder folder) {
    try {
      // Read only! lets not accidentally blow away someones email.
      folder.open(Folder.READ_ONLY);
    } catch (MessagingException e) {
      log.info("Message Exception {}", e.getLocalizedMessage(), e);
      return null;
    }
    return folder;
  }

  /**
   * Converts every message of an open folder into a document and feeds it.
   * A message that fails to convert is logged and skipped.
   *
   * @param folder
   *          an already opened folder
   * @return the number of documents that were fed from this folder
   */
  private int processFolder(Folder folder) {
    log.info("Processing folder {}", folder.getName());
    try {
      log.info("Folder has {} docs.", folder.getMessageCount());
    } catch (MessagingException e) {
      log.warn("Messaging Exception {}", e.getLocalizedMessage(), e);
      // TODO: bomb out here?
      return 0;
    }

    // Count only what was actually fed; the folder's message count is logged
    // above and must not be added on top of it.
    int processed = 0;
    try {
      for (Message m : folder.getMessages()) {
        try {
          Document doc = processMessage(m);
          doc.setField("folder", folder.getName());
          feed(doc);
          processed++;
        } catch (MessagingException | IOException e) {
          log.warn("process message failed. continuing to next message. {} ", e.getLocalizedMessage(), e);
        }
      }
    } catch (MessagingException e) {
      log.info("Messaging Exception getMessages {}", e.getLocalizedMessage(), e);
    }
    return processed;
  }

  /**
   * Builds a document from a single message: every header is copied to a field
   * named after the cleaned header name, then "to", "sent_date",
   * "received_date", "reply_to", "subject", the body fields and "size" are
   * populated.
   *
   * @param m
   *          the message to convert
   * @return the populated document
   * @throws MessagingException
   *           if the message cannot be read
   * @throws IOException
   *           if the message content cannot be fetched
   */
  private Document processMessage(Message m) throws MessagingException, IOException {
    // create a unique(ish) doc id until we discover the true message id.
    String docId = docIdPrefix + UUID.randomUUID().toString();
    Document doc = new Document(docId);

    // walk every header and copy them to fields...
    Enumeration<Header> headers = m.getAllHeaders();
    while (headers.hasMoreElements()) {
      Header header = headers.nextElement();
      String rawName = header.getName();
      String fieldName = cleanFieldName(rawName);
      boolean isMessageId = MESSAGE_ID_HEADER.equals(fieldName) || RAW_MESSAGE_ID_HEADER.equalsIgnoreCase(rawName.trim());
      if (isMessageId) {
        // if we get a message id. use it.
        docId = docIdPrefix + header.getValue();
        doc.setId(docId);
      }
      doc.addToField(fieldName, header.getValue());
    }

    // Specially handle the TO field as this is multivalued.
    // TODO: this might be much faster to call directly:
    // InternetAddress.parseHeader(toHeader, this.strict) or
    // m.getAllRecipients().
    // NOTE(review): this parses the toString() of the whole field value; if the
    // field holds a list, its string form may not be a clean address list -
    // confirm against the Document API.
    if (doc.hasField("to")) {
      // if the to field was found, we are going to override it here.
      Address[] recipients = InternetAddress.parse(doc.getField("to").toString());
      if (recipients != null) {
        doc.removeField("to");
        for (Address a : recipients) {
          doc.addToField("to", a.toString());
        }
      } else {
        // this shouldn't happen, right?
        doc.setField("to", "unknown");
      }
    } else {
      // this shouldn't happen?
      doc.setField("to", "unknown");
    }

    // TODO: make it so we don't call tostring here.
    if (!doc.hasField("date")) {
      // No Date header was copied; fall back to what the message reports and
      // only set the field when there is a value.
      Date sentdate = m.getSentDate();
      if (sentdate != null) {
        doc.setField("sent_date", sentdate);
      }
    } else {
      MailDateFormat mailDateFormat = new MailDateFormat();
      try {
        // parse the string version of the field and make it a proper
        // java date object
        Date sentdate = mailDateFormat.parse(doc.getField("date").get(0).toString());
        if (sentdate != null) {
          doc.setField("sent_date", sentdate);
        }
      } catch (ParseException e) {
        log.warn("Date Parse Exception {}", e.getLocalizedMessage(), e);
      }
    }

    Date receivedDate = m.getReceivedDate();
    if (receivedDate != null) {
      doc.setField("received_date", receivedDate);
    }

    Address[] replyTo = m.getReplyTo();
    if (replyTo != null) {
      for (Address replyAddr : replyTo) {
        doc.addToField("reply_to", replyAddr.toString());
      }
    }

    String subject = m.getSubject();
    if (subject != null) {
      doc.setField("subject", subject);
    } else {
      log.debug("No subject");
    }

    // the body of the email here
    Object content = m.getContent();
    if (content instanceof String) {
      // This is already a string! ok...
      doc.addToField("text", (String) content);
    } else if (content instanceof MimeMultipart) {
      // multi-part mime docs are a pain. we'll just accumulate the
      // text from each part.
      MimeMultipart multipart = (MimeMultipart) content;
      int numParts = multipart.getCount();
      for (int i = 0; i < numParts; i++) {
        BodyPart bp = multipart.getBodyPart(i);
        try {
          // add the various metadata fields to the document for this body part.
          parseBodyPart(bp, doc);
        } catch (Exception e) {
          // One bad part must not lose the rest of the message.
          log.warn("Exception in parse body part for message {}", e.getLocalizedMessage(), e);
        }
      }
    } else {
      log.info("Unknown Type of content returned : " + content.getClass());
      doc.addToField("text", content.toString());
    }

    doc.setField("size", m.getSize());
    return doc;
  }

  /**
   * Adds the content of one body part to the document, switching on the mime
   * type so that content is only fetched when it is going to be used
   * (attachments can be large). Plain text goes to "text", html to "html",
   * calendar entries to "ics"; any multipart container is walked recursively;
   * everything else is logged and skipped.
   *
   * @param p
   *          the part to inspect
   * @param doc
   *          the document receiving the extracted fields
   * @throws Exception
   *           if the part's type or content cannot be read
   */
  public void parseBodyPart(Part p, Document doc) throws Exception {
    if (p.isMimeType("text/plain")) {
      String body = (String) p.getContent();
      doc.addToField("text", body);
    } else if (p.isMimeType("multipart/*")) {
      // Covers multipart/alternative as well as nested mixed/related
      // containers, which would otherwise be skipped as unhandled.
      Object nested = p.getContent();
      if (nested instanceof MimeMultipart) {
        MimeMultipart mmp = (MimeMultipart) nested;
        for (int i = 0; i < mmp.getCount(); i++) {
          parseBodyPart(mmp.getBodyPart(i), doc);
        }
      } else {
        log.info("Unhandled Content Type {}", p.getContentType());
      }
    } else if (p.isMimeType("text/html")) {
      String body = (String) p.getContent();
      // TODO: have the pipeline parse the html
      doc.addToField("html", body);
    } else if (p.isMimeType("application/ics")) {
      log.info("Skipping Calender entry: not supported yet.");
      Object icsEntry = p.getContent();
      // TODO: add this to the doc
      doc.addToField("ics", icsEntry.toString());
    } else {
      log.info("Unhandled Content Type {}", p.getContentType());
    }
  }

  /**
   * Normalizes a header name into a field name: trimmed, lower-cased and with
   * spaces replaced by underscores. Lower-casing uses {@link Locale#ROOT} so
   * the result does not depend on the JVM's default locale.
   *
   * @param name
   *          the raw header name
   * @return the cleaned field name
   */
  private String cleanFieldName(String name) {
    // TODO : centralize this as a util or something. (maybe move it to the
    // pipeline)
    return name.trim().toLowerCase(Locale.ROOT).replace(' ', '_');
  }

  @Override
  public void stopCrawling() {
    // TODO: this isn't implemented yet.. I'd like to move this sort of
    // stuff to the base class.
  }

  /**
   * Closes the store opened by {@link #connect()}, if any. Safe to call when
   * no connection is open.
   */
  public void disconnect() {
    if (store == null) {
      return;
    }
    try {
      store.close();
    } catch (MessagingException e) {
      log.warn("error closing store ... " + e.getMessage(), e);
    } finally {
      store = null;
    }
  }

  /**
   * Opens an "imaps" store using the configured server, username and password
   * and remembers it so that {@link #disconnect()} can close it.
   *
   * @return the connected store, or null if the provider is unavailable or the
   *         connection attempt failed
   */
  public Store connect() {
    Properties props = System.getProperties();
    props.setProperty("mail.store.protocol", "imaps");
    Session session = Session.getDefaultInstance(props, null);

    Store opened;
    try {
      opened = session.getStore("imaps");
    } catch (NoSuchProviderException e) {
      log.warn("No IMAPS support. {}", e.getLocalizedMessage(), e);
      // Without a provider there is nothing to connect with.
      return null;
    }

    try {
      opened.connect(getEmailServer(), getUsername(), getPassword());
    } catch (MessagingException e) {
      log.warn(e.getMessage(), e);
      return null;
    }

    this.store = opened;
    return opened;
  }

  public String getEmailServer() {
    return emailServer;
  }

  public void setEmailServer(String emailServer) {
    this.emailServer = emailServer;
  }

  public String getUsername() {
    return username;
  }

  public void setUsername(String username) {
    this.username = username;
  }

  public String getPassword() {
    return password;
  }

  public void setPassword(String password) {
    this.password = password;
  }

  public String getFolderName() {
    return folderName;
  }

  public void setFolderName(String folderName) {
    this.folderName = folderName;
  }

  public String getDocIdPrefix() {
    return docIdPrefix;
  }

  public void setDocIdPrefix(String docIdPrefix) {
    this.docIdPrefix = docIdPrefix;
  }

  public static void main(String[] args) throws Exception {
    ImapEmailConnector connector = (ImapEmailConnector) Runtime.start("email", "ImapEmailConnector");
    connector.setEmailServer("imap.gmail.com");
    connector.setUsername("YYY");
    connector.setPassword("XXX");
    connector.setBatchSize(1);
    Solr solr = (Solr) Runtime.start("solr", "Solr");
    // for example...
    String solrUrl = "http://phobos:8983/solr/collection1";
    solr.setSolrUrl(solrUrl);
    connector.addDocumentListener(solr);
    connector.startCrawling();
  }

  /**
   * This static method returns all the details of the class without it having
   * to be constructed. It has description, categories, dependencies, and peer
   * definitions.
   *
   * @return ServiceType - returns all the data
   *
   */
  static public ServiceType getMetaData() {
    ServiceType meta = new ServiceType(ImapEmailConnector.class.getCanonicalName());
    meta.addDescription("This connector will connect to an IMAP based email server and crawl the emails");
    meta.addCategory("data", "ingest");
    return meta;
  }
}