/**
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2014-2015 Emmanuel Keller / Jaeksoft
 *
 * http://www.open-search-server.com
 *
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.mailbox.crawler;

import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Date;
import java.util.List;

import javax.activation.DataSource;
import javax.mail.Address;
import javax.mail.FetchProfile;
import javax.mail.Flags.Flag;
import javax.mail.Folder;
import javax.mail.Message;
import javax.mail.Message.RecipientType;
import javax.mail.MessagingException;
import javax.mail.Store;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;

import org.apache.commons.mail.util.MimeMessageParser;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.FieldMapContext;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxCrawlItem;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxCrawlThread;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxFieldEnum;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxProtocolEnum;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.ParserSelector;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.StringUtils;

/**
 * Base class for mailbox crawlers. It walks the folder tree of a mail
 * {@link Store}, starting at the default folder, and converts every message
 * it finds into index documents handed over to the owning
 * {@link MailboxCrawlThread}.
 * <p>
 * Subclasses provide the {@link Store}, the way to connect to it and the way
 * a message identifier is computed.
 */
public abstract class MailboxAbstractCrawler {

	protected ParserSelector parserSelector;

	protected MailboxCrawlThread thread;

	protected MailboxProtocolEnum protocol;

	protected MailboxCrawlItem item;

	/**
	 * Stores the crawl context on this instance and resolves the parser
	 * selector used for attachments.
	 * 
	 * @param thread
	 *            the crawl thread; may be null, in which case no parser
	 *            selector is available
	 * @param protocol
	 *            the mailbox protocol
	 * @param item
	 *            the crawl item describing the mailbox
	 */
	public void init(MailboxCrawlThread thread, MailboxProtocolEnum protocol,
			MailboxCrawlItem item) {
		this.thread = thread;
		this.protocol = protocol;
		this.item = item;
		FieldMapContext fieldMapContext = thread == null ? null : thread
				.getFieldMapContext();
		this.parserSelector = fieldMapContext == null ? null
				: fieldMapContext.parserSelector;
	}

	/**
	 * @return the mail store to crawl
	 * @throws MessagingException
	 *             if the store cannot be obtained
	 */
	protected abstract Store getStore() throws MessagingException;

	/**
	 * Opens the connection to the given store.
	 * 
	 * @param store
	 *            the store returned by {@link #getStore()}
	 * @throws MessagingException
	 *             if the connection fails
	 */
	protected abstract void connect(Store store) throws MessagingException;

	/**
	 * Connects to the store and crawls every folder reachable from its
	 * default folder. The store is closed when the crawl ends, whether it
	 * succeeded or not.
	 * 
	 * @throws MessagingException
	 *             on any mail API failure
	 * @throws IOException
	 *             on I/O failure
	 * @throws SearchLibException
	 *             on indexing failure
	 */
	public void read() throws MessagingException, IOException,
			SearchLibException {
		Store store = null;
		try {
			store = getStore();
			connect(store);
			readFolder(store.getDefaultFolder());
		} finally {
			if (store != null)
				store.close();
		}
	}

	/**
	 * Connects to the store and builds a textual report: an empty first
	 * line, one line per folder holding messages (name and message count),
	 * and a final "OK" line.
	 * 
	 * @return the report text
	 * @throws MessagingException
	 *             on any mail API failure
	 * @throws IOException
	 *             on I/O failure
	 * @throws SearchLibException
	 *             declared for symmetry with {@link #read()}
	 */
	public String check() throws MessagingException, IOException,
			SearchLibException {
		Store store = null;
		StringWriter sw = null;
		PrintWriter pw = null;
		try {
			sw = new StringWriter();
			pw = new PrintWriter(sw);
			pw.println();
			store = getStore();
			connect(store);
			checkFolder(store.getDefaultFolder(), pw);
			pw.println("OK");
			return sw.toString();
		} finally {
			if (store != null)
				store.close();
			IOUtils.close(pw, sw);
		}
	}

	/**
	 * Opens the folder read-only and indexes its messages batch by batch,
	 * until all messages have been visited or the thread is aborted. The
	 * folder is always closed (without expunging) before returning.
	 * <p>
	 * Messages with an empty identifier are skipped silently; messages
	 * already indexed are counted as ignored; a failure on a single message
	 * is logged and counted as an error without stopping the crawl.
	 * 
	 * @param folder
	 *            a folder that holds messages
	 */
	private void readMessagesFolder(Folder folder) throws MessagingException,
			IOException, SearchLibException {
		folder.open(Folder.READ_ONLY);
		try {
			final String folderFullName = folder.getFullName();
			final int max = folder.getMessageCount();
			// A buffer size of zero would produce empty batches forever (the
			// cursor never advances) and a negative one an invalid message
			// range, so at least one message is always requested.
			final int buffer = Math.max(1, item.getBufferSize());
			// The fetch profile is identical for every batch.
			final FetchProfile fp = new FetchProfile();
			fp.add(FetchProfile.Item.ENVELOPE);
			int i = 0;
			while (i < max && !thread.isAborted()) {
				thread.setStatusInfo(CrawlStatus.CRAWL);
				// Equivalent to min(i + buffer, max) without int overflow;
				// max - i is positive because of the loop condition.
				final int end = max - i > buffer ? i + buffer : max;
				// Message numbers are 1-based while i counts from 0.
				Message[] messages = folder.getMessages(i + 1, end);
				folder.fetch(messages, fp);
				for (Message message : messages) {
					if (thread.isAborted())
						break;
					i++;
					String messageId = getMessageId(folder, message);
					if (StringUtils.isEmpty(messageId))
						continue;
					if (thread.isAlreadyIndexed(messageId)) {
						thread.incIgnored();
						continue;
					}
					IndexDocument crawlIndexDocument = new IndexDocument(
							item.getLang());
					IndexDocument parserIndexDocument = new IndexDocument(
							item.getLang());
					crawlIndexDocument.addString(
							MailboxFieldEnum.folder.name(), folderFullName);
					try {
						readMessage(crawlIndexDocument, parserIndexDocument,
								folder, message, messageId);
						thread.addDocument(crawlIndexDocument,
								parserIndexDocument);
					} catch (Exception e) {
						// One broken message must not stop the whole crawl.
						Logging.warn(e);
						thread.incError();
					}
				}
			}
		} finally {
			folder.close(false);
		}
	}

	/**
	 * Computes the identifier under which a message is indexed.
	 * 
	 * @param folder
	 *            the folder containing the message
	 * @param message
	 *            the message
	 * @return the identifier; an empty value makes the crawler skip the
	 *         message
	 * @throws MessagingException
	 *             on any mail API failure
	 */
	protected abstract String getMessageId(Folder folder, Message message)
			throws MessagingException;

	/**
	 * Crawls a folder: its own messages first (if it holds messages), then
	 * its sub-folders (if it holds folders). A null folder is ignored.
	 * 
	 * @param folder
	 *            the folder to crawl, may be null
	 */
	protected void readFolder(Folder folder) throws MessagingException,
			IOException, SearchLibException {
		if (folder == null)
			return;
		if ((folder.getType() & Folder.HOLDS_MESSAGES) != 0)
			readMessagesFolder(folder);
		if ((folder.getType() & Folder.HOLDS_FOLDERS) != 0)
			readHoldsFolder(folder);
	}

	/**
	 * Writes the name and message count of the folder (if it holds
	 * messages) to the report, then recurses into its sub-folders (if it
	 * holds folders). A null folder is ignored.
	 * 
	 * @param folder
	 *            the folder to inspect, may be null
	 * @param pw
	 *            the writer receiving the report lines
	 */
	protected void checkFolder(Folder folder, PrintWriter pw)
			throws MessagingException, IOException, SearchLibException {
		if (folder == null)
			return;
		if ((folder.getType() & Folder.HOLDS_MESSAGES) != 0) {
			folder.open(Folder.READ_ONLY);
			try {
				pw.print("Folder ");
				pw.print(folder.getName());
				pw.print(": ");
				pw.print(folder.getMessageCount());
				pw.println(" msgs(s).");
			} finally {
				folder.close(false);
			}
		}
		if ((folder.getType() & Folder.HOLDS_FOLDERS) != 0) {
			Folder[] folders = folder.list();
			if (folders != null)
				for (Folder f : folders)
					checkFolder(f, pw);
		}
	}

	/**
	 * Crawls every direct sub-folder of the given folder.
	 * 
	 * @param folder
	 *            a folder that holds other folders
	 */
	private void readHoldsFolder(Folder folder) throws MessagingException,
			IOException, SearchLibException {
		Folder[] folders = folder.list();
		if (folders == null)
			return;
		for (Folder f : folders)
			readFolder(f);
	}

	/**
	 * Adds the e-mail address and the personal name of each
	 * {@link InternetAddress} to the document. Null entries and addresses of
	 * any other type are skipped.
	 * 
	 * @param document
	 *            the document to populate
	 * @param addresses
	 *            the addresses, may be null
	 * @param fieldEmail
	 *            the field receiving the e-mail address
	 * @param fieldPersonal
	 *            the field receiving the personal name
	 */
	private void putAddresses(IndexDocument document, Address[] addresses,
			String fieldEmail, String fieldPersonal) {
		if (addresses == null)
			return;
		for (Address address : addresses) {
			if (address == null)
				continue;
			if (!(address instanceof InternetAddress))
				continue;
			InternetAddress ia = (InternetAddress) address;
			document.addString(fieldEmail, ia.getAddress());
			document.addString(fieldPersonal, ia.getPersonal());
		}
	}

	/**
	 * Adds the label to the flags field when the flag is set on the message.
	 */
	private void putFlag(IndexDocument document, Message message, Flag flag,
			String label) throws MessagingException {
		if (message.isSet(flag))
			document.addString(MailboxFieldEnum.flags.name(), label);
	}

	/**
	 * Records the name and content type of an attachment and, when a parser
	 * selector is available, parses its content and adds the parsed
	 * documents to the attachment content field. The attachment stream is
	 * closed once the parser results have been consumed.
	 */
	private void readAttachment(IndexDocument crawlIndexDocument,
			DataSource dataSource) throws Exception {
		crawlIndexDocument.addString(
				MailboxFieldEnum.email_attachment_name.name(),
				dataSource.getName());
		crawlIndexDocument.addString(
				MailboxFieldEnum.email_attachment_type.name(),
				dataSource.getContentType());
		if (parserSelector == null)
			return;
		InputStream inputStream = dataSource.getInputStream();
		try {
			Parser attachParser = parserSelector.parseStream(null,
					dataSource.getName(), dataSource.getContentType(), null,
					inputStream, null, null, null);
			if (attachParser == null)
				return;
			List<ParserResultItem> parserResults = attachParser
					.getParserResults();
			if (parserResults == null)
				return;
			for (ParserResultItem parserResult : parserResults)
				crawlIndexDocument.addFieldIndexDocument(
						MailboxFieldEnum.email_attachment_content.name(),
						parserResult.getParserDocument());
		} finally {
			if (inputStream != null) {
				try {
					inputStream.close();
				} catch (IOException e) {
					// A close failure must neither hide a parsing error nor
					// discard content that was parsed successfully.
					Logging.warn(e);
				}
			}
		}
	}

	/**
	 * Fills the crawl document with the content of a message: identifiers,
	 * subject, sender/reply-to/recipient addresses, sent and received dates,
	 * flags and, for a {@link MimeMessage}, the HTML and plain text bodies
	 * plus the attachments.
	 * 
	 * @param crawlIndexDocument
	 *            the document to populate
	 * @param parserIndexDocument
	 *            not used by this implementation
	 * @param folder
	 *            not used by this implementation
	 * @param message
	 *            the message to read
	 * @param id
	 *            the identifier stored in the message_id field
	 * @throws Exception
	 *             on any mail API, MIME parsing or attachment parsing failure
	 */
	public final void readMessage(IndexDocument crawlIndexDocument,
			IndexDocument parserIndexDocument, Folder folder, Message message,
			String id) throws Exception {
		crawlIndexDocument.addString(MailboxFieldEnum.message_id.name(), id);
		crawlIndexDocument.addString(MailboxFieldEnum.message_number.name(),
				Integer.toString(message.getMessageNumber()));
		if (message instanceof MimeMessage)
			crawlIndexDocument.addString(MailboxFieldEnum.content_id.name(),
					((MimeMessage) message).getContentID());
		crawlIndexDocument.addString(MailboxFieldEnum.subject.name(),
				message.getSubject());

		putAddresses(crawlIndexDocument, message.getFrom(),
				MailboxFieldEnum.from_address.name(),
				MailboxFieldEnum.from_personal.name());
		putAddresses(crawlIndexDocument, message.getReplyTo(),
				MailboxFieldEnum.reply_to_address.name(),
				MailboxFieldEnum.reply_to_personal.name());
		putAddresses(crawlIndexDocument,
				message.getRecipients(RecipientType.TO),
				MailboxFieldEnum.recipient_to_address.name(),
				MailboxFieldEnum.recipient_to_personal.name());
		putAddresses(crawlIndexDocument,
				message.getRecipients(RecipientType.CC),
				MailboxFieldEnum.recipient_cc_address.name(),
				MailboxFieldEnum.recipient_cc_personal.name());
		putAddresses(crawlIndexDocument,
				message.getRecipients(RecipientType.BCC),
				MailboxFieldEnum.recipient_bcc_address.name(),
				MailboxFieldEnum.recipient_bcc_personal.name());

		Date dt = message.getSentDate();
		if (dt != null)
			crawlIndexDocument.addString(MailboxFieldEnum.send_date.name(),
					dt.toString());
		dt = message.getReceivedDate();
		if (dt != null)
			crawlIndexDocument.addString(
					MailboxFieldEnum.received_date.name(), dt.toString());

		putFlag(crawlIndexDocument, message, Flag.ANSWERED, "ANSWERED");
		putFlag(crawlIndexDocument, message, Flag.DELETED, "DELETED");
		putFlag(crawlIndexDocument, message, Flag.DRAFT, "DRAFT");
		putFlag(crawlIndexDocument, message, Flag.FLAGGED, "FLAGGED");
		putFlag(crawlIndexDocument, message, Flag.SEEN, "SEEN");

		if (message instanceof MimeMessage) {
			MimeMessageParser mimeMessageParser = new MimeMessageParser(
					(MimeMessage) message).parse();
			crawlIndexDocument.addString(MailboxFieldEnum.html_content.name(),
					mimeMessageParser.getHtmlContent());
			crawlIndexDocument.addString(
					MailboxFieldEnum.plain_content.name(),
					mimeMessageParser.getPlainContent());
			for (DataSource dataSource : mimeMessageParser
					.getAttachmentList())
				readAttachment(crawlIndexDocument, dataSource);
		}
	}
}