/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2014-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.mailbox.crawler;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Date;
import java.util.List;
import javax.activation.DataSource;
import javax.mail.Address;
import javax.mail.FetchProfile;
import javax.mail.Flags.Flag;
import javax.mail.Folder;
import javax.mail.Message;
import javax.mail.Message.RecipientType;
import javax.mail.MessagingException;
import javax.mail.Store;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import org.apache.commons.mail.util.MimeMessageParser;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.FieldMapContext;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxCrawlItem;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxCrawlThread;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxFieldEnum;
import com.jaeksoft.searchlib.crawler.mailbox.MailboxProtocolEnum;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.ParserSelector;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.StringUtils;
/**
 * Base class for mailbox crawlers.
 * <p>
 * Subclasses provide the {@link Store} and the way to connect to it, plus the
 * strategy used to compute a message identifier. This class walks the folder
 * tree starting at the store's default folder, and turns every message that
 * is not already indexed into an {@link IndexDocument} handed to the owning
 * {@link MailboxCrawlThread}.
 */
public abstract class MailboxAbstractCrawler {

    /** Parser selector used for attachments; may be {@code null}. */
    protected ParserSelector parserSelector;

    /** Crawl thread receiving documents, counters and abort requests. */
    protected MailboxCrawlThread thread;

    /** Protocol passed to {@link #init}. */
    protected MailboxProtocolEnum protocol;

    /** Crawl item passed to {@link #init} (language, buffer size, ...). */
    protected MailboxCrawlItem item;

    /**
     * Stores the crawl context and resolves the parser selector from the
     * thread's field map context.
     *
     * @param thread
     *            the owning crawl thread; when {@code null} the parser
     *            selector is set to {@code null}
     * @param protocol
     *            the mailbox protocol
     * @param item
     *            the crawl item
     */
    public void init(MailboxCrawlThread thread, MailboxProtocolEnum protocol,
            MailboxCrawlItem item) {
        this.thread = thread;
        this.protocol = protocol;
        this.item = item;
        FieldMapContext fieldMapContext = thread == null ? null : thread
                .getFieldMapContext();
        this.parserSelector = fieldMapContext == null ? null
                : fieldMapContext.parserSelector;
    }

    /**
     * @return the store to crawl
     * @throws MessagingException
     *             if the store cannot be obtained
     */
    protected abstract Store getStore() throws MessagingException;

    /**
     * Connects the given store.
     *
     * @param store
     *            the store returned by {@link #getStore()}
     * @throws MessagingException
     *             if the connection fails
     */
    protected abstract void connect(Store store) throws MessagingException;

    /**
     * Connects to the store and crawls every folder reachable from the
     * default folder. The store is closed when the crawl ends, whether it
     * succeeded or not.
     *
     * @throws MessagingException
     *             on any mail access error
     * @throws IOException
     *             on I/O error
     * @throws SearchLibException
     *             on indexing error
     */
    public void read() throws MessagingException, IOException,
            SearchLibException {
        Store store = null;
        try {
            store = getStore();
            connect(store);
            readFolder(store.getDefaultFolder());
        } finally {
            if (store != null)
                store.close();
        }
    }

    /**
     * Connects to the store and builds a textual report: an empty first
     * line, one line per message-holding folder with its message count, and
     * a final {@code OK} line.
     *
     * @return the report
     * @throws MessagingException
     *             on any mail access error
     * @throws IOException
     *             on I/O error
     * @throws SearchLibException
     *             declared for subclasses overriding {@link #checkFolder}
     */
    public String check() throws MessagingException, IOException,
            SearchLibException {
        Store store = null;
        StringWriter sw = null;
        PrintWriter pw = null;
        try {
            sw = new StringWriter();
            pw = new PrintWriter(sw);
            pw.println();
            store = getStore();
            connect(store);
            checkFolder(store.getDefaultFolder(), pw);
            pw.println("OK");
            return sw.toString();
        } finally {
            if (store != null)
                store.close();
            IOUtils.close(pw, sw);
        }
    }

    /**
     * Crawls the messages of one folder, batch by batch.
     * <p>
     * Messages with an empty identifier are skipped; messages already
     * indexed are counted as ignored. A failure on a single message is
     * logged and counted as an error without stopping the crawl.
     *
     * @param folder
     *            a folder that holds messages
     */
    private void readMessagesFolder(Folder folder) throws MessagingException,
            IOException, SearchLibException {
        folder.open(Folder.READ_ONLY);
        String folderFullName = folder.getFullName();
        try {
            int max = folder.getMessageCount();
            int i = 0;
            // A batch size of zero (or less) would fetch an empty batch
            // forever because the cursor only advances per message read.
            final int buffer = Math.max(1, item.getBufferSize());
            while (i < max && !thread.isAborted()) {
                thread.setStatusInfo(CrawlStatus.CRAWL);
                // Written as a subtraction so that a huge buffer size cannot
                // overflow the end index.
                int end = buffer >= max - i ? max : i + buffer;
                // Message numbers are 1-based, hence i + 1.
                Message[] messages = folder.getMessages(i + 1, end);
                FetchProfile fp = new FetchProfile();
                fp.add(FetchProfile.Item.ENVELOPE);
                folder.fetch(messages, fp);
                for (Message message : messages) {
                    if (thread.isAborted())
                        break;
                    i++;
                    String messageId = getMessageId(folder, message);
                    if (StringUtils.isEmpty(messageId))
                        continue;
                    if (thread.isAlreadyIndexed(messageId)) {
                        thread.incIgnored();
                        continue;
                    }
                    IndexDocument crawlIndexDocument = new IndexDocument(
                            item.getLang());
                    IndexDocument parserIndexDocument = new IndexDocument(
                            item.getLang());
                    crawlIndexDocument.addString(
                            MailboxFieldEnum.folder.name(), folderFullName);
                    try {
                        readMessage(crawlIndexDocument, parserIndexDocument,
                                folder, message, messageId);
                        thread.addDocument(crawlIndexDocument,
                                parserIndexDocument);
                    } catch (Exception e) {
                        // Best effort: one broken message must not abort
                        // the whole folder.
                        Logging.warn(e);
                        thread.incError();
                    }
                }
            }
        } finally {
            folder.close(false);
        }
    }

    /**
     * Computes the identifier under which a message is indexed.
     *
     * @param folder
     *            the folder holding the message
     * @param message
     *            the message
     * @return the identifier; an empty value makes the crawler skip the
     *         message
     * @throws MessagingException
     *             on any mail access error
     */
    protected abstract String getMessageId(Folder folder, Message message)
            throws MessagingException;

    /**
     * Crawls a folder: its messages if it holds messages, then its
     * sub-folders if it holds folders.
     *
     * @param folder
     *            the folder to crawl; {@code null} is ignored
     */
    protected void readFolder(Folder folder) throws MessagingException,
            IOException, SearchLibException {
        if (folder == null)
            return;
        if ((folder.getType() & Folder.HOLDS_MESSAGES) != 0)
            readMessagesFolder(folder);
        if ((folder.getType() & Folder.HOLDS_FOLDERS) != 0)
            readHoldsFolder(folder);
    }

    /**
     * Writes the name and message count of a folder to the report, then
     * recurses into its sub-folders.
     *
     * @param folder
     *            the folder to check; {@code null} is ignored
     * @param pw
     *            the report writer
     */
    protected void checkFolder(Folder folder, PrintWriter pw)
            throws MessagingException, IOException, SearchLibException {
        if (folder == null)
            return;
        if ((folder.getType() & Folder.HOLDS_MESSAGES) != 0) {
            folder.open(Folder.READ_ONLY);
            try {
                pw.print("Folder ");
                pw.print(folder.getName());
                pw.print(": ");
                pw.print(folder.getMessageCount());
                pw.println(" msgs(s).");
            } finally {
                folder.close(false);
            }
        }
        if ((folder.getType() & Folder.HOLDS_FOLDERS) != 0) {
            Folder[] folders = folder.list();
            if (folders != null)
                for (Folder f : folders)
                    checkFolder(f, pw);
        }
    }

    /**
     * Crawls every sub-folder of the given folder.
     *
     * @param folder
     *            a folder that holds folders
     */
    private void readHoldsFolder(Folder folder) throws MessagingException,
            IOException, SearchLibException {
        Folder[] folders = folder.list();
        if (folders == null)
            return;
        for (Folder f : folders)
            readFolder(f);
    }

    /**
     * Adds the e-mail and personal name of every {@link InternetAddress}
     * found in the array. Other address types and {@code null} entries are
     * skipped.
     *
     * @param document
     *            the target document
     * @param addresses
     *            the addresses; may be {@code null}
     * @param fieldEmail
     *            field receiving the e-mail address
     * @param fieldPersonal
     *            field receiving the personal name
     */
    private void putAddresses(IndexDocument document, Address[] addresses,
            String fieldEmail, String fieldPersonal) {
        if (addresses == null)
            return;
        for (Address address : addresses) {
            // instanceof is false for null, so null entries are skipped too
            if (!(address instanceof InternetAddress))
                continue;
            InternetAddress ia = (InternetAddress) address;
            document.addString(fieldEmail, ia.getAddress());
            document.addString(fieldPersonal, ia.getPersonal());
        }
    }

    /**
     * Fills the crawl document with the identifiers, envelope, dates, flags
     * and, for MIME messages, the body and the attachments of a message.
     *
     * @param crawlIndexDocument
     *            the document receiving every field
     * @param parserIndexDocument
     *            not written by this method (NOTE(review): attachment content
     *            goes to the crawl document; confirm this is intended)
     * @param folder
     *            the folder holding the message (not read by this method)
     * @param message
     *            the message to read
     * @param id
     *            the message identifier
     * @throws Exception
     *             on any mail access or parsing error
     */
    final public void readMessage(IndexDocument crawlIndexDocument,
            IndexDocument parserIndexDocument, Folder folder, Message message,
            String id) throws Exception {
        crawlIndexDocument.addString(MailboxFieldEnum.message_id.name(), id);
        crawlIndexDocument.addString(MailboxFieldEnum.message_number.name(),
                Integer.toString(message.getMessageNumber()));
        if (message instanceof MimeMessage)
            crawlIndexDocument.addString(MailboxFieldEnum.content_id.name(),
                    ((MimeMessage) message).getContentID());
        putEnvelope(crawlIndexDocument, message);
        putFlags(crawlIndexDocument, message);
        if (message instanceof MimeMessage)
            putMimeContent(crawlIndexDocument, (MimeMessage) message);
    }

    /** Adds the subject, the sender/recipient addresses and the dates. */
    private void putEnvelope(IndexDocument document, Message message)
            throws MessagingException {
        document.addString(MailboxFieldEnum.subject.name(),
                message.getSubject());
        putAddresses(document, message.getFrom(),
                MailboxFieldEnum.from_address.name(),
                MailboxFieldEnum.from_personal.name());
        putAddresses(document, message.getReplyTo(),
                MailboxFieldEnum.reply_to_address.name(),
                MailboxFieldEnum.reply_to_personal.name());
        putAddresses(document, message.getRecipients(RecipientType.TO),
                MailboxFieldEnum.recipient_to_address.name(),
                MailboxFieldEnum.recipient_to_personal.name());
        putAddresses(document, message.getRecipients(RecipientType.CC),
                MailboxFieldEnum.recipient_cc_address.name(),
                MailboxFieldEnum.recipient_cc_personal.name());
        putAddresses(document, message.getRecipients(RecipientType.BCC),
                MailboxFieldEnum.recipient_bcc_address.name(),
                MailboxFieldEnum.recipient_bcc_personal.name());
        Date sentDate = message.getSentDate();
        if (sentDate != null)
            document.addString(MailboxFieldEnum.send_date.name(),
                    sentDate.toString());
        Date receivedDate = message.getReceivedDate();
        if (receivedDate != null)
            document.addString(MailboxFieldEnum.received_date.name(),
                    receivedDate.toString());
    }

    /** Adds one value to the flags field per system flag set on the message. */
    private void putFlags(IndexDocument document, Message message)
            throws MessagingException {
        putFlag(document, message, Flag.ANSWERED, "ANSWERED");
        putFlag(document, message, Flag.DELETED, "DELETED");
        putFlag(document, message, Flag.DRAFT, "DRAFT");
        putFlag(document, message, Flag.FLAGGED, "FLAGGED");
        putFlag(document, message, Flag.SEEN, "SEEN");
    }

    /** Adds the label to the flags field when the flag is set. */
    private void putFlag(IndexDocument document, Message message, Flag flag,
            String label) throws MessagingException {
        if (message.isSet(flag))
            document.addString(MailboxFieldEnum.flags.name(), label);
    }

    /** Parses a MIME message and adds its HTML/plain bodies and attachments. */
    private void putMimeContent(IndexDocument document, MimeMessage message)
            throws Exception {
        MimeMessageParser mimeMessageParser = new MimeMessageParser(message)
                .parse();
        document.addString(MailboxFieldEnum.html_content.name(),
                mimeMessageParser.getHtmlContent());
        document.addString(MailboxFieldEnum.plain_content.name(),
                mimeMessageParser.getPlainContent());
        for (DataSource dataSource : mimeMessageParser.getAttachmentList())
            putAttachment(document, dataSource);
    }

    /**
     * Adds the name and type of an attachment and, when a parser selector is
     * available, the documents produced by parsing its content.
     */
    private void putAttachment(IndexDocument document, DataSource dataSource)
            throws Exception {
        String attachmentName = dataSource.getName();
        String attachmentType = dataSource.getContentType();
        document.addString(MailboxFieldEnum.email_attachment_name.name(),
                attachmentName);
        document.addString(MailboxFieldEnum.email_attachment_type.name(),
                attachmentType);
        if (parserSelector == null)
            return;
        Parser attachParser;
        InputStream attachmentStream = dataSource.getInputStream();
        try {
            attachParser = parserSelector.parseStream(null, attachmentName,
                    attachmentType, null, attachmentStream, null, null, null);
        } finally {
            // The stream is obtained here, so it is released here as well.
            closeQuietly(attachmentStream);
        }
        if (attachParser == null)
            return;
        List<ParserResultItem> parserResults = attachParser
                .getParserResults();
        if (parserResults == null)
            return;
        for (ParserResultItem parserResult : parserResults)
            document.addFieldIndexDocument(
                    MailboxFieldEnum.email_attachment_content.name(),
                    parserResult.getParserDocument());
    }

    /** Closes the stream, logging (not propagating) a close failure. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null)
            return;
        try {
            stream.close();
        } catch (IOException e) {
            Logging.warn(e);
        }
    }
}