/*
* JBoss, Home of Professional Open Source
* Copyright 2013 Red Hat Inc. and/or its affiliates and other contributors
* as indicated by the @authors tag. All rights reserved.
*/
package org.searchisko.mbox.task;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.Message;
import org.apache.james.mime4j.dom.MessageBuilder;
import org.searchisko.http.client.Client;
import org.searchisko.mbox.dto.Mail;
import org.searchisko.mbox.json.Converter;
import org.searchisko.mbox.parser.MessageParser;
import org.searchisko.mbox.util.ContentType;
import org.searchisko.mbox.util.StringUtil;
import org.searchisko.preprocessor.HTMLStripUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import static org.searchisko.http.client.Client.getConfig;
import static org.searchisko.mbox.parser.MessageParser.getMessageBuilder;
import static org.searchisko.mbox.parser.MessageParser.getMessageHeaders;
/**
* Given a single mbox archive file (can be huge) we read it line by line and every time a complete message is read we
* pass that message for processing to a parallel thread. We use a ThreadPoolExecutor with a BlockingQueue to
* throttle the number of parallel tasks in order not to exhaust all system resources.
* <p/>
* Each thread is responsible for parsing the mail message, converting it to JSON and then sending it to Searchisko
* for indexing via HttpClient. When HttpClient sends Http request it blocks the thread until the response is received
* or until timeout.
* <p/>
* Client can specify number of parallel threads. Note the `main` thread is not included in that number but it can
* be used to handle the task as well. Now, the underlying HttpClient is using connection pool which is configured
* to allow for the needed number of concurrent connections. In other words <code>numberOfThreads</code> of value `N` can
* result in up to `N+1` active parallel connections to the target <code>host</code> (in contrast, a typical HttpClient
* connection pool does not allow for more than 2 parallel connections per <code>host</code>). So be sure your target
* service is able to handle this number of incoming connections.
* <p/>
* The <code>numberOffset</code> is used if the numbering of individual messages in the public archive does not start
* from 0. This is typically the result of a Mailman admin mistake during archive rebuilding or a similar issue.
* Note this is an optional parameter, but it must be provided whenever any of the parameters following it are provided.
* In such a case the value 0 can be used.
* <p/>
* The <code>excludeMessageIdListPath</code> is used if we need to exclude specific messages from processing. This is
* an optional parameter.
*
* @author Lukáš Vlček (lvlcek@redhat.com)
*
* @see ThreadPoolExecutor
* @see ArrayBlockingQueue
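* @see <a href="https://today.java.net/pub/a/today/2008/10/23/creating-a-notifying-blocking-thread-pool-executor.html">Creating a notifying blocking thread pool executor</a>
* @see <a href="http://www.javacodegeeks.com/2011/12/using-threadpoolexecutor-to-parallelize.html">Using ThreadPoolExecutor to parallelize</a>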
*/
public class IndexMboxArchive {

    private static final Logger log = LoggerFactory.getLogger(IndexMboxArchive.class);

    /**
     * Charset used both to read the mbox file and to turn a buffered message back into bytes for the MIME parser.
     * ISO-8859-1 maps every byte to exactly one char and back, so the raw bytes of a message survive the
     * file -> String -> bytes round trip unchanged, whatever the platform default charset is. Decoding of the
     * individual message parts is left to the MIME parser.
     */
    private static final Charset RAW_CHARSET = StandardCharsets.ISO_8859_1;

    /** Shared message builder; assigned in {@link #main(String[])} before any task is submitted. */
    private static MessageBuilder mb;

    /** Shared HTTP client; assigned in {@link #main(String[])} before any task is submitted. */
    private static Client httpClient;

    /** Number of tasks that have started running (incremented by the worker threads). */
    private static AtomicLong taskCount = new AtomicLong();

    /** Number of messages submitted for processing so far; only touched by the thread reading the mbox file. */
    private static long messageCount = 0;

    /**
     * Creates a task that converts a single mail to JSON (with added metadata) and posts it to the server
     * using a blocking call.
     * <p/>
     * At least one of <code>messageString</code> and <code>message</code> must be provided. If both are given,
     * the already parsed <code>message</code> is used so that the message is not parsed twice.
     *
     * @param messageString raw message as a string. Can be null.
     * @param message parsed message. Can be null.
     * @param mailListName name of the mailing list the message belongs to
     * @param mailListCategory category of the mailing list
     * @param cnt order # of this message within the single cumulative mbox archive file
     * @return task to be handed over to the executor; failures are logged by the task and never thrown
     */
    private static Runnable prepareTask(final String messageString, final Message message, final String mailListName, final String mailListCategory, final long cnt) {
        return new Runnable() {
            @Override
            public void run() {
                long taskId = taskCount.incrementAndGet();
                log.debug("starting task [{}]", taskId);

                if (messageString == null && message == null) {
                    log.error("Missing message source. Either raw message string or parsed message must be provided. Exit task {}", taskId);
                    return;
                }

                String messageId = null;
                try {
                    // Reuse the parsed message when the caller already had to parse it (exclusion filtering).
                    Message msg = (message != null) ? message : mb.parseMessage(toStream(messageString));

                    String documentUrl = getDocumentUrl(msg, mailListName, cnt);

                    // add missing metadata
                    Map<String, String> metadata = new HashMap<>();
                    metadata.put("sys_url_view", documentUrl);
                    metadata.put("project", StringUtil.getProjectName(mailListName, mailListCategory));
                    metadata.put("mail_list_category", mailListCategory);

                    Mail mail = MessageParser.parse(msg);
                    messageId = mail.message_id(); // "sys_content_id"

                    // Content preference: text without quotes, then full text, then HTML with markup stripped.
                    String sysContent = mail.first_text_message_without_quotes();
                    if (isBlank(sysContent)) {
                        sysContent = mail.first_text_message();
                    }
                    if (isBlank(sysContent)) {
                        sysContent = HTMLStripUtil.stripHTML(mail.first_html_message());
                    }

                    metadata.put("sys_content", sysContent);
                    // All three content sources above are reported as plain text.
                    metadata.put("sys_content_content-type", ContentType.TEXT_PLAIN);
                    metadata.put("sys_description", mail.message_snippet());

                    String messageJSON = Converter.toJSON(mail, metadata);
                    Object response = httpClient.post(messageJSON, messageId);
                    log.trace("{}", response);
                } catch (Exception e) {
                    // The trailing throwable has no placeholder, so SLF4J (1.6+) logs it with its stack trace.
                    log.warn("Error processing message {} in task [{}], caused: {}", new Object[]{messageId, taskId, e.getMessage(), e});
                }
            }
        };
    }

    /**
     * Tells whether the given text is null or contains nothing but whitespace.
     */
    private static boolean isBlank(String text) {
        return text == null || text.trim().isEmpty();
    }

    /**
     * Turns a buffered raw message back into the byte stream it was read from (see {@link #RAW_CHARSET}).
     */
    private static InputStream toStream(String rawMessage) {
        return new ByteArrayInputStream(rawMessage.getBytes(RAW_CHARSET));
    }

    /**
     * Construct public URL for given message.
     * TODO: this needs to be configurable going forward.
     *
     * @param message parsed message; its date determines the year-month part of the URL
     * @param mailListName name of the mailing list, used as a path segment
     * @param cnt order # of this message within mbox file (single cumulative file)
     * @return URL of the message in the public pipermail archive
     * @throws NullPointerException if the message has no date
     */
    protected static String getDocumentUrl(final Message message, final String mailListName, final long cnt) {
        Date sentDate = message.getDate();
        if (sentDate == null) {
            // Same exception type the formatter would raise, but with a message that is useful in the task log.
            throw new NullPointerException("message #" + cnt + " of list " + mailListName + " has no date, can not construct its URL");
        }
        // our Mailman is in specific times zone, this has impact on how it constructs URLs
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MMMMM", Locale.US);
        sdf.setTimeZone(TimeZone.getTimeZone("EST"));
        return "http://lists.jboss.org/pipermail/" + mailListName + "/" + sdf.format(sentDate) + "/" + String.format("%06d", cnt) + ".html";
    }

    /**
     * Resolves the given path to a file, looking on the file system first and on the classpath second.
     *
     * @param path file system path or classpath resource name
     * @return the resolved file; if the path can be found in neither place the returned file does not exist
     */
    public static File getFile(String path) {
        // try to get from fs
        File file = new File(path);
        if (file.exists()) {
            return file;
        }
        // try to get from classpath
        URL url = IndexMboxArchive.class.getClassLoader().getResource(path);
        log.trace("file url: {}", url);
        if (url == null) {
            // Hand back the (non-existing) file so the caller fails with the path in the error, not with an NPE here.
            log.warn("file {} found neither on the file system nor on the classpath", path);
            return file;
        }
        String filesPathAndName = url.getPath();
        log.info("trying to get file {}", filesPathAndName);
        return new File(filesPathAndName);
    }

    /**
     * Opens the given path as a stream, looking on the file system first and on the classpath second.
     *
     * @param filePath file system path or classpath resource name
     * @return open stream the caller must close, or null if the path is found in neither place
     * @throws FileNotFoundException if the file exists on the file system but can not be opened
     */
    public static InputStream getInputStream(String filePath) throws FileNotFoundException {
        // try to get from fs
        File f = new File(filePath);
        if (f.exists()) {
            return new FileInputStream(f);
        }
        // try to get from classpath
        return IndexMboxArchive.class.getClassLoader().getResourceAsStream(filePath);
    }

    /**
     * Submits the message accumulated in the buffer (if any) for processing and empties the buffer.
     * <p/>
     * If there are Message-Ids to exclude, the message is parsed here, in the calling thread, and dropped when its
     * Message-Id is on the list. A message without a Message-Id header can not be on the list and is processed.
     *
     * @param executor executor the task is submitted to
     * @param excludeMessageIds Message-Ids (as keys) of messages to skip. Can be null or empty.
     * @param messageSB buffer holding the raw message; it is emptied by this call
     * @param mailListName name of the mailing list
     * @param mailListCategory category of the mailing list
     * @param offset number added to the running message count to get the message's number in the public archive
     * @throws IOException if the message can not be read while checking it against the exclusion list
     * @throws MimeException if the message can not be parsed while checking it against the exclusion list
     */
    private static void processMessageBuffer(ThreadPoolExecutor executor, Properties excludeMessageIds, StringBuilder messageSB, String mailListName, String mailListCategory, int offset) throws IOException, MimeException {
        if (messageSB.length() == 0) {
            return;
        }
        String messageString = messageSB.toString();
        messageSB.setLength(0);

        Message message = null;
        if (excludeMessageIds != null && !excludeMessageIds.isEmpty()) {
            message = mb.parseMessage(toStream(messageString));
            String messageId = extractMessageId(message);
            // Properties rejects null keys, so only look up messages that actually carry a Message-Id.
            if (messageId != null && excludeMessageIds.containsKey(messageId)) {
                log.info("skipping message [{}]", messageId);
                return;
            }
        }

        executor.submit(prepareTask(messageString, message, mailListName, mailListCategory, messageCount + offset));
        messageCount++;
    }

    /**
     * Returns the body of the Message-Id header of the given message, or null if the header is missing.
     */
    private static String extractMessageId(Message message) {
        String headerName = MessageParser.MessageHeader.MESSAGE_ID.toString();
        if (getMessageHeaders(message).get(headerName) == null) {
            return null;
        }
        return getMessageHeaders(message).get(headerName).getBody();
    }

    /**
     * Loads the Message-Ids to be excluded from processing.
     *
     * @param excludeListFile properties file whose keys are the Message-Ids to exclude. Can be null.
     * @return loaded properties; empty if no file was given
     * @throws IOException if the file can not be read
     */
    private static Properties loadExcludedMessageIds(File excludeListFile) throws IOException {
        Properties excluded = new Properties();
        if (excludeListFile != null) {
            try (InputStream in = new FileInputStream(excludeListFile)) {
                excluded.load(in);
            }
        }
        return excluded;
    }

    /**
     * Reads the mbox file line by line, submits every complete message for parallel processing and then waits
     * for the submitted tasks to finish. Does nothing but log if the options are not valid.
     *
     * @param args see Class JavaDoc
     * @throws IllegalArgumentException if the requested number of threads is lower than 1
     */
    public static void main(String[] args) {
        log.info("Job started.");

        IndexMboxArchiveOptions options = new IndexMboxArchiveOptions();
        options.parseArgs(args);
        if (!options.isValid()) {
            return;
        }

        int offset = options.getNumberOffset() == null ? 0 : options.getNumberOffset();
        File excludeMessageIdListPath = options.getExcludeMessageIdListPath();

        if (log.isDebugEnabled()) {
            log.debug("CL parameters:");
            log.debug("----------------------------------");
            log.debug("mboxFilePath: {}", options.getMboxFilePath());
            log.debug("numberOfThreads: {} (avail_cores: {})", options.getNumberOfThreads(), Runtime.getRuntime().availableProcessors());
            log.debug("mailListName: {}", options.getMailListName());
            log.debug("mailListCategory: {}", options.getMailListCategory());
            log.debug("offset: {}", offset);
            // the exclude list is optional
            log.debug("excludeMessageIdListPath: {}", excludeMessageIdListPath == null ? null : excludeMessageIdListPath.getAbsolutePath());
            log.debug("----------------------------------");
        }

        if (options.getNumberOfThreads() < 1) {
            throw new IllegalArgumentException("numberOfThreads must be at least 1");
        }

        httpClient = new Client(getConfig()
                .connectionsPerRoute(options.getNumberOfThreads() + 1) // because task can be executed in the `main` thread as well
                .serviceHost(options.getServiceHost())
                .servicePath(options.getServicePath())
                .contentType(options.getContentType())
                .username(options.getUsername())
                .password(options.getPassword())
        );

        // Bounded queue + CallerRunsPolicy: once the queue is full the reading thread runs the task itself,
        // which throttles reading of the file.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(
                options.getNumberOfThreads(),
                options.getNumberOfThreads(),
                3, TimeUnit.SECONDS,
                new ArrayBlockingQueue<Runnable>(options.getNumberOfThreads(), true),
                new ThreadPoolExecutor.CallerRunsPolicy());

        try {
            mb = getMessageBuilder();

            log.info("Processing file {}", options.getMboxFilePath());

            // Note that if there are any Message-Ids to be excluded then we have to parse all messages
            // in the main thread before they are handed to another thread for processing.
            Properties excludeMessageIds = loadExcludedMessageIds(excludeMessageIdListPath);

            String separator = System.getProperty("line.separator");
            long startMillis = System.currentTimeMillis();

            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(options.getMboxFilePath()), RAW_CHARSET))) {
                StringBuilder messageSB = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    // a "From " line starts a new message, so whatever is buffered is a complete message
                    if (line.startsWith("From ")) {
                        processMessageBuffer(executor, excludeMessageIds, messageSB, options.getMailListName(), options.getMailListCategory(), offset);
                    }
                    messageSB.append(line).append(separator);
                }
                // process last message
                processMessageBuffer(executor, excludeMessageIds, messageSB, options.getMailListName(), options.getMailListCategory(), offset);
            }

            executor.shutdown();
            executor.awaitTermination(10L, TimeUnit.SECONDS);

            log.info("Processed {} mails in {} millis", messageCount, System.currentTimeMillis() - startMillis);
            log.debug("Tasks created: {}", taskCount.get());
        } catch (IOException e) {
            log.error("Error occurred", e);
        } catch (MimeException e) {
            log.error("Unable to instantiate MessageBuilder", e);
        } catch (InterruptedException e) {
            log.error("Unexpected exception", e);
            // keep the interrupt visible to whoever runs this thread
            Thread.currentThread().interrupt();
        } catch (Throwable e) {
            log.error("Unexpected exception", e);
        } finally {
            // try to force executor termination if needed
            if (!executor.isTerminated()) {
                log.warn("Executor not terminated, forcing termination.");
                executor.shutdownNow();
            }
            log.info("Job finished.");
        }
    }
}