/*
* JBoss, Home of Professional Open Source
* Copyright 2013 Red Hat Inc. and/or its affiliates and other contributors
* as indicated by the @authors tag. All rights reserved.
*/
package org.searchisko.mbox.task;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.Message;
import org.apache.james.mime4j.dom.MessageBuilder;
import org.searchisko.http.client.Client;
import org.searchisko.mbox.dto.Mail;
import org.searchisko.mbox.json.Converter;
import org.searchisko.mbox.parser.MessageParser;
import org.searchisko.mbox.util.ContentType;
import org.searchisko.mbox.util.DirUtil;
import org.searchisko.mbox.util.StringUtil;
import org.searchisko.preprocessor.HTMLStripUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import static org.searchisko.http.client.Client.getConfig;
import static org.searchisko.mbox.parser.MessageParser.getMessageBuilder;
/**
* Given path to a folder <code>pathToDeltaArchive</code> we read list of files in it (no recursion).
* We assume that files have name in special format (the name is base64 encoded and contains URL link of individual
* entry in public Mailman archive).
* <p/>
* Next we filter the list of files and exclude all that do not contain an allowed project name in their name. The list
* of allowed projects is provided as a property file located at the <code>activeMailListsConf</code> path. All excluded
* files are <b>deleted</b> from the filesystem immediately.
* <p/>
* Next we process remaining files in parallel. We are using ThreadPoolExecutor with BlockingQueue to
* throttle number of parallel tasks in order not to exhaust all system resources.
* <p/>
* Each thread is responsible for reading one file, parsing it into a message, converting it to JSON and then sending
* it to Searchisko for indexing via HttpClient. When HttpClient sends an HTTP request it blocks the thread until the
* response is received or until a timeout occurs.
* <p/>
* Client can specify the number of parallel threads. Note that the `main` thread is not included in that number but it
* can be used to handle a task as well. The underlying HttpClient uses a connection pool which is configured
* to allow for the needed number of concurrent connections. In other words, a <code>numberOfThreads</code> value of `N`
* can result in up to `N+1` active parallel connections to the target <code>host</code> (by contrast, a typical
* HttpClient connection pool does not allow more than 2 parallel connections per <code>host</code>). So be sure your
* target service is able to handle this number of incoming connections.
* <p/>
* Each remaining file is <b>deleted</b> immediately after it is processed successfully.
*
* @author Lukáš Vlček (lvlcek@redhat.com)
*
* @see StringUtil
* @see ThreadPoolExecutor
* @see ArrayBlockingQueue
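* @see <a href="https://today.java.net/pub/a/today/2008/10/23/creating-a-notifying-blocking-thread-pool-executor.html">https://today.java.net/pub/a/today/2008/10/23/creating-a-notifying-blocking-thread-pool-executor.html</a>
* @see <a href="http://www.javacodegeeks.com/2011/12/using-threadpoolexecutor-to-parallelize.html">http://www.javacodegeeks.com/2011/12/using-threadpoolexecutor-to-parallelize.html</a>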
*/
public class IndexDeltaFolder {

    private static final Logger log = LoggerFactory.getLogger(IndexDeltaFolder.class);

    /** Default minimal file age (in milliseconds) used by {@link #read(File)}. */
    private static final long DEFAULT_FILE_AGE_MILLIS = 2000;

    /** Message builder shared by all tasks; assigned in {@link #index(File[], ThreadPoolExecutor)}. */
    private static MessageBuilder mb;

    /** HTTP client shared by all tasks; assigned in {@link #main(String[])}. */
    private static Client httpClient;

    /**
     * Creates a task that parses a single mail file, converts it to JSON, posts it via the shared
     * HTTP client and deletes the file once the post has returned without throwing.
     * <p/>
     * The task never propagates a failure: anything thrown while processing is logged and the file
     * is left in place.
     *
     * @param file mail file whose name encodes the URL of the mail in the public archive
     * @return task ready to be submitted to an executor
     */
    private static Runnable prepareTask(final File file) {
        return new Runnable() {
            @Override
            public void run() {
                // Fall back to the raw file name so that the error log below has something to print
                // even when the name itself can not be decoded.
                String mailURL = file.getName();
                try {
                    // Decoding is done inside the try block on purpose: the task is handed to
                    // ExecutorService#submit(), which stores an escaping exception in the returned
                    // Future where nobody would ever see it.
                    mailURL = StringUtil.decodeFilenameSafe(file.getName());
                    StringUtil.URLInfo info = StringUtil.getInfo(file.getName());

                    Map<String, String> metadata = new HashMap<>();
                    metadata.put("sys_view_url", mailURL);
                    metadata.put("project", info.getProject());
                    metadata.put("mail_list_category", info.getListType());

                    Mail mail;
                    // Close the stream before the file is deleted; an open handle leaks a descriptor
                    // and may prevent the deletion on some platforms.
                    try (FileInputStream in = new FileInputStream(file)) {
                        Message message = mb.parseMessage(in);
                        mail = MessageParser.parse(message);
                    }
                    String messageId = mail.message_id(); // "sys_content_id"

                    // Pick the first non-blank candidate: text without quotes, full text, stripped HTML.
                    String sysContent = mail.first_text_message_without_quotes();
                    if (sysContent == null || sysContent.trim().isEmpty()) {
                        sysContent = mail.first_text_message();
                    }
                    if (sysContent == null || sysContent.trim().isEmpty()) {
                        sysContent = HTMLStripUtil.stripHTML(mail.first_html_message());
                    }
                    metadata.put("sys_content", sysContent);
                    metadata.put("sys_content_content-type", ContentType.TEXT_PLAIN);
                    metadata.put("sys_description", mail.message_snippet());

                    String messageJSON = Converter.toJSON(mail, metadata);
                    Object response = httpClient.post(messageJSON, messageId);
                    log.trace("{}", response);

                    if (!file.delete()) {
                        log.error("Could not delete file after successful processing {}, does it exist? {}",
                                file.getName(), file.exists());
                    }
                } catch (Throwable e) {
                    // Best effort: one broken mail must not stop the remaining ones. The stack trace
                    // is kept at debug level to keep the error log short.
                    log.error("Error processing mail [{}]", mailURL);
                    log.debug("Error details", e);
                }
            }
        };
    }

    /**
     * Calls {@link #read(File, long)} with the default file age of 2000 milliseconds.
     *
     * @param deltaArchivePath folder to list
     * @return files eligible for processing, never <code>null</code>
     * @see #read(File, long)
     */
    public static File[] read(File deltaArchivePath) {
        return read(deltaArchivePath, DEFAULT_FILE_AGE_MILLIS);
    }

    /**
     * Reads files found at the given path. A file is returned only if it is writable and was last
     * modified more than <code>fileAge</code> milliseconds ago.
     * <p/>
     * If listing the folder fails with {@link FileNotFoundException} the error is logged and an empty
     * array is returned.
     *
     * @param deltaArchivePath folder to list
     * @param fileAge minimal time in milliseconds since the last modification of a file
     * @return files eligible for processing, never <code>null</code>
     */
    public static File[] read(File deltaArchivePath, long fileAge) {
        List<File> filesToProcess = new ArrayList<>();
        log.info("Reading folder {}", deltaArchivePath);
        try {
            File[] files = DirUtil.listFiles(deltaArchivePath);
            log.info("Checking {} files", files.length);
            for (File file : files) {
                // If file can not be deleted then do not process it,
                // that would lead to repetitive processing of the same file.
                if (!file.canWrite()) {
                    continue;
                }
                // Process only files that are old enough. This is to make sure that we do not process
                // files that are currently being created and thus are incomplete. (Not sure if Python
                // file creation operation is atomic).
                if (System.currentTimeMillis() - file.lastModified() > fileAge) {
                    filesToProcess.add(file);
                }
            }
        } catch (FileNotFoundException e) {
            log.error("Could not read resource: {}", deltaArchivePath, e);
        }
        return filesToProcess.toArray(new File[filesToProcess.size()]);
    }

    /**
     * Filters out all files that do not belong to any of the provided mail lists.
     * <p/>
     * The lookup key is the project name, followed by <code>-</code> and the list type when the list
     * type is available. Files whose key is not found in <code>activeMailLists</code> are deleted
     * from the filesystem immediately. Files whose name can not be parsed are skipped (they are
     * neither returned nor deleted).
     *
     * @param filesToProcess candidate files
     * @param activeMailLists keys of the mail lists that should be processed
     * @return files that belong to one of the active mail lists, never <code>null</code>
     */
    public static File[] filter(File[] filesToProcess, Collection<String> activeMailLists) {
        List<File> filesFiltered = new ArrayList<>();
        int countOfOriginalFiles = filesToProcess.length;
        for (File file : filesToProcess) {
            StringUtil.URLInfo info;
            try {
                info = StringUtil.getInfo(file.getName());
            } catch (Throwable e) {
                log.error("Can not extract info from file name [{}]. Skipping this file", file.getName());
                log.debug("Error details", e);
                // already reported, do not log the same file for the second time below
                continue;
            }
            if (info == null || info.getProject() == null) {
                // this should probably not happen
                log.error("Could not parse project name from file name [{}]. Skipping this file.", file.getName());
                continue;
            }
            // get lookup key
            String key = info.getProject();
            if (info.getListType() != null) {
                key += "-" + info.getListType();
            }
            if (activeMailLists.contains(key)) {
                filesFiltered.add(file);
            } else if (!file.delete()) {
                // may be the file has been already deleted by some other process...
                log.error("Could not delete file {}, does it exist? {}", file.getName(), file.exists());
            }
        }
        log.info("Filtered {} files out in total", countOfOriginalFiles - filesFiltered.size());
        return filesFiltered.toArray(new File[filesFiltered.size()]);
    }

    /**
     * Submits one indexing task per file to the given executor.
     * <p/>
     * This method is not thread safe: it replaces the shared message builder. It returns as soon as
     * all tasks have been submitted, it does not wait for them to complete.
     *
     * @param filesToProcess files to index
     * @param executor executor used to run the tasks
     * @throws RuntimeException if the message builder can not be created
     */
    public static void index(File[] filesToProcess, ThreadPoolExecutor executor) {
        log.info("Starting to index {} files", filesToProcess.length);
        if (filesToProcess.length > 0) {
            try {
                mb = getMessageBuilder(); // not thread safe
            } catch (MimeException e) {
                log.error("Could not get MessageBuilder", e);
                throw new RuntimeException(e);
            }
            for (File file : filesToProcess) {
                executor.submit(prepareTask(file));
            }
        }
        log.info("Done.");
    }

    /**
     * Command line entry point: reads the delta folder, drops files of inactive mail lists and
     * indexes the remaining ones in parallel.
     *
     * @param args command line arguments, parsed by <code>IndexDeltaFolderOptions</code>
     * @throws IllegalArgumentException if the requested number of threads is lower than 1
     */
    public static void main(String[] args) {
        log.info("Job started.");

        IndexDeltaFolderOptions options = new IndexDeltaFolderOptions();
        options.parseArgs(args);
        if (!options.isValid()) {
            return;
        }

        if (log.isDebugEnabled()) {
            log.debug("CL parameters:");
            log.debug("----------------------------------");
            log.debug("pathToDeltaArchive: {}", options.getPathToDeltaArchive());
            log.debug("numberOfThreads: {} (avail_cores: {})", new Object[]{options.getNumberOfThreads(), Runtime.getRuntime().availableProcessors()});
            log.debug("activeMailListsConf: {}", options.getActiveMailListsConf());
            log.debug("----------------------------------");
        }

        if (options.getNumberOfThreads() < 1) {
            throw new IllegalArgumentException("numberOfThreads must be at least 1");
        }

        httpClient = new Client(getConfig()
                .connectionsPerRoute(options.getNumberOfThreads() + 1) // because task can be executed in the `main` thread as well
                .serviceHost(options.getServiceHost())
                .servicePath(options.getServicePath())
                .contentType(options.getContentType())
                .username(options.getUsername())
                .password(options.getPassword())
        );

        // Bounded queue + CallerRunsPolicy: once the queue is full the submitting (`main`) thread
        // runs the task itself, which throttles the submission.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(
                options.getNumberOfThreads(),
                options.getNumberOfThreads(),
                3, TimeUnit.SECONDS,
                new ArrayBlockingQueue<Runnable>(options.getNumberOfThreads(), true),
                new ThreadPoolExecutor.CallerRunsPolicy());

        try {
            // load properties conf
            Properties prop = new Properties();
            try (FileInputStream in = new FileInputStream(options.getActiveMailListsConf())) {
                prop.load(in);
            }
            Collection<String> activeMailLists = prop.stringPropertyNames();

            File[] files = read(options.getPathToDeltaArchive());
            files = filter(files, activeMailLists);
            index(files, executor);

            executor.shutdown();
            executor.awaitTermination(10L, TimeUnit.SECONDS);
        } catch (IOException e) {
            log.error("Error occurred", e);
        } catch (InterruptedException e) {
            log.error("Unexpected exception", e);
            // restore the interrupt status that was cleared when the exception was thrown
            Thread.currentThread().interrupt();
        } finally {
            // try to force executor termination if needed
            if (!executor.isTerminated()) {
                log.warn("Executor not terminated, forcing termination.");
                executor.shutdownNow();
            }
            log.info("Job finished.");
        }
    }
}