IndexMboxArchive.java example

Explorer

mbox_tools-master
- controller
  - src
    - main
      - java
        org
        searchisko
        mbox
        command
        Starter.java
- mbox_indexer
  - src
    - main
      - java
        org
        searchisko
        http
        client
        Client.java
        mbox
        task
        IndexDeltaFolder.java
        IndexDeltaFolderOptions.java
        IndexMboxArchive.java
        IndexMboxArchiveOptions.java
        util
        ContentType.java
        DirUtil.java
        StringUtil.java
        preprocessor
        HTMLStripUtil.java
    - test
      - java
        org
        searchisko
        http
        client
        ClientTest.java
        mbox
        task
        IndexDeltaFolderTest.java
        IndexMboxArchiveTest.java
        util
        StringUtilTest.java
        preprocessor
        HTMLStripUtilTest.java
- mbox_parser
  - src
    - main
      - java
        org
        searchisko
        mbox
        dto
        Mail.java
        MailAttachment.java
        json
        Converter.java
        parser
        MessageBodyParser.java
        MessageParseException.java
        MessageParser.java
    - test
      - java
        org
        searchisko
        mbox
        MessageTestSupport.java
        json
        ConverterMetadataTest.java
        ConverterTest.java
        ConverterTestSupport.java
        parser
        MessageBodyParsingTest.java
        MessageHeaderParsingTest.java
        MessageParserTest.java
- test_support
  - src
    - main
      - java
        org
        searchisko
        BaseTestSupport.java

/*
 * JBoss, Home of Professional Open Source
 * Copyright 2013 Red Hat Inc. and/or its affiliates and other contributors
 * as indicated by the @authors tag. All rights reserved.
 */

package org.searchisko.mbox.task;

import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.dom.Message;
import org.apache.james.mime4j.dom.MessageBuilder;
import org.searchisko.http.client.Client;
import org.searchisko.mbox.dto.Mail;
import org.searchisko.mbox.json.Converter;
import org.searchisko.mbox.parser.MessageParser;
import org.searchisko.mbox.util.ContentType;
import org.searchisko.mbox.util.StringUtil;
import org.searchisko.preprocessor.HTMLStripUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import static org.searchisko.http.client.Client.getConfig;
import static org.searchisko.mbox.parser.MessageParser.getMessageBuilder;
import static org.searchisko.mbox.parser.MessageParser.getMessageHeaders;

/**
 * Given a single mbox archive file (can be huge) we read it line by line and every time a complete message is read we
 * pass that message for processing to a parallel thread. We are using ThreadPoolExecutor with a BlockingQueue to
 * throttle the number of parallel tasks in order not to exhaust all system resources.
 * <p/>
 * Each thread is responsible for parsing the mail message, converting it to JSON and then sending it to Searchisko
 * for indexing via HttpClient. When HttpClient sends Http request it blocks the thread until the response is received
 * or until timeout.
 * <p/>
 * Client can specify the number of parallel threads. Note the `main` thread is not included in that number but it can
 * be used to handle a task as well. The underlying HttpClient is using a connection pool which is configured
 * to allow for the needed number of concurrent connections. In other words <code>numberOfThreads</code> of value `N`
 * can result in up to `N+1` active parallel connections to the target <code>host</code> (by contrast, a typical
 * HttpClient connection pool does not allow for more than 2 parallel connections per <code>host</code>). So be sure
 * your target service is able to handle this number of incoming connections.
 * <p/>
 * The <code>numberOffset</code> is used if the numbering of individual messages in the public archive does not start
 * from 0. This is typically the result of a Mailman admin mistake during archive rebuilding or a similar issue.
 * Note this is an optional parameter, but it must be provided if any of the parameters following it are provided.
 * In such a case the value 0 can be used.
 * <p/>
 * The <code>excludeMessageIdListPath</code> is used if we need to exclude specific messages from processing. This is
 * an optional parameter.
 *
 * @author Lukáš Vlček (lvlcek@redhat.com)
 *
 * @see ThreadPoolExecutor
 * @see ArrayBlockingQueue
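 * @see <a href="https://today.java.net/pub/a/today/2008/10/23/creating-a-notifying-blocking-thread-pool-executor.html">Creating a notifying blocking thread pool executor</a>
 * @see <a href="http://www.javacodegeeks.com/2011/12/using-threadpoolexecutor-to-parallelize.html">Using ThreadPoolExecutor to parallelize</a>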
 */
public class IndexMboxArchive {

	/** Class-wide logger; never reassigned, hence final. */
	private static final Logger log = LoggerFactory.getLogger(IndexMboxArchive.class);
	/** Message builder used to parse raw messages; assigned in {@code main} before any message is processed. */
	private static MessageBuilder mb;
	/** HTTP client used by the tasks to post converted messages; assigned in {@code main}. */
	private static Client httpClient;
	/** Number of tasks that have started running; incremented by each task, possibly from several threads. */
	private static final AtomicLong taskCount = new AtomicLong();
	/** Number of messages handed to the executor so far; also used (plus the offset) to number messages. */
	private static long messageCount = 0;

	/**
	 * Create a task that converts a single mail message to JSON (enriched with additional metadata)
	 * and posts it through the shared HTTP client.
	 * <p/>
	 * At least one of <code>messageString</code> and <code>message</code> must be provided. If an already
	 * parsed <code>message</code> is available it is used directly, so the raw string is not parsed a second time.
	 * Any exception raised while processing the message is logged and does not propagate out of the task.
	 *
	 * @param messageString raw message as a string. Can be null.
	 * @param message parsed message. Can be null.
	 * @param mailListName name of the mailing list, used for the document URL and the project name
	 * @param mailListCategory category of the mailing list, stored in the metadata and used for the project name
	 * @param cnt order # of this message within the single cumulative mbox archive file
	 * @return the task; it logs an error and does nothing else when both message sources are null
	 */
	private static Runnable prepareTask(final String messageString, final Message message, final String mailListName, final String mailListCategory, final long cnt) {
		return new Runnable() {
			@Override
			public void run() {
				// 1. Convert mail to JSON representation with added metadata.
				// 2. Send mail to the server, using blocking operation.
				long taskId = taskCount.incrementAndGet();
				log.debug("starting task [{}]", taskId);
				if (messageString == null && message == null) {
					log.error("Missing message source. Either raw message string or parsed message must be provided. Exit task {}", taskId);
					return;
				}
				String messageId = null;
				try {
					// Prefer the message the caller has already parsed; fall back to parsing the raw text.
					Message msg = (message != null)
							? message
							: mb.parseMessage(new ByteArrayInputStream(messageString.getBytes()));

					String document_url = getDocumentUrl(msg, mailListName, cnt);

					// add missing metadata
					Map<String, String> metadata = new HashMap<>();
					metadata.put("sys_url_view", document_url);
					metadata.put("project", StringUtil.getProjectName(mailListName, mailListCategory));
					metadata.put("mail_list_category", mailListCategory);

					Mail mail = MessageParser.parse(msg);
					messageId = mail.message_id(); // "sys_content_id"

					// Content fallback chain: text without quotes -> full text -> HTML with markup stripped.
					String sysContent = mail.first_text_message_without_quotes();
					if (sysContent == null || sysContent.trim().isEmpty()) {
						sysContent = mail.first_text_message();
					}
					if (sysContent == null || sysContent.trim().isEmpty()) {
						sysContent = HTMLStripUtil.stripHTML(mail.first_html_message());
					}
					metadata.put("sys_content", sysContent);
					// Every branch above yields plain text, so the content type is constant.
					metadata.put("sys_content_content-type", ContentType.TEXT_PLAIN);

					metadata.put("sys_description", mail.message_snippet());
					String messageJSON = Converter.toJSON(mail, metadata);

					Object response = httpClient.post(messageJSON, messageId);

					log.trace("{}", response);

				} catch (Exception e) {
					log.warn("Error processing message {} in task [{}], caused: {}", new Object[]{messageId, taskId, e.getMessage()});
					// getMessage() may be null (e.g. for a NullPointerException); keep the full cause available at debug level.
					log.debug("Failure details for task [{}]", taskId, e);
				}

			}
		};
	}

	/**
	 * Build the public pipermail URL of a message.
	 * The URL consists of the list name, a year-month folder derived from the message date
	 * and a zero-padded, six digit message number.
	 * TODO: this needs to be configurable going forward.
	 *
	 * @param message message whose date determines the year-month folder
	 * @param mailListName name of the mailing list
	 * @param cnt order # of this message within mbox file (single cumulative file)
	 * @return URL of the message page
	 */
	protected static String getDocumentUrl(final Message message, final String mailListName, final long cnt) {
		// our Mailman is in specific times zone, this has impact on how it constructs URLs
		final SimpleDateFormat monthFolderFormat = new SimpleDateFormat("yyyy-MMMMM", Locale.US);
		monthFolderFormat.setTimeZone(TimeZone.getTimeZone("EST"));

		final String monthFolder = monthFolderFormat.format(message.getDate());
		final String pageNumber = String.format("%06d", cnt);

		return new StringBuilder("http://lists.jboss.org/pipermail/")
				.append(mailListName)
				.append("/")
				.append(monthFolder)
				.append("/")
				.append(pageNumber)
				.append(".html")
				.toString();
	}

	/**
	 * Resolve a path to a file, looking on the file system first and on the classpath second.
	 *
	 * @param path file system path or classpath resource name
	 * @return the existing file system file if there is one; otherwise a file built from the classpath
	 *         resource location. If the resource is not found on the classpath either, the (non-existing)
	 *         file system file is returned so the caller can check {@link File#exists()}.
	 */
	public static File getFile(String path) {
		// try to get from fs
		File file = new File(path);
		if (file.exists()) { return file; }
		// try to get from classpath
		URL url = IndexMboxArchive.class.getClassLoader().getResource(path);
		log.trace("file url: {}", url);
		if (url == null) {
			// Not on the classpath either; avoid the NullPointerException on url.getPath().
			log.warn("file {} found neither on file system nor on classpath", path);
			return file;
		}
		String filesPathAndName;
		try {
			// URL#getPath() is percent-encoded (a space becomes %20); going through a URI decodes it.
			filesPathAndName = new File(url.toURI()).getPath();
		} catch (java.net.URISyntaxException | IllegalArgumentException e) {
			// Malformed or non "file:" URL (e.g. a resource inside a jar): keep the raw URL path.
			filesPathAndName = url.getPath();
		}
		log.info("trying to get file {}", filesPathAndName);
		return new File(filesPathAndName);
	}

	/**
	 * Open a stream for the given path, looking on the file system first and on the classpath second.
	 *
	 * @param filePath file system path or classpath resource name
	 * @return stream over the file or the classpath resource; null if neither exists
	 * @throws FileNotFoundException if the file exists but cannot be opened for reading
	 */
	public static InputStream getInputStream(String filePath) throws FileNotFoundException {
		// the file system takes precedence
		final File candidate = new File(filePath);
		if (candidate.exists()) {
			return new FileInputStream(candidate);
		}
		// otherwise fall back to the classpath
		return IndexMboxArchive.class.getClassLoader().getResourceAsStream(filePath);
	}


	/**
	 * Hand the message accumulated in the buffer over to the executor and clear the buffer.
	 * An empty buffer is ignored. If a non-empty exclusion list is given, the message is parsed here first
	 * and skipped when its Message-ID is a key of that list.
	 *
	 * @param executor executor the message task is submitted to
	 * @param excludeMessageIds Message-IDs (as keys) to skip. Can be null or empty.
	 * @param messageSB buffer with the raw text of one message; emptied once the message has been handled
	 * @param mailListName name of the mailing list
	 * @param mailListCategory category of the mailing list
	 * @param offset value added to the running message count to get the message number
	 * @throws IOException propagated from message parsing
	 * @throws MimeException propagated from message parsing
	 */
	private static void processMessageBuffer(ThreadPoolExecutor executor, Properties excludeMessageIds, StringBuilder messageSB, String mailListName, String mailListCategory, int offset) throws IOException, MimeException {
		if (messageSB.length() == 0) {
			return;
		}

		final String rawMessage = messageSB.toString();
		Message parsed = null;
		boolean excluded = false;

		// Parsing in the calling thread is needed only when there is something to exclude.
		final boolean nothingToExclude = excludeMessageIds == null || excludeMessageIds.isEmpty();
		if (!nothingToExclude) {
			parsed = mb.parseMessage(new ByteArrayInputStream(rawMessage.getBytes()));
			String id = getMessageHeaders(parsed).get(MessageParser.MessageHeader.MESSAGE_ID.toString()).getBody();
			excluded = excludeMessageIds.containsKey(id);
			if (excluded) {
				log.info("skipping message [{}]", id);
			}
		}

		if (!excluded) {
			// Only submitted messages advance the counter.
			executor.submit(prepareTask(rawMessage, parsed, mailListName, mailListCategory, messageCount + offset));
			messageCount++;
		}

		messageSB.setLength(0);
	}

	/**
	 * Command line entry point. Reads the mbox file line by line, cuts it into individual messages at lines
	 * starting with <code>"From "</code> and submits each message to a bounded thread pool for indexing.
	 * Nothing is processed if the parsed options are not valid.
	 *
	 * @param args see Class JavaDoc
	 * @throws IllegalArgumentException if the requested number of threads is lower than 1
	 */
	public static void main(String[] args) {

		log.info("Job started.");

		IndexMboxArchiveOptions options = new IndexMboxArchiveOptions();
		options.parseArgs(args);
		if (options.isValid()) {

			int offset = options.getNumberOffset() == null ? 0 : options.getNumberOffset();
			// optional parameter, null if not given
			File excludeMessageIdListPath = options.getExcludeMessageIdListPath();

			if (log.isDebugEnabled()) {
				log.debug("CL parameters:");
				log.debug("----------------------------------");
				log.debug("mboxFilePath: {}", options.getMboxFilePath());
				log.debug("numberOfThreads: {} (avail_cores: {})", new Object[]{options.getNumberOfThreads(), Runtime.getRuntime().availableProcessors()});
				log.debug("mailListName: {}", options.getMailListName());
				log.debug("mailListCategory: {}", options.getMailListCategory());
				log.debug("offset: {}", offset);
				// the exclude list is optional, do not dereference it blindly
				log.debug("excludeMessageIdListPath: {}", excludeMessageIdListPath == null ? null : excludeMessageIdListPath.getAbsolutePath());
				log.debug("----------------------------------");
			}

			if (options.getNumberOfThreads() < 1) {
				throw new IllegalArgumentException("numberOfThreads must be at least 1");
			}

			httpClient = new Client(getConfig()
					.connectionsPerRoute(options.getNumberOfThreads() + 1) // because task can be executed in the `main` thread as well
					.serviceHost(options.getServiceHost())
					.servicePath(options.getServicePath())
					.contentType(options.getContentType())
					.username(options.getUsername())
					.password(options.getPassword())
			);

			// Fixed-size pool with a bounded queue; once the queue is full CallerRunsPolicy makes the
			// submitting (main) thread run the task itself, which throttles reading of the mbox file.
			ThreadPoolExecutor executor = new ThreadPoolExecutor(
					options.getNumberOfThreads(),
					options.getNumberOfThreads(),
					3, TimeUnit.SECONDS,
					new ArrayBlockingQueue<Runnable>(options.getNumberOfThreads(), true),
					new ThreadPoolExecutor.CallerRunsPolicy());

			try {
				mb = getMessageBuilder();

				log.info("Processing file {}", options.getMboxFilePath());

				Properties excludeMessageIds = new Properties();
				// Note that if there are any Message-Ids to be excluded then we have to parse all messages
				// in the main thread before they are handed to another thread for processing.
				if (excludeMessageIdListPath != null) {
					try (InputStream excludedIdsStream = new FileInputStream(excludeMessageIdListPath)) {
						excludeMessageIds.load(excludedIdsStream);
					}
				}

				long start;
				try (FileReader mboxFileReader = new FileReader(options.getMboxFilePath());
					 BufferedReader br = new BufferedReader(mboxFileReader)) {

					String line;
					StringBuilder messageSB = new StringBuilder();
					String separator = System.getProperty("line.separator");

					start = System.currentTimeMillis();

					while ((line = br.readLine()) != null) {
						// a line starting with "From " opens the next message, so flush the previous one
						if (line.startsWith("From ")) {
							processMessageBuffer(executor, excludeMessageIds, messageSB, options.getMailListName(), options.getMailListCategory(), offset);
						}
						messageSB.append(line).append(separator);
					}
					// process last message
					processMessageBuffer(executor, excludeMessageIds, messageSB, options.getMailListName(), options.getMailListCategory(), offset);
				}

				executor.shutdown();
				// If the tasks do not finish within the timeout the finally block forces termination.
				executor.awaitTermination(10L, TimeUnit.SECONDS);

				long end = System.currentTimeMillis();

				log.info("Processed {} mails in {} millis", messageCount, end - start);
				log.debug("Tasks created: {}", taskCount.get());

			} catch (IOException e) {
				log.error("Error occurred", e);
			} catch (MimeException e) {
				log.error("Unable to instantiate MessageBuilder", e);
			} catch (InterruptedException e) {
				// keep the interrupt visible to whoever called us
				Thread.currentThread().interrupt();
				log.error("Interrupted while waiting for executor termination", e);
			} catch (Throwable e) {
				log.error("Unexpected exception", e);
			} finally {

				// try to force executor termination if needed
				if (!executor.isTerminated()) {
					log.warn("Executor not terminated, forcing termination.");
					executor.shutdownNow();
				}

				log.info("Job finished.");
			}
		}
	}
}