/* * Copyright 2010 Outerthought bvba * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.lilyproject.tools.mboximport; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.zip.GZIPInputStream; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.io.IOUtils; import org.apache.james.mime4j.codec.Base64InputStream; import org.apache.james.mime4j.codec.QuotedPrintableInputStream; import org.apache.james.mime4j.field.AddressListField; import org.apache.james.mime4j.field.DefaultFieldParser; import org.apache.james.mime4j.field.FieldName; import org.apache.james.mime4j.field.MailboxField; import org.apache.james.mime4j.field.MailboxListField; import org.apache.james.mime4j.field.ParsedField; import org.apache.james.mime4j.field.address.Address; import org.apache.james.mime4j.field.address.AddressList; import org.apache.james.mime4j.field.address.Mailbox; import org.apache.james.mime4j.field.address.MailboxList; import org.apache.james.mime4j.io.EOLConvertingInputStream; import org.apache.james.mime4j.parser.Field; import org.apache.james.mime4j.parser.MimeEntityConfig; import org.apache.james.mime4j.parser.MimeTokenStream; import org.apache.james.mime4j.util.MimeUtil; import org.lilyproject.repository.api.Blob; import org.lilyproject.repository.api.Link; import org.lilyproject.repository.api.QName; import org.lilyproject.repository.api.Record; import org.lilyproject.repository.api.RecordId; import org.lilyproject.testclientfw.BaseRepositoryTestTool; import org.lilyproject.tools.import_.cli.JsonImport; import org.lilyproject.util.Version; import org.lilyproject.util.io.Closer; public class MboxImport extends BaseRepositoryTestTool { private Option fileOption; private Option schemaOption; private Map<String, Integer> partsByMediaType = new HashMap<String, Integer>(); private static final String NS = "org.lilyproject.mail"; private static final int MAX_LINE_LENGTH = 10000; @Override protected String getCmdName() { return "lily-mbox-import"; } @Override protected String getVersion() { return Version.readVersion("org.lilyproject", "lily-mbox-import"); } public static void main(String[] args) throws Exception { new MboxImport().start(args); } @Override @SuppressWarnings("static-access") public List<Option> getOptions() { List<Option> options = super.getOptions(); fileOption = OptionBuilder .withArgName("file") .hasArg() .withDescription("File or directory name") .withLongOpt("file") .create("f"); options.add(fileOption); schemaOption = OptionBuilder .withDescription("Create/update the schema") .withLongOpt("schema") .create("s"); options.add(schemaOption); return options; } @Override protected int getDefaultWorkers() { return 1; } @Override public int run(CommandLine cmd) throws Exception { int result = super.run(cmd); if (result != 0) { return result; } if (!cmd.hasOption(schemaOption.getOpt()) && !cmd.hasOption(fileOption.getOpt())) { printHelp(); return 1; } setupLily(); if (cmd.hasOption(schemaOption.getOpt()) || cmd.hasOption(fileOption.getOpt())) { loadSchema(); } if (cmd.hasOption(fileOption.getOpt())) { String fileName = cmd.getOptionValue(fileOption.getOpt()); File file = new File(fileName); if (!file.exists()) { System.out.println("File does not exist: " + file.getAbsolutePath()); return 1; } setupMetrics(); startExecutor(); if (file.isDirectory()) { File[] files = file.listFiles(); Arrays.sort(files); for (File item : files) { if (!item.isDirectory()) { executor.submit(new ImportMboxFileTask(item)); } } } else { executor.submit(new ImportMboxFileTask(file)); } stopExecutor(); finishMetrics(); System.out.println(); System.out.println("Number of created parts per media type:"); for (Map.Entry<String, Integer> entry : partsByMediaType.entrySet()) { System.out.println(" " + entry.getKey() + " : " + entry.getValue()); } System.out.println(); } lilyClient.close(); return 0; } private void loadSchema() throws Exception { System.out.println("Creating the schema (if necessary)"); System.out.println(); InputStream is = getClass().getClassLoader().getResourceAsStream("org/lilyproject/tools/mboximport/mail_schema.json"); JsonImport.loadSchema(repository, is); System.out.println(); } private class ImportMboxFileTask implements Runnable { private File file; private ImportMboxFileTask(File file) { this.file = file; } @Override public void run() { try { importFile(file); } catch (Throwable t) { t.printStackTrace(); metrics.increment("Exceptions", 1); } } } private void importFile(File file) throws Exception { System.out.println("Processing file " + file.getAbsolutePath()); InputStream is = null; try { is = new FileInputStream(file); if (file.getName().endsWith(".gz")) { is = new GZIPInputStream(is); } MboxInputStream mboxStream = new MboxInputStream(is, MAX_LINE_LENGTH); while (mboxStream.nextMessage()) { MimeTokenStream stream = new MyMimeTokenStream(); stream.parse(mboxStream); importMessage(stream); } } finally { Closer.close(is); } System.out.println(); } public static class MyMimeTokenStream extends MimeTokenStream { protected MyMimeTokenStream() { super(getConfig()); } private static MimeEntityConfig getConfig() { MimeEntityConfig config = new MimeEntityConfig(); config.setMaxLineLen(MAX_LINE_LENGTH); return config; } } private void importMessage(MimeTokenStream stream) throws Exception { int multiPartNesting = 0; // note that a multipart can again contain a multipart Message message = new Message(); for (int state = stream.getState(); state != MimeTokenStream.T_END_OF_STREAM; state = stream.next()) { switch (state) { case MimeTokenStream.T_BODY: String mediaType = stream.getBodyDescriptor().getMimeType() + "; charset=" + stream.getBodyDescriptor().getCharset(); // oftwewel: gebruik getDecodedInputStream InputStream bodyDataStream; if (MimeUtil.isQuotedPrintableEncoded(stream.getBodyDescriptor().getTransferEncoding())) { bodyDataStream = new QuotedPrintableInputStream(new EOLConvertingInputStream(stream.getInputStream(), EOLConvertingInputStream.CONVERT_LF)); } else if (MimeUtil.isBase64Encoding(stream.getBodyDescriptor().getTransferEncoding())) { bodyDataStream = new Base64InputStream(stream.getInputStream()); } else { bodyDataStream = stream.getInputStream(); } byte[] data = IOUtils.toByteArray(bodyDataStream); // TODO could fill in filename long startTime = System.nanoTime(); Blob blob = new Blob(mediaType, (long)data.length, null); OutputStream os = table.getOutputStream(blob); try { IOUtils.write(data, os); } finally { os.close(); } double duration = System.nanoTime() - startTime; metrics.increment("Blob creation", "Blob", duration / 1e6d); Part part = message.addPart(blob); part.baseMediaType = stream.getBodyDescriptor().getMimeType(); break; case MimeTokenStream.T_FIELD: if (multiPartNesting == 0) { Field field = stream.getField(); ParsedField parsedField = new DefaultFieldParser().parse(field.getName(), MimeUtil.unfold(field.getBody()), null); if (parsedField.getParseException() != null) { // TODO print error } else if (parsedField.getName().equals(FieldName.TO)) { message.to = ((AddressListField)parsedField).getAddressList(); } else if (parsedField.getName().equals(FieldName.CC)) { message.cc = ((AddressListField)parsedField).getAddressList(); } else if (parsedField.getName().equals(FieldName.FROM)) { message.from = ((MailboxListField)parsedField).getMailboxList(); } else if (parsedField.getName().equals(FieldName.SENDER)) { message.sender = ((MailboxField)parsedField).getMailbox(); } else if (parsedField.getName().equals("List-Id")) { message.listId = parsedField.getBody(); } else if (parsedField.getName().equals(FieldName.SUBJECT)) { message.subject = parsedField.getBody(); } } break; case MimeTokenStream.T_START_MULTIPART: multiPartNesting++; break; case MimeTokenStream.T_END_MULTIPART: multiPartNesting--; } } // Now create the records in Lily // Since we want to link the messages and parts bidirectionally, and for performance we want to avoid // having to update the message, we generate record IDs ourselves. // Since for the current usage typically parts are indexed with information dereferenced from messages, // we can save additional indexer work (update of dereferenced data) by first creating the messages // and then the parts. List<RecordId> partRecordIds = new ArrayList<RecordId>(message.parts.size()); for (Part part : message.parts) { partRecordIds.add(idGenerator.newRecordId()); } Record messageRecord = repository.getRecordFactory().newRecord(idGenerator.newRecordId()); messageRecord.setRecordType(new QName(NS, "Message")); if (message.subject != null) { messageRecord.setField(new QName(NS, "subject"), message.subject); } if (message.to != null) { messageRecord.setField(new QName(NS, "to"), message.getToAddressesAsStringList()); } if (message.cc != null) { messageRecord.setField(new QName(NS, "cc"), message.getCcAddressesAsStringList()); } if (message.from != null) { messageRecord.setField(new QName(NS, "from"), message.getFromAddressesAsStringList()); } if (message.sender != null) { messageRecord.setField(new QName(NS, "sender"), message.getSenderAddressAsString()); } if (message.listId != null) { messageRecord.setField(new QName(NS, "listId"), message.listId); } if (messageRecord.getFields().size() == 0 || message.parts.size() == 0) { // Message has no useful headers, do not create it. metrics.increment("Invalid messages", 1); return; } List<Link> partLinks = new ArrayList<Link>(message.parts.size()); for (RecordId recordId : partRecordIds) { partLinks.add(new Link(recordId)); } messageRecord.setField(new QName(NS, "parts"), partLinks); long startTime = System.nanoTime(); messageRecord = table.createOrUpdate(messageRecord); double duration = System.nanoTime() - startTime; metrics.increment("Message record", "Create", duration / 1e6d); for (int i = 0; i < message.parts.size(); i++) { Part part = message.parts.get(i); Record partRecord = table.newRecord(partRecordIds.get(i)); partRecord.setRecordType(new QName(NS, "Part")); partRecord.setField(new QName(NS, "mediaType"), part.blob.getMediaType()); partRecord.setField(new QName(NS, "content"), part.blob); partRecord.setField(new QName(NS, "message"), new Link(messageRecord.getId())); startTime = System.nanoTime(); partRecord = table.createOrUpdate(partRecord); duration = System.nanoTime() - startTime; metrics.increment("Part record", "Create", duration / 1e6d); part.recordId = partRecord.getId(); increment(part.baseMediaType); if (verbose) { System.out.println("Created part record: " + partRecord.getId()); } } if (verbose) { System.out.println("Created message record " + messageRecord.getId()); } } public void increment(String mediaType) { Integer count = partsByMediaType.get(mediaType); if (count == null) { partsByMediaType.put(mediaType, 1); } else { partsByMediaType.put(mediaType, count + 1); } } private static class Message { public String subject; public AddressList to; public AddressList cc; public MailboxList from; public Mailbox sender; public String listId; public List<Part> parts = new ArrayList<Part>(); public Part addPart(Blob blob) { Part part = new Part(); part.blob = blob; parts.add(part); return part; } public List<String> getToAddressesAsStringList() { List<String> result = new ArrayList<String>(to.size()); for (Address address : to) { result.add(address.getDisplayString()); } return result; } public List<String> getCcAddressesAsStringList() { List<String> result = new ArrayList<String>(cc.size()); for (Address address : cc) { result.add(address.getDisplayString()); } return result; } public List<String> getFromAddressesAsStringList() { List<String> result = new ArrayList<String>(from.size()); for (Mailbox mailbox : from) { result.add(mailbox.getDisplayString()); } return result; } public String getSenderAddressAsString() { return sender.getDisplayString(); } } private static class Part { public Blob blob; public RecordId recordId; /** Media type without parameters. */ public String baseMediaType; } }