/* * Autopsy Forensic Browser * * Copyright 2011-2014 Basis Technology Corp. * Contact: carrier <at> sleuthkit <dot> org * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sleuthkit.autopsy.thunderbirdparser; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.CharConversionException; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; import java.nio.charset.UnsupportedCharsetException; import java.util.ArrayList; import java.util.List; import java.util.UUID; import java.util.logging.Level; import org.sleuthkit.autopsy.coreutils.Logger; import org.apache.james.mime4j.dom.BinaryBody; import org.apache.james.mime4j.dom.Body; import org.apache.james.mime4j.dom.Entity; import org.apache.james.mime4j.dom.Message; import org.apache.james.mime4j.dom.Multipart; import org.apache.james.mime4j.dom.TextBody; import org.apache.james.mime4j.dom.address.AddressList; import org.apache.james.mime4j.dom.address.Mailbox; import org.apache.james.mime4j.dom.address.MailboxList; import org.apache.james.mime4j.dom.field.ContentDispositionField; import org.apache.james.mime4j.dom.field.ContentTypeField; import org.apache.james.mime4j.mboxiterator.CharBufferWrapper; import org.apache.james.mime4j.mboxiterator.MboxIterator; import org.apache.james.mime4j.message.DefaultMessageBuilder; import org.apache.james.mime4j.stream.Field; import org.apache.james.mime4j.stream.MimeConfig; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; import org.openide.util.NbBundle; import org.sleuthkit.autopsy.ingest.IngestServices; import org.sleuthkit.datamodel.TskData; import org.sleuthkit.datamodel.EncodedFileOutputStream; /** * A parser that extracts information about email messages and attachments from * a mbox file. * * @author jwallace */ class MboxParser { private static final Logger logger = Logger.getLogger(MboxParser.class.getName()); private DefaultMessageBuilder messageBuilder; private IngestServices services; private StringBuilder errors; /** * The mime type string for html text. */ private static final String HTML_TYPE = "text/html"; //NON-NLS /** * The local path of the mbox file. */ private String localPath; MboxParser(IngestServices services, String localPath) { this.services = services; this.localPath = localPath; messageBuilder = new DefaultMessageBuilder(); MimeConfig config = MimeConfig.custom().setMaxLineLen(-1).build(); // disable line length checks. messageBuilder.setMimeEntityConfig(config); errors = new StringBuilder(); } static boolean isValidMimeTypeMbox(byte[] buffer) { return (new String(buffer)).startsWith("From "); //NON-NLS } /** * Parse the mbox file and get the email messages. * * @param mboxFile * * @return a list of the email messages in the mbox file. */ List<EmailMessage> parse(File mboxFile, long fileID) { // Detect possible charsets List<CharsetEncoder> encoders = getPossibleEncoders(mboxFile); CharsetEncoder theEncoder = null; Iterable<CharBufferWrapper> mboxIterator = null; // Loop through the possible encoders and find the first one that works. // That will usually be one of the first ones. for (CharsetEncoder encoder : encoders) { try { mboxIterator = MboxIterator.fromFile(mboxFile).charset(encoder.charset()).build(); theEncoder = encoder; break; } catch (CharConversionException | UnsupportedCharsetException ex) { // Not the right encoder } catch (IllegalArgumentException ex) { // Not the right encoder } catch (IOException ex) { logger.log(Level.WARNING, "couldn't find mbox file.", ex); //NON-NLS addErrorMessage(NbBundle.getMessage(this.getClass(), "MboxParser.parse.errMsg.failedToReadFile")); return new ArrayList<>(); } } // If no encoders work, post an error message and return. if (mboxIterator == null || theEncoder == null) { addErrorMessage(NbBundle.getMessage(this.getClass(), "MboxParser.parse.errMsg.couldntFindCharset")); return new ArrayList<>(); } List<EmailMessage> emails = new ArrayList<>(); long failCount = 0; // Parse each message and extract an EmailMessage structure for (CharBufferWrapper message : mboxIterator) { try { Message msg = messageBuilder.parseMessage(message.asInputStream(theEncoder.charset())); emails.add(extractEmail(msg, fileID)); } catch (RuntimeException | IOException ex) { logger.log(Level.WARNING, "Failed to get message from mbox: {0}", ex.getMessage()); //NON-NLS failCount++; } } if (failCount > 0) { addErrorMessage( NbBundle.getMessage(this.getClass(), "MboxParser.parse.errMsg.failedToParseNMsgs", failCount)); } return emails; } String getErrors() { return errors.toString(); } /** * Use the information stored in the given mime4j message to populate an * EmailMessage. * * @param msg * * @return */ private EmailMessage extractEmail(Message msg, long fileID) { EmailMessage email = new EmailMessage(); // Basic Info email.setSender(getAddresses(msg.getFrom())); email.setRecipients(getAddresses(msg.getTo())); email.setBcc(getAddresses(msg.getBcc())); email.setCc(getAddresses(msg.getCc())); email.setSubject(msg.getSubject()); email.setSentDate(msg.getDate()); email.setLocalPath(localPath); // Body if (msg.isMultipart()) { handleMultipart(email, (Multipart) msg.getBody(), fileID); } else { handleTextBody(email, (TextBody) msg.getBody(), msg.getMimeType(), msg.getHeader().getFields()); } return email; } /** * Handle a multipart mime message. Recursively calls handleMultipart if one * of the body parts is another multipart. Otherwise, calls the correct * method to extract information out of each part of the body. * * @param email * @param multi */ private void handleMultipart(EmailMessage email, Multipart multi, long fileID) { List<Entity> entities = multi.getBodyParts(); for (int index = 0; index < entities.size(); index++) { Entity e = entities.get(index); if (e.isMultipart()) { handleMultipart(email, (Multipart) e.getBody(), fileID); } else if (e.getDispositionType() != null && e.getDispositionType().equals(ContentDispositionField.DISPOSITION_TYPE_ATTACHMENT)) { handleAttachment(email, e, fileID, index); } else if (e.getMimeType().equals(HTML_TYPE) || e.getMimeType().equals(ContentTypeField.TYPE_TEXT_PLAIN)) { handleTextBody(email, (TextBody) e.getBody(), e.getMimeType(), e.getHeader().getFields()); } else { // Ignore other types. } } } /** * Extract text out of a body part of the message. * * Handles text and html mime types. Throws away all other types. (only * other example I've seen is text/calendar) * * @param email * @param tb * @param type The Mime type of the body. */ private void handleTextBody(EmailMessage email, TextBody tb, String type, List<Field> fields) { BufferedReader r; try { r = new BufferedReader(tb.getReader()); StringBuilder bodyString = new StringBuilder(); String line; while ((line = r.readLine()) != null) { bodyString.append(line).append("\n"); } bodyString.append("\n-----HEADERS-----\n"); for(Field field: fields) { String nextLine = field.getName() + ": " + field.getBody(); bodyString.append("\n").append(nextLine); } bodyString.append("\n\n---END HEADERS--\n\n"); switch (type) { case ContentTypeField.TYPE_TEXT_PLAIN: email.setTextBody(bodyString.toString()); break; case HTML_TYPE: email.setHtmlBody(bodyString.toString()); break; default: // Not interested in other text types. break; } } catch (IOException ex) { logger.log(Level.WARNING, "Error getting text body of mbox message", ex); //NON-NLS } } /** * Extract the attachment out of the given entity. Should only be called if * e.getDispositionType() == "attachment" * * @param email * @param e */ private void handleAttachment(EmailMessage email, Entity e, long fileID, int index) { String outputDirPath = ThunderbirdMboxFileIngestModule.getModuleOutputPath() + File.separator; String filename = e.getFilename(); // sanitize name. Had an attachment with a Japanese encoded path that // invalid characters and attachment could not be saved. filename = filename.replaceAll("\\?", "_"); filename = filename.replaceAll("<", "_"); filename = filename.replaceAll(">", "_"); filename = filename.replaceAll(":", "_"); filename = filename.replaceAll("\"", "_"); filename = filename.replaceAll("/", "_"); filename = filename.replaceAll("\\\\", "_"); filename = filename.replaceAll("|", "_"); filename = filename.replaceAll("\\*", "_"); // also had some crazy long names, so make random one if we get those. // also from Japanese image that had encoded name if (filename.length() > 64) { filename = UUID.randomUUID().toString(); } String uniqueFilename = fileID + "-" + index + "-" + email.getSentDate() + "-" + filename; String outPath = outputDirPath + uniqueFilename; EncodedFileOutputStream fos; BinaryBody bb; try { fos = new EncodedFileOutputStream(new FileOutputStream(outPath), TskData.EncodingType.XOR1); } catch (IOException ex) { addErrorMessage( NbBundle.getMessage(this.getClass(), "MboxParser.handleAttch.errMsg.failedToCreateOnDisk", outPath)); logger.log(Level.INFO, "Failed to create file output stream for: " + outPath, ex); //NON-NLS return; } try { Body b = e.getBody(); if (b instanceof BinaryBody) { bb = (BinaryBody) b; bb.writeTo(fos); } else { // This could potentially be other types. Only seen this once. } } catch (IOException ex) { logger.log(Level.INFO, "Failed to write mbox email attachment to disk.", ex); //NON-NLS addErrorMessage(NbBundle.getMessage(this.getClass(), "MboxParser.handleAttch.failedWriteToDisk", filename)); return; } finally { try { fos.close(); } catch (IOException ex) { logger.log(Level.INFO, "Failed to close file output stream", ex); //NON-NLS } } EmailMessage.Attachment attach = new EmailMessage.Attachment(); attach.setName(filename); attach.setLocalPath(ThunderbirdMboxFileIngestModule.getRelModuleOutputPath() + File.separator + uniqueFilename); attach.setSize(new File(outPath).length()); attach.setEncodingType(TskData.EncodingType.XOR1); email.addAttachment(attach); } /** * Get a String representation of the MailboxList (which is a list of email * addresses). * * @param mailboxList * * @return */ private String getAddresses(MailboxList mailboxList) { if (mailboxList == null) { return ""; } StringBuilder addresses = new StringBuilder(); for (Mailbox m : mailboxList) { addresses.append(m.toString()).append("; "); } return addresses.toString(); } /** * Get a String representation of the AddressList (which is a list of email * addresses). * * @param addressList * * @return */ private String getAddresses(AddressList addressList) { return (addressList == null) ? "" : getAddresses(addressList.flatten()); } /** * Get a list of the possible encoders for the given mboxFile using Tika's * CharsetDetector. At a minimum, returns the standard built in charsets. * * @param mboxFile * * @return */ private List<CharsetEncoder> getPossibleEncoders(File mboxFile) { InputStream is; List<CharsetEncoder> possibleEncoders = new ArrayList<>(); possibleEncoders.add(StandardCharsets.ISO_8859_1.newEncoder()); possibleEncoders.add(StandardCharsets.US_ASCII.newEncoder()); possibleEncoders.add(StandardCharsets.UTF_16.newEncoder()); possibleEncoders.add(StandardCharsets.UTF_16BE.newEncoder()); possibleEncoders.add(StandardCharsets.UTF_16LE.newEncoder()); possibleEncoders.add(StandardCharsets.UTF_8.newEncoder()); try { is = new BufferedInputStream(new FileInputStream(mboxFile)); } catch (FileNotFoundException ex) { logger.log(Level.WARNING, "Failed to find mbox file while detecting charset"); //NON-NLS return possibleEncoders; } try { CharsetDetector detector = new CharsetDetector(); detector.setText(is); CharsetMatch[] matches = detector.detectAll(); for (CharsetMatch match : matches) { try { possibleEncoders.add(Charset.forName(match.getName()).newEncoder()); } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) { // Don't add unsupported charsets to the list } } return possibleEncoders; } catch (IOException | IllegalArgumentException ex) { logger.log(Level.WARNING, "Failed to detect charset of mbox file.", ex); //NON-NLS return possibleEncoders; } finally { try { is.close(); } catch (IOException ex) { logger.log(Level.INFO, "Failed to close input stream"); //NON-NLS } } } private void addErrorMessage(String msg) { errors.append("<li>").append(msg).append("</li>"); //NON-NLS } }