/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi.processors.email; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.mail.util.MimeMessageParser; import org.apache.nifi.annotation.behavior.EventDriven; import org.apache.nifi.annotation.behavior.InputRequirement; import org.apache.nifi.annotation.behavior.InputRequirement.Requirement; import org.apache.nifi.annotation.behavior.SideEffectFree; import org.apache.nifi.annotation.behavior.SupportsBatching; import org.apache.nifi.annotation.behavior.WritesAttribute; import org.apache.nifi.annotation.behavior.WritesAttributes; import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.components.PropertyDescriptor; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.logging.ComponentLog; import org.apache.nifi.processor.AbstractProcessor; import org.apache.nifi.processor.ProcessContext; import org.apache.nifi.processor.ProcessSession; import org.apache.nifi.processor.ProcessorInitializationContext; import org.apache.nifi.processor.Relationship; import org.apache.nifi.processor.io.InputStreamCallback; import org.apache.nifi.processor.util.StandardValidators; import org.apache.nifi.stream.io.BufferedInputStream; import javax.mail.Address; import javax.mail.Header; import javax.mail.Message; import javax.mail.MessagingException; import javax.mail.Session; import javax.mail.internet.MimeMessage; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Date; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; @SupportsBatching @EventDriven @SideEffectFree @Tags({"split", "email"}) @InputRequirement(Requirement.INPUT_REQUIRED) @CapabilityDescription("Using the flowfile content as source of data, extract header from an RFC compliant email file adding the relevant attributes to the flowfile. " + "This processor does not perform extensive RFC validation but still requires a bare minimum compliance with RFC 2822") @WritesAttributes({ @WritesAttribute(attribute = "email.headers.bcc.*", description = "Each individual BCC recipient (if available)"), @WritesAttribute(attribute = "email.headers.cc.*", description = "Each individual CC recipient (if available)"), @WritesAttribute(attribute = "email.headers.from.*", description = "Each individual mailbox contained in the From of the Email (array as per RFC-2822)"), @WritesAttribute(attribute = "email.headers.message-id", description = "The value of the Message-ID header (if available)"), @WritesAttribute(attribute = "email.headers.received_date", description = "The Received-Date of the message (if available)"), @WritesAttribute(attribute = "email.headers.sent_date", description = "Date the message was sent"), @WritesAttribute(attribute = "email.headers.subject", description = "Subject of the message (if available)"), @WritesAttribute(attribute = "email.headers.to.*", description = "Each individual TO recipient (if available)"), @WritesAttribute(attribute = "email.attachment_count", description = "Number of attachments of the message" )}) public class ExtractEmailHeaders extends AbstractProcessor { public static final String EMAIL_HEADER_BCC = "email.headers.bcc"; public static final String EMAIL_HEADER_CC = "email.headers.cc"; public static final String EMAIL_HEADER_FROM = "email.headers.from"; public static final String EMAIL_HEADER_MESSAGE_ID = "email.headers.message-id"; public static final String EMAIL_HEADER_RECV_DATE = "email.headers.received_date"; public static final String EMAIL_HEADER_SENT_DATE = "email.headers.sent_date"; public static final String EMAIL_HEADER_SUBJECT = "email.headers.subject"; public static final String EMAIL_HEADER_TO = "email.headers.to"; public static final String EMAIL_ATTACHMENT_COUNT = "email.attachment_count"; public static final PropertyDescriptor CAPTURED_HEADERS = new PropertyDescriptor.Builder() .name("CAPTURED_HEADERS") .displayName("Additional Header List") .description("COLON separated list of additional headers to be extracted from the flowfile content." + "NOTE the header key is case insensitive and will be matched as lower-case." + " Values will respect email contents.") .required(false) .expressionLanguageSupported(false) .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .defaultValue("x-mailer") .build(); public static final Relationship REL_SUCCESS = new Relationship.Builder() .name("success") .description("Extraction was successful") .build(); public static final Relationship REL_FAILURE = new Relationship.Builder() .name("failure") .description("Flowfiles that could not be parsed as a RFC-2822 compliant message") .build(); private Set<Relationship> relationships; private List<PropertyDescriptor> descriptors; @Override protected void init(final ProcessorInitializationContext context) { final Set<Relationship> relationships = new HashSet<>(); relationships.add(REL_SUCCESS); relationships.add(REL_FAILURE); this.relationships = Collections.unmodifiableSet(relationships); final List<PropertyDescriptor> descriptors = new ArrayList<>(); descriptors.add(CAPTURED_HEADERS); this.descriptors = Collections.unmodifiableList(descriptors); } @Override public void onTrigger(final ProcessContext context, final ProcessSession session) { final ComponentLog logger = getLogger(); final List<FlowFile> invalidFlowFilesList = new ArrayList<>(); final List<FlowFile> processedFlowFilesList = new ArrayList<>(); final FlowFile originalFlowFile = session.get(); if (originalFlowFile == null) { return; } final List<String> capturedHeadersList = Arrays.asList(context.getProperty(CAPTURED_HEADERS).getValue().toLowerCase().split(":")); final Map<String, String> attributes = new HashMap<>(); session.read(originalFlowFile, new InputStreamCallback() { @Override public void process(final InputStream rawIn) throws IOException { try (final InputStream in = new BufferedInputStream(rawIn)) { Properties props = new Properties(); Session mailSession = Session.getDefaultInstance(props, null); MimeMessage originalMessage = new MimeMessage(mailSession, in); MimeMessageParser parser = new MimeMessageParser(originalMessage).parse(); // RFC-2822 determines that a message must have a "From:" header // if a message lacks the field, it is flagged as invalid Address[] from = originalMessage.getFrom(); Date sentDate = originalMessage.getSentDate(); if (from == null || sentDate == null ) { // Throws MessageException due to lack of minimum required headers throw new MessagingException("Message failed RFC2822 validation"); } else if (capturedHeadersList.size() > 0){ Enumeration headers = originalMessage.getAllHeaders(); while (headers.hasMoreElements()) { Header header = (Header) headers.nextElement(); if (StringUtils.isNotEmpty(header.getValue()) && capturedHeadersList.contains(header.getName().toLowerCase())) { attributes.put("email.headers." + header.getName().toLowerCase(), header.getValue()); } } } if (Array.getLength(originalMessage.getAllRecipients()) > 0) { for (int toCount = 0; toCount < ArrayUtils.getLength(originalMessage.getRecipients(Message.RecipientType.TO)); toCount++) { attributes.put(EMAIL_HEADER_TO + "." + toCount, originalMessage.getRecipients(Message.RecipientType.TO)[toCount].toString()); } for (int toCount = 0; toCount < ArrayUtils.getLength(originalMessage.getRecipients(Message.RecipientType.BCC)); toCount++) { attributes.put(EMAIL_HEADER_BCC + "." + toCount, originalMessage.getRecipients(Message.RecipientType.BCC)[toCount].toString()); } for (int toCount = 0; toCount < ArrayUtils.getLength(originalMessage.getRecipients(Message.RecipientType.CC)); toCount++) { attributes.put(EMAIL_HEADER_CC + "." + toCount, originalMessage.getRecipients(Message.RecipientType.CC)[toCount].toString()); } } // Incredibly enough RFC-2822 specified From as a "mailbox-list" so an array I returned by getFrom for (int toCount = 0; toCount < ArrayUtils.getLength(originalMessage.getFrom()); toCount++) { attributes.put(EMAIL_HEADER_FROM + "." + toCount, originalMessage.getFrom()[toCount].toString()); } if (StringUtils.isNotEmpty(originalMessage.getMessageID())) { attributes.put(EMAIL_HEADER_MESSAGE_ID, originalMessage.getMessageID()); } if (originalMessage.getReceivedDate() != null) { attributes.put(EMAIL_HEADER_RECV_DATE, originalMessage.getReceivedDate().toString()); } if (originalMessage.getSentDate() != null) { attributes.put(EMAIL_HEADER_SENT_DATE, originalMessage.getSentDate().toString()); } if (StringUtils.isNotEmpty(originalMessage.getSubject())) { attributes.put(EMAIL_HEADER_SUBJECT, originalMessage.getSubject()); } // Zeroes EMAIL_ATTACHMENT_COUNT attributes.put(EMAIL_ATTACHMENT_COUNT, "0"); // But insert correct value if attachments are present if (parser.hasAttachments()) { attributes.put(EMAIL_ATTACHMENT_COUNT, String.valueOf(parser.getAttachmentList().size())); } } catch (Exception e) { // Message is invalid or triggered an error during parsing attributes.clear(); logger.error("Could not parse the flowfile {} as an email, treating as failure", new Object[]{originalFlowFile, e}); invalidFlowFilesList.add(originalFlowFile); } } }); if (attributes.size() > 0) { FlowFile updatedFlowFile = session.putAllAttributes(originalFlowFile, attributes); logger.info("Extracted {} headers into {} file", new Object[]{attributes.size(), updatedFlowFile}); processedFlowFilesList.add(updatedFlowFile); } session.transfer(processedFlowFilesList, REL_SUCCESS); session.transfer(invalidFlowFilesList, REL_FAILURE); } @Override public Set<Relationship> getRelationships() { return this.relationships; } @Override public final List<PropertyDescriptor> getSupportedPropertyDescriptors() { return descriptors; } }