/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.processors.media;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;

import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
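/*
 * For orientation, the core Tika interaction this processor wraps can be
 * exercised standalone. This is an illustrative sketch only, not part of the
 * processor; the file path is hypothetical.
 *
 *   AutoDetectParser parser = new AutoDetectParser();
 *   Metadata metadata = new Metadata();
 *   try (InputStream in = TikaInputStream.get(new java.io.File("/tmp/example.mp3"))) {
 *       parser.parse(in, new DefaultHandler(), metadata);
 *   }
 *   for (String key : metadata.names()) {
 *       System.out.println(key + " = " + metadata.get(key));
 *   }
 */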
@InputRequirement(Requirement.INPUT_REQUIRED)
@Tags({"media", "file", "format", "metadata", "audio", "video", "image", "document", "pdf"})
@CapabilityDescription("Extract the content metadata from flowfiles containing audio, video, image, and other file "
        + "types. This processor relies on the Apache Tika project for file format detection and parsing. It "
        + "extracts a long list of metadata types for media files including audio, video, and print media "
        + "formats. "
        + "NOTE: the attribute names and content extracted may vary across upgrades because parsing is performed by "
        + "the external Tika tools, which in turn depend on other projects for metadata extraction. For more "
        + "details and the list of supported file types, visit the library's website at http://tika.apache.org/.")
@WritesAttributes({@WritesAttribute(attribute = "<Metadata Key Prefix><attribute>",
        description = "The extracted content metadata will be inserted with the attribute name "
                + "\"<Metadata Key Prefix><attribute>\", or \"<attribute>\" if \"Metadata Key Prefix\" is not provided.")})
@SupportsBatching
public class ExtractMediaMetadata extends AbstractProcessor {

    static final PropertyDescriptor MAX_NUMBER_OF_ATTRIBUTES = new PropertyDescriptor.Builder()
            .name("Max Number of Attributes")
            .description("Specifies the maximum number of attributes to add to the flowfile. There is no guarantee in"
                    + " what order the tags will be processed. Up to 100 attributes are added by default.")
            .required(false)
            .defaultValue("100")
            .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
            .build();

    private static final PropertyDescriptor MAX_ATTRIBUTE_LENGTH = new PropertyDescriptor.Builder()
            .name("Max Attribute Length")
            .description("Specifies the maximum length of a single attribute value. When a metadata item has multiple"
                    + " values, they will be merged until this length is reached and then \", ...\" will be added as"
                    + " an indicator that additional values were dropped. If a single value is longer than this, it"
                    + " will be truncated and \"(truncated)\" appended to indicate that truncation occurred.")
            .required(true)
            .defaultValue("100")
            .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
            .build();

    static final PropertyDescriptor METADATA_KEY_FILTER = new PropertyDescriptor.Builder()
            .name("Metadata Key Filter")
            .description("A regular expression identifying which metadata keys received from the parser should be"
                    + " added to the flowfile attributes. If left blank, all metadata keys parsed will be added to the"
                    + " flowfile attributes.")
            .required(false)
            .addValidator(StandardValidators.REGULAR_EXPRESSION_VALIDATOR)
            .build();

    static final PropertyDescriptor METADATA_KEY_PREFIX = new PropertyDescriptor.Builder()
            .name("Metadata Key Prefix")
            .description("Text to be prefixed to metadata keys as they are added to the flowfile attributes. It is"
                    + " recommended to end with a separator character like '.' or '-'; one is not automatically"
                    + " added by the processor.")
            .required(false)
            .addValidator(StandardValidators.ATTRIBUTE_KEY_VALIDATOR)
            .expressionLanguageSupported(true)
            .build();
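    /*
     * Illustrative configuration sketch (the values here are examples, not
     * defaults): with "Metadata Key Filter" set to "Content-.*" and
     * "Metadata Key Prefix" set to "media.", a Tika metadata item such as
     * Content-Type=audio/mpeg would surface on the flowfile as the attribute
     * "media.Content-Type" with value "audio/mpeg"; keys not matching the
     * filter would be dropped.
     */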
    static final Relationship SUCCESS = new Relationship.Builder()
            .name("success")
            .description("Any FlowFile that successfully has media metadata extracted will be routed to success")
            .build();

    static final Relationship FAILURE = new Relationship.Builder()
            .name("failure")
            .description("Any FlowFile that fails to have media metadata extracted will be routed to failure")
            .build();

    private Set<Relationship> relationships;
    private List<PropertyDescriptor> properties;

    private final AtomicReference<Pattern> metadataKeyFilterRef = new AtomicReference<>();

    private volatile AutoDetectParser autoDetectParser;

    @Override
    protected void init(final ProcessorInitializationContext context) {
        final List<PropertyDescriptor> properties = new ArrayList<>();
        properties.add(MAX_NUMBER_OF_ATTRIBUTES);
        properties.add(MAX_ATTRIBUTE_LENGTH);
        properties.add(METADATA_KEY_FILTER);
        properties.add(METADATA_KEY_PREFIX);
        this.properties = Collections.unmodifiableList(properties);

        final Set<Relationship> relationships = new HashSet<>();
        relationships.add(SUCCESS);
        relationships.add(FAILURE);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return this.relationships;
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return this.properties;
    }

    @SuppressWarnings("unused")
    @OnScheduled
    public void onScheduled(ProcessContext context) {
        // Compile the key filter once per schedule rather than once per flowfile
        String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue();
        if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) {
            metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput));
        } else {
            metadataKeyFilterRef.set(null);
        }

        autoDetectParser = new AutoDetectParser();
    }

    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        final ComponentLog logger = this.getLogger();
        final AtomicReference<Map<String, String>> value = new AtomicReference<>(null);
        final Integer maxAttribCount = context.getProperty(MAX_NUMBER_OF_ATTRIBUTES).asInteger();
        final Integer maxAttribLength = context.getProperty(MAX_ATTRIBUTE_LENGTH).asInteger();
        final String prefix = context.getProperty(METADATA_KEY_PREFIX).evaluateAttributeExpressions(flowFile).getValue();

        try {
            // Parse the content and capture the extracted metadata for use outside the callback
            session.read(flowFile, new InputStreamCallback() {
                @Override
                public void process(InputStream in) throws IOException {
                    try {
                        Map<String, String> results = tikaParse(in, prefix, maxAttribCount, maxAttribLength);
                        value.set(results);
                    } catch (SAXException | TikaException e) {
                        throw new IOException(e);
                    }
                }
            });

            // Write the results to attributes
            Map<String, String> results = value.get();
            if (results != null && !results.isEmpty()) {
                flowFile = session.putAllAttributes(flowFile, results);
            }

            session.transfer(flowFile, SUCCESS);
            session.getProvenanceReporter().modifyAttributes(flowFile, "media attributes extracted");
        } catch (ProcessException e) {
            logger.error("Failed to extract media metadata from {} due to {}", new Object[]{flowFile, e});
            flowFile = session.penalize(flowFile);
            session.transfer(flowFile, FAILURE);
        }
    }
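    /*
     * Behavior sketch for the merging logic below (values are illustrative):
     * with Max Attribute Length 10, a multi-valued key with values
     * {"alpha", "beta", "gamma"} is merged as "alpha, ..." because appending
     * "beta" would exceed the limit, while a single 30-character value is cut
     * to its first 10 characters with "(truncated)" appended.
     */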
    private Map<String, String> tikaParse(InputStream sourceStream, String prefix, Integer maxAttribs,
                                          Integer maxAttribLen) throws IOException, TikaException, SAXException {
        final Metadata metadata = new Metadata();
        final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream);
        try {
            autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
        } finally {
            tikaInputStream.close();
        }

        final Map<String, String> results = new HashMap<>();
        final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
        final StringBuilder dataBuilder = new StringBuilder();
        for (final String key : metadata.names()) {
            if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
                continue;
            }
            dataBuilder.setLength(0);
            if (metadata.isMultiValued(key)) {
                // Merge multiple values into one comma-separated attribute value,
                // stopping with ", ..." once the configured length would be exceeded
                for (String val : metadata.getValues(key)) {
                    if (dataBuilder.length() > 0) {
                        dataBuilder.append(", ");
                    }
                    if (dataBuilder.length() + val.length() < maxAttribLen) {
                        dataBuilder.append(val);
                    } else {
                        dataBuilder.append("...");
                        break;
                    }
                }
            } else {
                // Truncate a single over-long value, as documented for "Max Attribute Length"
                final String val = metadata.get(key);
                if (val != null && val.length() > maxAttribLen) {
                    dataBuilder.append(val, 0, maxAttribLen).append("(truncated)");
                } else {
                    dataBuilder.append(val);
                }
            }
            if (prefix == null) {
                results.put(key, dataBuilder.toString().trim());
            } else {
                results.put(prefix + key, dataBuilder.toString().trim());
            }

            // cutoff at max if provided
            if (maxAttribs != null && results.size() >= maxAttribs) {
                break;
            }
        }
        return results;
    }
}
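/*
 * Illustrative usage sketch with the NiFi mock framework (nifi-mock), assuming
 * a test in the same package; the sample file path is hypothetical.
 *
 *   TestRunner runner = TestRunners.newTestRunner(new ExtractMediaMetadata());
 *   runner.setProperty(ExtractMediaMetadata.METADATA_KEY_FILTER, "Content-.*");
 *   runner.setProperty(ExtractMediaMetadata.METADATA_KEY_PREFIX, "media.");
 *   runner.enqueue(Paths.get("src/test/resources/example.mp3"));
 *   runner.run();
 *   runner.assertAllFlowFilesTransferred(ExtractMediaMetadata.SUCCESS, 1);
 *   MockFlowFile out = runner.getFlowFilesForRelationship(ExtractMediaMetadata.SUCCESS).get(0);
 *   out.assertAttributeExists("media.Content-Type");
 */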