/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.ingest.attachment; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.Strings; import org.elasticsearch.ingest.AbstractProcessor; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; import java.io.IOException; import java.util.Arrays; import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import static org.elasticsearch.ingest.ConfigurationUtils.newConfigurationException; import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty; import static org.elasticsearch.ingest.ConfigurationUtils.readIntProperty; import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList; import static org.elasticsearch.ingest.ConfigurationUtils.readStringProperty; public final class AttachmentProcessor extends AbstractProcessor { public static final String TYPE = "attachment"; private static final int NUMBER_OF_CHARS_INDEXED = 100000; private final String field; private final String targetField; private final Set<Property> properties; private final int indexedChars; private final boolean ignoreMissing; AttachmentProcessor(String tag, String field, String targetField, Set<Property> properties, int indexedChars, boolean ignoreMissing) throws IOException { super(tag); this.field = field; this.targetField = targetField; this.properties = properties; this.indexedChars = indexedChars; this.ignoreMissing = ignoreMissing; } boolean isIgnoreMissing() { return ignoreMissing; } @Override public void execute(IngestDocument ingestDocument) { Map<String, Object> additionalFields = new HashMap<>(); byte[] input = ingestDocument.getFieldValueAsBytes(field, ignoreMissing); if (input == null && ignoreMissing) { return; } else if (input == null) { throw new IllegalArgumentException("field [" + field + "] is null, cannot parse."); } try { Metadata metadata = new Metadata(); String parsedContent = TikaImpl.parse(input, metadata, indexedChars); if (properties.contains(Property.CONTENT) && Strings.hasLength(parsedContent)) { // somehow tika seems to append a newline at the end automatically, lets remove that again additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim()); } if (properties.contains(Property.LANGUAGE) && Strings.hasLength(parsedContent)) { LanguageIdentifier identifier = new LanguageIdentifier(parsedContent); String language = identifier.getLanguage(); additionalFields.put(Property.LANGUAGE.toLowerCase(), language); } if (properties.contains(Property.DATE)) { String createdDate = metadata.get(TikaCoreProperties.CREATED); if (createdDate != null) { additionalFields.put(Property.DATE.toLowerCase(), createdDate); } } if (properties.contains(Property.TITLE)) { String title = metadata.get(TikaCoreProperties.TITLE); if (Strings.hasLength(title)) { additionalFields.put(Property.TITLE.toLowerCase(), title); } } if (properties.contains(Property.AUTHOR)) { String author = metadata.get("Author"); if (Strings.hasLength(author)) { additionalFields.put(Property.AUTHOR.toLowerCase(), author); } } if (properties.contains(Property.KEYWORDS)) { String keywords = metadata.get("Keywords"); if (Strings.hasLength(keywords)) { additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords); } } if (properties.contains(Property.CONTENT_TYPE)) { String contentType = metadata.get(Metadata.CONTENT_TYPE); if (Strings.hasLength(contentType)) { additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType); } } if (properties.contains(Property.CONTENT_LENGTH)) { String contentLength = metadata.get(Metadata.CONTENT_LENGTH); long length; if (Strings.hasLength(contentLength)) { length = Long.parseLong(contentLength); } else { length = parsedContent.length(); } additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length); } } catch (Exception e) { throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field); } ingestDocument.setFieldValue(targetField, additionalFields); } @Override public String getType() { return TYPE; } String getField() { return field; } String getTargetField() { return targetField; } Set<Property> getProperties() { return properties; } int getIndexedChars() { return indexedChars; } public static final class Factory implements Processor.Factory { static final Set<Property> DEFAULT_PROPERTIES = EnumSet.allOf(Property.class); @Override public AttachmentProcessor create(Map<String, Processor.Factory> registry, String processorTag, Map<String, Object> config) throws Exception { String field = readStringProperty(TYPE, processorTag, config, "field"); String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment"); List<String> properyNames = readOptionalList(TYPE, processorTag, config, "properties"); int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED); boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false); final Set<Property> properties; if (properyNames != null) { properties = EnumSet.noneOf(Property.class); for (String fieldName : properyNames) { try { properties.add(Property.parse(fieldName)); } catch (Exception e) { throw newConfigurationException(TYPE, processorTag, "properties", "illegal field option [" + fieldName + "]. valid values are " + Arrays.toString(Property.values())); } } } else { properties = DEFAULT_PROPERTIES; } return new AttachmentProcessor(processorTag, field, targetField, properties, indexedChars, ignoreMissing); } } enum Property { CONTENT, TITLE, AUTHOR, KEYWORDS, DATE, CONTENT_TYPE, CONTENT_LENGTH, LANGUAGE; public static Property parse(String value) { return valueOf(value.toUpperCase(Locale.ROOT)); } public String toLowerCase() { return this.toString().toLowerCase(Locale.ROOT); } } }