/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.ingest.attachment; import org.apache.commons.io.IOUtils; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.ingest.IngestDocument; import org.elasticsearch.ingest.Processor; import org.elasticsearch.ingest.RandomDocumentPicks; import org.elasticsearch.test.ESTestCase; import org.junit.Before; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Base64; import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import static org.elasticsearch.ingest.IngestDocumentMatcher.assertIngestDocument; import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; import static org.hamcrest.core.IsCollectionContaining.hasItem; public class AttachmentProcessorTests extends ESTestCase { private AttachmentProcessor processor; @Before public void createStandardProcessor() throws IOException { processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false); } public void testEnglishTextDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\"")); assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); assertThat(attachmentData.get("content_length"), is(notNullValue())); } public void testHtmlDocumentWithRandomFields() throws Exception { //date is not present in the html doc ArrayList<AttachmentProcessor.Property> fieldsList = new ArrayList<>(EnumSet.complementOf(EnumSet.of (AttachmentProcessor.Property.DATE))); Set<AttachmentProcessor.Property> selectedProperties = new HashSet<>(); int numFields = randomIntBetween(1, fieldsList.size()); String[] selectedFieldNames = new String[numFields]; for (int i = 0; i < numFields; i++) { AttachmentProcessor.Property property; do { property = randomFrom(fieldsList); } while (selectedProperties.add(property) == false); selectedFieldNames[i] = property.toLowerCase(); } if (randomBoolean()) { selectedProperties.add(AttachmentProcessor.Property.DATE); } processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "target_field", selectedProperties, 10000, false); Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor); assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length)); assertThat(attachmentData.keySet(), containsInAnyOrder(selectedFieldNames)); } public void testFrenchTextDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("text-in-french.txt", processor); assertThat(attachmentData.keySet(), hasItem("language")); assertThat(attachmentData.get("language"), is("fr")); } public void testUnknownLanguageDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("text-gibberish.txt", processor); assertThat(attachmentData.keySet(), hasItem("language")); // lt seems some standard for not detected assertThat(attachmentData.get("language"), is("lt")); } public void testEmptyTextDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("text-empty.txt", processor); assertThat(attachmentData.keySet(), not(hasItem("language"))); } public void testWordDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("issue-104.docx", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length")); assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z")); assertThat(attachmentData.get("author"), is("Windows User")); assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); } public void testWordDocumentWithVisioSchema() throws Exception { Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length")); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); assertThat(attachmentData.get("author"), is(notNullValue())); assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); } public void testLegacyWordDocumentWithVisioSchema() throws Exception { Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length")); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); assertThat(attachmentData.get("author"), is(notNullValue())); assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/msword")); } public void testPdf() throws Exception { Map<String, Object> attachmentData = parseDocument("test.pdf", processor); assertThat(attachmentData.get("content"), is("This is a test, with umlauts, from München\n\nAlso contains newlines for testing.\n\nAnd one more.")); assertThat(attachmentData.get("content_type").toString(), is("application/pdf")); assertThat(attachmentData.get("content_length"), is(notNullValue())); } public void testVisioIsExcluded() throws Exception { Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor); assertThat(attachmentData.get("content"), nullValue()); assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing")); assertThat(attachmentData.get("content_length"), is(0L)); } public void testEncryptedPdf() throws Exception { ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor)); assertThat(e.getDetailedMessage(), containsString("document is encrypted")); } public void testHtmlDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "keywords", "title", "content_type", "content_length")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("author"), is("kimchy")); assertThat(attachmentData.get("keywords"), is("elasticsearch,cool,bonsai")); assertThat(attachmentData.get("title"), is("Hello")); assertThat(attachmentData.get("content_type").toString(), containsString("text/html")); } public void testXHtmlDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("testXHTML.html", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length")); assertThat(attachmentData.get("content_type").toString(), containsString("application/xhtml+xml")); } public void testEpubDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("testEPUB.epub", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords")); assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip")); } // no real detection, just rudimentary public void testAsciidocDocument() throws Exception { Map<String, Object> attachmentData = parseDocument("asciidoc.asciidoc", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content_type", "content", "content_length")); assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); } public void testParseAsBytesArray() throws Exception { String path = "/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt"; byte[] bytes; try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) { bytes = IOUtils.toByteArray(is); } Map<String, Object> document = new HashMap<>(); document.put("source_field", bytes); IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); processor.execute(ingestDocument); @SuppressWarnings("unchecked") Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("target_field"); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\"")); assertThat(attachmentData.get("content_type").toString(), containsString("text/plain")); assertThat(attachmentData.get("content_length"), is(notNullValue())); } public void testNullValueWithIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.singletonMap("source_field", null)); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true); processor.execute(ingestDocument); assertIngestDocument(originalIngestDocument, ingestDocument); } public void testNonExistentWithIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap()); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true); processor.execute(ingestDocument); assertIngestDocument(originalIngestDocument, ingestDocument); } public void testNullWithoutIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.singletonMap("source_field", null)); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false); Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument)); assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse.")); } public void testNonExistentWithoutIgnoreMissing() throws Exception { IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap()); IngestDocument ingestDocument = new IngestDocument(originalIngestDocument); Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false); Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument)); assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]")); } private Map<String, Object> parseDocument(String file, AttachmentProcessor processor) throws Exception { Map<String, Object> document = new HashMap<>(); document.put("source_field", getAsBase64(file)); IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document); processor.execute(ingestDocument); @SuppressWarnings("unchecked") Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata().get("target_field"); return attachmentData; } protected String getAsBase64(String filename) throws Exception { String path = "/org/elasticsearch/ingest/attachment/test/sample-files/" + filename; try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) { byte bytes[] = IOUtils.toByteArray(is); return Base64.getEncoder().encodeToString(bytes); } } }