/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.mapper.attachments;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.core.MapperTestUtils;
import org.junit.Before;
import java.io.IOException;
import java.io.InputStream;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.mapper.attachments.AttachmentMapper.FieldNames.*;
import static org.elasticsearch.test.StreamsUtils.copyToBytesFromClasspath;
import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath;
import static org.hamcrest.Matchers.isEmptyOrNullString;
import static org.hamcrest.Matchers.not;
/**
* Test for different documents
*/
public class VariousDocTests extends AttachmentUnitTestCase {
protected DocumentMapper docMapper;
@Before
public void createMapper() throws IOException {
DocumentMapperParser mapperParser = MapperTestUtils.newMapperService(createTempDir(), Settings.EMPTY, getIndicesModuleWithRegisteredAttachmentMapper()).documentMapperParser();
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/attachment/test/unit/various-doc/test-mapping.json");
docMapper = mapperParser.parse("person", new CompressedXContent(mapping));
}
/**
* Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/104
*/
public void testWordDocxDocument104() throws Exception {
assertParseable("issue-104.docx");
testMapper("issue-104.docx", false);
}
/**
* Test for encrypted PDF
*/
public void testEncryptedPDFDocument() throws Exception {
assertException("encrypted.pdf", "is encrypted");
// TODO Remove when this will be fixed in Tika. See https://issues.apache.org/jira/browse/TIKA-1548
System.clearProperty("sun.font.fontmanager");
testMapper("encrypted.pdf", true);
}
/**
* Test for HTML
*/
public void testHtmlDocument() throws Exception {
assertParseable("htmlWithEmptyDateMeta.html");
testMapper("htmlWithEmptyDateMeta.html", false);
}
/**
* Test for XHTML
*/
public void testXHtmlDocument() throws Exception {
assertParseable("testXHTML.html");
testMapper("testXHTML.html", false);
}
/**
* Test for TXT
*/
public void testTxtDocument() throws Exception {
assertParseable("text-in-english.txt");
testMapper("text-in-english.txt", false);
}
/**
* Test for .epub
*/
public void testEpubDocument() throws Exception {
assertParseable("testEPUB.epub");
testMapper("testEPUB.epub", false);
}
/**
* Test for ASCIIDOC
* Not yet supported by Tika: https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/29
*/
public void testAsciidocDocument() throws Exception {
assertParseable("asciidoc.asciidoc");
testMapper("asciidoc.asciidoc", false);
}
void assertException(String filename, String expectedMessage) throws Exception {
try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename)) {
byte bytes[] = IOUtils.toByteArray(is);
TikaImpl.parse(bytes, new Metadata(), -1);
fail("expected exception");
} catch (Exception e) {
if (e.getMessage() != null && e.getMessage().contains(expectedMessage)) {
// ok
} else {
// unexpected
throw e;
}
}
}
protected void assertParseable(String filename) throws Exception {
try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename)) {
byte bytes[] = IOUtils.toByteArray(is);
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
assertThat(parsedContent, not(isEmptyOrNullString()));
logger.debug("extracted content: {}", parsedContent);
}
}
protected void testMapper(String filename, boolean errorExpected) throws IOException {
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename);
BytesReference json = jsonBuilder()
.startObject()
.startObject("file")
.field("_name", filename)
.field("_content", html)
.endObject()
.endObject().bytes();
ParseContext.Document doc = docMapper.parse("person", "person", "1", json).rootDoc();
if (!errorExpected) {
assertThat(doc.get(docMapper.mappers().getMapper("file.content").fieldType().names().indexName()), not(isEmptyOrNullString()));
logger.debug("-> extracted content: {}", doc.get(docMapper.mappers().getMapper("file").fieldType().names().indexName()));
logger.debug("-> extracted metadata:");
printMetadataContent(doc, AUTHOR);
printMetadataContent(doc, CONTENT_LENGTH);
printMetadataContent(doc, CONTENT_TYPE);
printMetadataContent(doc, DATE);
printMetadataContent(doc, KEYWORDS);
printMetadataContent(doc, LANGUAGE);
printMetadataContent(doc, NAME);
printMetadataContent(doc, TITLE);
}
}
private void printMetadataContent(ParseContext.Document doc, String field) {
logger.debug("- [{}]: [{}]", field, doc.get(docMapper.mappers().getMapper("file." + field).fieldType().names().indexName()));
}
}