/*
 * Licensed to David Pilato (the "Author") under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package fr.pilato.elasticsearch.crawler.fs.test.unit.parser;

import fr.pilato.elasticsearch.crawler.fs.meta.doc.Doc;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.Fs;
import fr.pilato.elasticsearch.crawler.fs.meta.settings.FsSettings;
import fr.pilato.elasticsearch.crawler.fs.tika.TikaDocParser;
import org.junit.Test;

import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.time.LocalDateTime;
import java.util.Map;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.emptyIterable;
import static org.hamcrest.Matchers.hasEntry;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeNoException;

/**
 * Unit tests for {@link TikaDocParser}.
 * <p>
 * Each test runs {@link TikaDocParser#generate} against a sample file and checks the
 * resulting {@link Doc}: extracted content, detected content type, the normalized meta
 * fields (author, date, keywords, title) and the raw metadata map.
 */
public class TikaDocParserTest extends DocParserTestCase {

    /**
     * Test case for https://github.com/dadoonet/fscrawler/issues/162
     * <p>
     * With language detection enabled, every sample file must report the expected
     * language; the mixed en/fr/de sample is expected to be reported as "fr".
     */
    @Test
    public void testLangDetect162() throws IOException {
        FsSettings fsSettings = FsSettings.builder(getCurrentTestName())
                .setFs(Fs.builder().setLangDetect(true).build())
                .build();

        Doc doc = extractFromFile("test.txt", fsSettings);
        assertThat(doc.getMeta().getLanguage(), is("en"));

        doc = extractFromFile("test-fr.txt", fsSettings);
        assertThat(doc.getMeta().getLanguage(), is("fr"));

        doc = extractFromFile("test-de.txt", fsSettings);
        assertThat(doc.getMeta().getLanguage(), is("de"));

        doc = extractFromFile("test-enfrde.txt", fsSettings);
        assertThat(doc.getMeta().getLanguage(), is("fr"));
    }

    /**
     * Test case for https://github.com/dadoonet/fscrawler/issues/221
     */
    @Test
    public void testPdfIssue221() throws IOException {
        // We test document 1
        Doc doc = extractFromFile("issue-221-doc1.pdf");

        // Extracted content
        assertThat(doc.getContent(), containsString("Formations"));

        // Content Type
        assertThat(doc.getFile().getContentType(), containsString("application/pdf"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is(notNullValue()));
        assertThat(doc.getMeta().getDate(), is(LocalDateTime.of(2016, 9, 20, 9, 38, 56)));
        assertThat(doc.getMeta().getKeywords(), not(emptyIterable()));
        assertThat(doc.getMeta().getTitle(), containsString("Recherche"));

        // We test document 2
        doc = extractFromFile("issue-221-doc2.pdf");

        // Extracted content
        assertThat(doc.getContent(), containsString("FORMATIONS"));

        // Content Type
        assertThat(doc.getFile().getContentType(), containsString("application/pdf"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is(nullValue()));
        assertThat(doc.getMeta().getDate(), is(LocalDateTime.of(2016, 9, 19, 14, 29, 37)));
        assertThat(doc.getMeta().getKeywords(), emptyIterable());
        assertThat(doc.getMeta().getTitle(), is(nullValue()));
    }

    /**
     * Test case for https://github.com/dadoonet/fscrawler/issues/163
     */
    @Test
    public void testXmlIssue163() throws IOException {
        Doc doc = extractFromFile("issue-163.xml");

        // Extracted content
        assertThat(doc.getContent(), is(" \n"));

        // Content Type
        assertThat(doc.getFile().getContentType(), containsString("application/xml"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is(nullValue()));
        assertThat(doc.getMeta().getDate(), is(nullValue()));
        assertThat(doc.getMeta().getKeywords(), emptyIterable());
        assertThat(doc.getMeta().getTitle(), is(nullValue()));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry(is("Content-Type"), containsString("application/xml")));
    }

    /**
     * Extracts the sample Word 97-2003 document ({@code test.doc}) and checks content,
     * content type, meta fields and raw metadata.
     */
    @Test
    public void testExtractFromDoc() throws IOException {
        Doc doc = extractFromFileExtension("doc");

        // Extracted content
        assertThat(doc.getContent(), containsString("This is a sample text available in page"));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("application/msword"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is("David Pilato"));
        assertThat(doc.getMeta().getDate(), is(LocalDateTime.of(2016, 7, 7, 8, 37, 0)));
        // The second expected keyword deliberately starts with a space
        assertThat(doc.getMeta().getKeywords(), containsInAnyOrder("keyword1"," keyword2"));
        assertThat(doc.getMeta().getTitle(), is("Test Tika title"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Company", "elastic"));
        assertThat(raw, hasEntry("subject", "Test Tika Object"));
        assertThat(raw, hasEntry("Word-Count", "19"));
        assertThat(raw, hasEntry("Manager", "My Mother"));
        assertThat(raw, hasEntry("Template", "Normal.dotm"));
        assertThat(raw, hasEntry("dc:title", "Test Tika title"));
        assertThat(raw, hasEntry("modified", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("cp:subject", "Test Tika Object"));
        assertThat(raw, hasEntry("custom:N° du document", "1234"));
        assertThat(raw, hasEntry("meta:author", "David Pilato"));
        assertThat(raw, hasEntry("extended-properties:Application", "Microsoft Macintosh Word"));
        assertThat(raw, hasEntry("meta:creation-date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Creation-Date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Last-Author", "David Pilato"));
        assertThat(raw, hasEntry("w:comments", "Comments"));
        assertThat(raw, hasEntry("Character Count", "68"));
        assertThat(raw, hasEntry("Page-Count", "2"));
        assertThat(raw, hasEntry("extended-properties:Template", "Normal.dotm"));
        assertThat(raw, hasEntry("Author", "David Pilato"));
        assertThat(raw, hasEntry("meta:page-count", "2"));
        assertThat(raw, hasEntry("cp:revision", "2"));
        assertThat(raw, hasEntry("Keywords", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:word-count", "19"));
        assertThat(raw, hasEntry("Category", "test"));
        assertThat(raw, hasEntry("dc:creator", "David Pilato"));
        assertThat(raw, hasEntry("extended-properties:Company", "elastic"));
        assertThat(raw, hasEntry("dcterms:created", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("dcterms:modified", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Last-Modified", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("title", "Test Tika title"));
        assertThat(raw, hasEntry("Last-Save-Date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("meta:character-count", "68"));
        assertThat(raw, hasEntry("custom:Terminé le", "2016-07-06T22:00:00Z"));
        assertThat(raw, hasEntry("meta:save-date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Application-Name", "Microsoft Macintosh Word"));
        assertThat(raw, hasEntry("Edit-Time", "600000000"));
        assertThat(raw, hasEntry("extended-properties:Manager", "My Mother"));
        assertThat(raw, hasEntry("Content-Type", "application/msword"));
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("creator", "David Pilato"));
        assertThat(raw, hasEntry("dc:subject", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:last-author", "David Pilato"));
        assertThat(raw, hasEntry("Comments", "Comments"));
        assertThat(raw, hasEntry("xmpTPg:NPages", "2"));
        assertThat(raw, hasEntry("Revision-Number", "2"));
        assertThat(raw, hasEntry("meta:keyword", "keyword1, keyword2"));
        assertThat(raw, hasEntry("comment", "Comments"));
        assertThat(raw, hasEntry("cp:category", "test"));
    }

    /**
     * Extracts the sample Office Open XML document ({@code test.docx}) and checks content,
     * content type, meta fields and raw metadata.
     */
    @Test
    public void testExtractFromDocx() throws IOException {
        Doc doc = extractFromFileExtension("docx");

        // Extracted content
        assertThat(doc.getContent(), containsString("This is a sample text available in page"));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is("David Pilato"));
        assertThat(doc.getMeta().getDate(), is(LocalDateTime.of(2016, 7, 7, 8, 36, 0)));
        // The second expected keyword deliberately starts with a space
        assertThat(doc.getMeta().getKeywords(), containsInAnyOrder("keyword1"," keyword2"));
        assertThat(doc.getMeta().getTitle(), is("Test Tika title"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("date", "2016-07-07T08:36:00Z"));
        assertThat(raw, hasEntry("Total-Time", "6"));
        assertThat(raw, hasEntry("extended-properties:AppVersion", "15.0000"));
        assertThat(raw, hasEntry("meta:paragraph-count", "2"));
        assertThat(raw, hasEntry("subject", "Test Tika Object"));
        assertThat(raw, hasEntry("Word-Count", "19"));
        assertThat(raw, hasEntry("meta:line-count", "3"));
        assertThat(raw, hasEntry("Manager", "My Mother"));
        assertThat(raw, hasEntry("Template", "Normal.dotm"));
        assertThat(raw, hasEntry("Paragraph-Count", "2"));
        assertThat(raw, hasEntry("meta:character-count-with-spaces", "82"));
        assertThat(raw, hasEntry("dc:title", "Test Tika title"));
        assertThat(raw, hasEntry("modified", "2016-07-07T08:36:00Z"));
        assertThat(raw, hasEntry("cp:subject", "Test Tika Object"));
        assertThat(raw, hasEntry("custom:N° du document", "1234"));
        assertThat(raw, hasEntry("meta:author", "David Pilato"));
        assertThat(raw, hasEntry("meta:creation-date", "2015-12-19T23:39:00Z"));
        assertThat(raw, hasEntry("extended-properties:Application", "Microsoft Macintosh Word"));
        assertThat(raw, hasEntry("Creation-Date", "2015-12-19T23:39:00Z"));
        assertThat(raw, hasEntry("Character-Count-With-Spaces", "82"));
        assertThat(raw, hasEntry("Last-Author", "David Pilato"));
        assertThat(raw, hasEntry("Character Count", "65"));
        assertThat(raw, hasEntry("Page-Count", "2"));
        assertThat(raw, hasEntry("Application-Version", "15.0000"));
        assertThat(raw, hasEntry("extended-properties:Template", "Normal.dotm"));
        assertThat(raw, hasEntry("Author", "David Pilato"));
        assertThat(raw, hasEntry("publisher", "elastic"));
        assertThat(raw, hasEntry("meta:page-count", "2"));
        assertThat(raw, hasEntry("cp:revision", "4"));
        assertThat(raw, hasEntry("dc:description", "Comments"));
        assertThat(raw, hasEntry("Keywords", "keyword1, keyword2"));
        assertThat(raw, hasEntry("Category", "test"));
        assertThat(raw, hasEntry("meta:word-count", "19"));
        assertThat(raw, hasEntry("dc:creator", "David Pilato"));
        assertThat(raw, hasEntry("extended-properties:Company", "elastic"));
        assertThat(raw, hasEntry("description", "Comments"));
        assertThat(raw, hasEntry("dcterms:created", "2015-12-19T23:39:00Z"));
        assertThat(raw, hasEntry("Last-Modified", "2016-07-07T08:36:00Z"));
        assertThat(raw, hasEntry("dcterms:modified", "2016-07-07T08:36:00Z"));
        assertThat(raw, hasEntry("title", "Test Tika title"));
        assertThat(raw, hasEntry("Last-Save-Date", "2016-07-07T08:36:00Z"));
        assertThat(raw, hasEntry("meta:character-count", "65"));
        assertThat(raw, hasEntry("custom:Terminé le", "2016-07-06T22:00:00Z"));
        assertThat(raw, hasEntry("Line-Count", "3"));
        assertThat(raw, hasEntry("meta:save-date", "2016-07-07T08:36:00Z"));
        assertThat(raw, hasEntry("Application-Name", "Microsoft Macintosh Word"));
        assertThat(raw, hasEntry("extended-properties:TotalTime", "6"));
        assertThat(raw, hasEntry("extended-properties:Manager", "My Mother"));
        assertThat(raw, hasEntry("Content-Type", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("creator", "David Pilato"));
        assertThat(raw, hasEntry("dc:subject", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:last-author", "David Pilato"));
        assertThat(raw, hasEntry("xmpTPg:NPages", "2"));
        assertThat(raw, hasEntry("Revision-Number", "4"));
        assertThat(raw, hasEntry("meta:keyword", "keyword1, keyword2"));
        assertThat(raw, hasEntry("cp:category", "test"));
        assertThat(raw, hasEntry("dc:publisher", "elastic"));
    }

    /**
     * Extracts the sample HTML page ({@code test.html}) and checks content, content type,
     * meta fields and raw metadata.
     */
    @Test
    public void testExtractFromHtml() throws IOException {
        Doc doc = extractFromFileExtension("html");

        // Extracted content
        assertThat(doc.getContent(), containsString("a sample text available in"));

        // Content Type
        assertThat(doc.getFile().getContentType(), containsString("text/html"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is(nullValue()));
        assertThat(doc.getMeta().getDate(), is(nullValue()));
        assertThat(doc.getMeta().getKeywords(), emptyIterable());
        assertThat(doc.getMeta().getTitle(), is("Test Tika title"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("Titre", "Test Tika title"));
        assertThat(raw, hasEntry("Originator", "Microsoft Word 15"));
        assertThat(raw, hasEntry("Mots clés", "keyword1, keyword2"));
        assertThat(raw, hasEntry("Content-Location", "Web%20page"));
        assertThat(raw, hasEntry("dc:title", "Test Tika title"));
        assertThat(raw, hasEntry("Content-Encoding", "UTF-8"));
        assertThat(raw, hasEntry("Content-Type-Hint", "text/html; charset=macintosh"));
        assertThat(raw, hasEntry("ProgId", "Word.Document"));
        assertThat(raw, hasEntry("title", "Test Tika title"));
        assertThat(raw, hasEntry(is("Content-Type"), containsString("text/html")));
        assertThat(raw, hasEntry("Generator", "Microsoft Word 15"));
    }

    /**
     * Test for #87: https://github.com/dadoonet/fscrawler/issues/87
     */
    @Test
    public void testExtractFromMp3() throws IOException {
        Doc doc = extractFromFileExtension("mp3");

        // Extracted content
        assertThat(doc.getContent(), containsString("Test Tika"));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("audio/mpeg"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is("David Pilato"));
        assertThat(doc.getMeta().getDate(), is(nullValue()));
        assertThat(doc.getMeta().getKeywords(), emptyIterable());
        assertThat(doc.getMeta().getTitle(), is("Test Tika"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("xmpDM:genre", "Vocal"));
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("creator", "David Pilato"));
        assertThat(raw, hasEntry("xmpDM:album", "FS Crawler"));
        assertThat(raw, hasEntry("xmpDM:trackNumber", "1"));
        assertThat(raw, hasEntry("xmpDM:releaseDate", "2016"));
        assertThat(raw, hasEntry("meta:author", "David Pilato"));
        assertThat(raw, hasEntry("xmpDM:artist", "David Pilato"));
        assertThat(raw, hasEntry("dc:creator", "David Pilato"));
        assertThat(raw, hasEntry("xmpDM:audioCompressor", "MP3"));
        assertThat(raw, hasEntry("title", "Test Tika"));
        assertThat(raw, hasEntry("xmpDM:audioChannelType", "Stereo"));
        assertThat(raw, hasEntry("version", "MPEG 3 Layer III Version 1"));
        assertThat(raw, hasEntry(is("xmpDM:logComment"), containsString("Hello but reverted")));
        assertThat(raw, hasEntry("xmpDM:audioSampleRate", "44100"));
        assertThat(raw, hasEntry("channels", "2"));
        assertThat(raw, hasEntry("dc:title", "Test Tika"));
        assertThat(raw, hasEntry("Author", "David Pilato"));
        assertThat(raw, hasEntry("xmpDM:duration", "1018.775146484375"));
        assertThat(raw, hasEntry("Content-Type", "audio/mpeg"));
        assertThat(raw, hasEntry("samplerate", "44100"));
    }

    /**
     * Extracts the sample OpenDocument text file ({@code test.odt}) and checks content,
     * content type, meta fields and raw metadata.
     */
    @Test
    public void testExtractFromOdt() throws IOException {
        Doc doc = extractFromFileExtension("odt");

        // Extracted content
        assertThat(doc.getContent(), containsString("This is a sample text available in page"));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("application/vnd.oasis.opendocument.text"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is("David Pilato"));
        assertThat(doc.getMeta().getDate(), is(LocalDateTime.of(2016, 7, 7, 8, 37, 0)));
        // The second expected keyword deliberately starts with a space
        assertThat(doc.getMeta().getKeywords(), containsInAnyOrder("keyword1", " keyword2"));
        assertThat(doc.getMeta().getTitle(), is("Test Tika title"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("dc:description", "Comments"));
        assertThat(raw, hasEntry("Keywords", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:paragraph-count", "1"));
        assertThat(raw, hasEntry("meta:word-count", "12"));
        assertThat(raw, hasEntry("subject", "Test Tika Object"));
        assertThat(raw, hasEntry("meta:initial-author", "David Pilato"));
        assertThat(raw, hasEntry("initial-creator", "David Pilato"));
        assertThat(raw, hasEntry("dc:creator", "David Pilato"));
        assertThat(raw, hasEntry("generator", "MicrosoftOffice/15.0 MicrosoftWord"));
        assertThat(raw, hasEntry("description", "Comments"));
        assertThat(raw, hasEntry("Word-Count", "12"));
        assertThat(raw, hasEntry("dcterms:created", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("dcterms:modified", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Last-Modified", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("nbPara", "1"));
        assertThat(raw, hasEntry("title", "Test Tika title"));
        assertThat(raw, hasEntry("Last-Save-Date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("meta:character-count", "86"));
        assertThat(raw, hasEntry("custom:Terminé le", "2016-07-06T22:00:00Z"));
        assertThat(raw, hasEntry("Paragraph-Count", "1"));
        assertThat(raw, hasEntry("meta:save-date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("dc:title", "Test Tika title"));
        assertThat(raw, hasEntry("modified", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Edit-Time", "PT0S"));
        assertThat(raw, hasEntry("cp:subject", "Test Tika Object"));
        assertThat(raw, hasEntry("nbCharacter", "86"));
        assertThat(raw, hasEntry("nbPage", "1"));
        assertThat(raw, hasEntry("nbWord", "12"));
        assertThat(raw, hasEntry("Content-Type", "application/vnd.oasis.opendocument.text"));
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("creator", "David Pilato"));
        assertThat(raw, hasEntry("dc:subject", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:author", "David Pilato"));
        assertThat(raw, hasEntry("meta:creation-date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("Creation-Date", "2016-07-07T08:37:00Z"));
        assertThat(raw, hasEntry("xmpTPg:NPages", "1"));
        assertThat(raw, hasEntry("Character Count", "86"));
        assertThat(raw, hasEntry("editing-cycles", "2"));
        assertThat(raw, hasEntry("Page-Count", "1"));
        assertThat(raw, hasEntry("Author", "David Pilato"));
        assertThat(raw, hasEntry("meta:page-count", "1"));
    }

    /**
     * Extracts the sample PDF ({@code test.pdf}) and checks content, content type,
     * meta fields and raw metadata.
     */
    @Test
    public void testExtractFromPdf() throws IOException {
        Doc doc = extractFromFileExtension("pdf");

        // Extracted content
        assertThat(doc.getContent(), containsString("This is a sample text available in page"));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("application/pdf"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is("David Pilato"));
        assertThat(doc.getMeta().getDate(), is(LocalDateTime.of(2016, 7, 7, 8, 37, 42)));
        // The second expected keyword deliberately starts with a space
        assertThat(doc.getMeta().getKeywords(), containsInAnyOrder("keyword1", " keyword2"));
        assertThat(doc.getMeta().getTitle(), is("Test Tika title"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("date", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("pdf:PDFVersion", "1.5"));
        assertThat(raw, hasEntry("xmp:CreatorTool", "Microsoft Word"));
        assertThat(raw, hasEntry("Keywords", "keyword1, keyword2"));
        assertThat(raw, hasEntry("access_permission:modify_annotations", "true"));
        assertThat(raw, hasEntry("access_permission:can_print_degraded", "true"));
        assertThat(raw, hasEntry("subject", "Test Tika Object"));
        assertThat(raw, hasEntry("dc:creator", "David Pilato"));
        assertThat(raw, hasEntry("dcterms:created", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("Last-Modified", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("dcterms:modified", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("dc:format", "application/pdf; version=1.5"));
        assertThat(raw, hasEntry("title", "Test Tika title"));
        assertThat(raw, hasEntry("Last-Save-Date", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("access_permission:fill_in_form", "true"));
        assertThat(raw, hasEntry("meta:save-date", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("pdf:encrypted", "false"));
        assertThat(raw, hasEntry("dc:title", "Test Tika title"));
        assertThat(raw, hasEntry("modified", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("cp:subject", "Test Tika Object"));
        assertThat(raw, hasEntry("Content-Type", "application/pdf"));
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("creator", "David Pilato"));
        assertThat(raw, hasEntry("meta:author", "David Pilato"));
        assertThat(raw, hasEntry("dc:subject", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:creation-date", "2016-07-07T08:37:42Z"));
        // Only a prefix is checked for this entry
        assertThat(raw, hasEntry(is("created"), containsString("Jul 0")));
        assertThat(raw, hasEntry("access_permission:extract_for_accessibility", "true"));
        assertThat(raw, hasEntry("access_permission:assemble_document", "true"));
        assertThat(raw, hasEntry("xmpTPg:NPages", "2"));
        assertThat(raw, hasEntry("Creation-Date", "2016-07-07T08:37:42Z"));
        assertThat(raw, hasEntry("access_permission:extract_content", "true"));
        assertThat(raw, hasEntry("access_permission:can_print", "true"));
        assertThat(raw, hasEntry("meta:keyword", "keyword1, keyword2"));
        assertThat(raw, hasEntry("Author", "David Pilato"));
        assertThat(raw, hasEntry("access_permission:can_modify", "true"));
    }

    /**
     * Extracts the sample RTF document ({@code test.rtf}) and checks content, content type,
     * meta fields and raw metadata.
     */
    @Test
    public void testExtractFromRtf() throws IOException {
        Doc doc = extractFromFileExtension("rtf");

        // Extracted content
        assertThat(doc.getContent(), containsString("This is a sample text available in page"));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("application/rtf"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is("David Pilato"));
        assertThat(doc.getMeta().getDate(), is(nullValue()));
        // The second expected keyword deliberately starts with a space
        assertThat(doc.getMeta().getKeywords(), containsInAnyOrder("keyword1", " keyword2"));
        assertThat(doc.getMeta().getTitle(), is("Test Tika title"));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("creator", "David Pilato"));
        assertThat(raw, hasEntry("Keywords", "keyword1, keyword2"));
        assertThat(raw, hasEntry("meta:word-count", "19"));
        assertThat(raw, hasEntry("meta:author", "David Pilato"));
        assertThat(raw, hasEntry("dc:subject", "keyword1, keyword2"));
        // Creation dates are only matched on a date prefix
        assertThat(raw, hasEntry(is("meta:creation-date"), containsString("2016-07-0")));
        assertThat(raw, hasEntry("subject", "Test Tika Object"));
        assertThat(raw, hasEntry("dc:creator", "David Pilato"));
        assertThat(raw, hasEntry("extended-properties:Company", "elastic"));
        assertThat(raw, hasEntry(is("Creation-Date"), containsString("2016-07-")));
        assertThat(raw, hasEntry(is("dcterms:created"), containsString("2016-07-")));
        assertThat(raw, hasEntry("title", "Test Tika title"));
        assertThat(raw, hasEntry("meta:character-count", "68"));
        assertThat(raw, hasEntry("dc:title", "Test Tika title"));
        assertThat(raw, hasEntry("Author", "David Pilato"));
        assertThat(raw, hasEntry("extended-properties:Manager", "My Mother"));
        assertThat(raw, hasEntry("cp:subject", "Test Tika Object"));
        assertThat(raw, hasEntry("meta:page-count", "2"));
        assertThat(raw, hasEntry("cp:category", "test"));
        assertThat(raw, hasEntry("Content-Type", "application/rtf"));
    }

    /**
     * Extracts the sample plain text file ({@code test.txt}) with default settings and
     * checks that neither an attachment nor a checksum is produced.
     */
    @Test
    public void testExtractFromTxt() throws IOException {
        Doc doc = extractFromFileExtension("txt");

        // Extracted content
        assertThat(doc.getContent(), containsString("This file contains some words."));

        // Content Type
        assertThat(doc.getFile().getContentType(), containsString("text/plain"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is(nullValue()));
        assertThat(doc.getMeta().getDate(), is(nullValue()));
        assertThat(doc.getMeta().getKeywords(), emptyIterable());
        assertThat(doc.getMeta().getTitle(), is(nullValue()));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry(is("Content-Encoding"), notNullValue()));
        assertThat(raw, hasEntry(is("Content-Type"), containsString("text/plain")));

        assertThat(doc.getAttachment(), nullValue());
        assertThat(doc.getFile().getChecksum(), nullValue());
    }

    /**
     * Extracts the sample WAV file ({@code test.wav}); the extracted content is expected
     * to be empty while the audio properties show up in the raw metadata.
     */
    @Test
    public void testExtractFromWav() throws IOException {
        Doc doc = extractFromFileExtension("wav");

        // Extracted content
        assertThat(doc.getContent(), is(""));

        // Content Type
        assertThat(doc.getFile().getContentType(), is("audio/x-wav"));

        // Meta data
        assertThat(doc.getMeta().getAuthor(), is(nullValue()));
        assertThat(doc.getMeta().getDate(), is(nullValue()));
        assertThat(doc.getMeta().getKeywords(), emptyIterable());
        assertThat(doc.getMeta().getTitle(), is(nullValue()));

        Map<String, String> raw = doc.getMeta().getRaw();
        assertThat(raw, hasEntry("X-Parsed-By", "org.apache.tika.parser.DefaultParser"));
        assertThat(raw, hasEntry("xmpDM:audioSampleRate", "44100"));
        assertThat(raw, hasEntry("channels", "2"));
        assertThat(raw, hasEntry("bits", "16"));
        assertThat(raw, hasEntry("encoding", "PCM_SIGNED"));
        assertThat(raw, hasEntry("xmpDM:audioSampleType", "16Int"));
        assertThat(raw, hasEntry("Content-Type", "audio/x-wav"));
        assertThat(raw, hasEntry("samplerate", "44100.0"));
    }

    /**
     * With {@code storeSource} enabled, the generated document must carry an attachment.
     */
    @Test
    public void testExtractFromTxtAndStoreSource() throws IOException {
        Doc doc = extractFromFile("test.txt",
                FsSettings.builder(getCurrentTestName())
                        .setFs(Fs.builder().setStoreSource(true).build())
                        .build());

        // Extracted content
        assertThat(doc.getContent(), containsString("This file contains some words."));
        assertThat(doc.getAttachment(), notNullValue());
    }

    /**
     * With {@code storeSource} enabled and an MD5 checksum configured, the generated
     * document must carry both an attachment and a checksum.
     */
    @Test
    public void testExtractFromTxtAndStoreSourceWithDigest() throws IOException {
        assumeDigestAlgorithmAvailable("MD5");

        Doc doc = extractFromFile("test.txt",
                FsSettings.builder(getCurrentTestName())
                        .setFs(Fs.builder().setStoreSource(true).setChecksum("MD5").build())
                        .build());

        // Extracted content
        assertThat(doc.getContent(), containsString("This file contains some words."));
        assertThat(doc.getAttachment(), notNullValue());
        assertThat(doc.getFile().getChecksum(), notNullValue());
    }

    /**
     * With only an MD5 checksum configured, the generated document must carry a checksum
     * but no attachment.
     */
    @Test
    public void testExtractFromTxtWithDigest() throws IOException {
        assumeDigestAlgorithmAvailable("MD5");

        Doc doc = extractFromFile("test.txt",
                FsSettings.builder(getCurrentTestName())
                        .setFs(Fs.builder().setChecksum("MD5").build())
                        .build());

        // Extracted content
        assertThat(doc.getContent(), containsString("This file contains some words."));
        assertThat(doc.getAttachment(), nullValue());
        assertThat(doc.getFile().getChecksum(), notNullValue());
    }

    /**
     * Skips the calling test, via a JUnit assumption, when the JVM does not provide the
     * requested digest algorithm.
     *
     * @param algorithm digest algorithm name passed to {@link MessageDigest#getInstance(String)}
     */
    private static void assumeDigestAlgorithmAvailable(String algorithm) {
        try {
            MessageDigest.getInstance(algorithm);
        } catch (NoSuchAlgorithmException e) {
            assumeNoException(e);
        }
    }

    /**
     * Extracts the sample file named {@code test.<extension>} using default settings.
     *
     * @param extension file extension without the leading dot (for example {@code "pdf"})
     * @return the generated document
     * @throws IOException if the sample cannot be read or parsed
     */
    private Doc extractFromFileExtension(String extension) throws IOException {
        logger.info("Test extraction of [{}] file", extension);
        return extractFromFile("test." + extension);
    }

    /**
     * Extracts the given sample file using default settings named after the current test.
     *
     * @param filename name of the sample file
     * @return the generated document
     * @throws IOException if the sample cannot be read or parsed
     */
    private Doc extractFromFile(String filename) throws IOException {
        return extractFromFile(filename, FsSettings.builder(getCurrentTestName()).build());
    }

    /**
     * Runs {@link TikaDocParser#generate} on the given sample file and returns the
     * populated document.
     * <p>
     * A {@link MessageDigest} is only created when the settings define a checksum
     * algorithm; an unknown algorithm fails the test.
     *
     * @param filename   name of the sample file, resolved through {@code getBinaryContent}
     * @param fsSettings settings handed to the parser
     * @return the generated document
     * @throws IOException if the sample cannot be read, parsed or closed
     */
    private Doc extractFromFile(String filename, FsSettings fsSettings) throws IOException {
        // Resolve the digest before opening the stream so a failure here cannot leak it
        MessageDigest messageDigest = null;
        if (fsSettings.getFs() != null && fsSettings.getFs().getChecksum() != null) {
            try {
                messageDigest = MessageDigest.getInstance(fsSettings.getFs().getChecksum());
            } catch (NoSuchAlgorithmException e) {
                fail("Algorithm [" + fsSettings.getFs().getChecksum() + "] not found: " + e.getMessage());
            }
        }

        Doc doc = new Doc();

        // try-with-resources releases the sample stream even when parsing throws;
        // it is also null-safe should getBinaryContent return no stream.
        try (InputStream data = getBinaryContent(filename)) {
            // NOTE(review): the trailing 0 is presumably the file size - confirm against TikaDocParser
            TikaDocParser.generate(
                    fsSettings,
                    data,
                    filename,
                    doc,
                    messageDigest,
                    0);
        }

        logger.debug("Generated Content: [{}]", doc.getContent());
        logger.debug("Generated Raw Metadata: [{}]", doc.getMeta().getRaw());
        return doc;
    }
}