/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.BufferingRequestProcessor;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Tests for {@link ExtractingRequestHandler}: loads a variety of document types through the
 * /update/extract handler and verifies field mapping, capture, literals, password handling,
 * and extract-only behavior.
 */
public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {

  @BeforeClass
  public static void beforeClass() throws Exception {
    assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
        new Locale("tr").getLanguage().equals(Locale.getDefault().getLanguage()));
    initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr").getAbsolutePath());
  }

  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    clearIndex();
    assertU(commit());
  }

  @Test
  public void testExtraction() throws Exception {
    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);

    loadLocal("extraction/solr-word.pdf",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Creation-Date", "extractedDate",
        "uprefix", "ignored_",
        "fmap.Author", "extractedAuthor",
        "fmap.content", "extractedContent",
        "literal.id", "one",
        "fmap.Last-Modified", "extractedDate"
    );
    assertQ(req("title:solr-word"), "//*[@numFound='0']");
    assertU(commit());
    assertQ(req("title:solr-word"), "//*[@numFound='1']");

    loadLocal("extraction/simple.html",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Author", "extractedAuthor",
        "fmap.language", "extractedLanguage",
        "literal.id", "two",
        "uprefix", "ignored_",
        "fmap.content", "extractedContent",
        "fmap.Last-Modified", "extractedDate"
    );
    assertQ(req("title:Welcome"), "//*[@numFound='0']");
    assertU(commit());
    assertQ(req("title:Welcome"), "//*[@numFound='1']");

    assertQ(req("extractedContent:distinctwords"), "//*[@numFound='0']");
    assertQ(req("extractedContent:distinct"), "//*[@numFound='1']");
    assertQ(req("extractedContent:words"), "//*[@numFound='2']");
    assertQ(req("extractedContent:\"distinct words\""), "//*[@numFound='1']");
loadLocal("extraction/simple.html", "literal.id","simple2", "uprefix", "t_", "lowernames", "true", "captureAttr", "true", "fmap.a","t_href", "fmap.content_type", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping "commit", "true" // test immediate commit ); // test that purposely causes a failure to print out the doc for test debugging // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']"); // test both lowernames and unknown field mapping //assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']"); assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']"); assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']"); assertQ(req("+id:simple2 +t_content:serif"), "//*[@numFound='0']"); // make sure <style> content is excluded assertQ(req("+id:simple2 +t_content:blur"), "//*[@numFound='0']"); // make sure <script> content is excluded // make sure the fact there is an index-time boost does not fail the parsing loadLocal("extraction/simple.html", "literal.id","simple3", "uprefix", "t_", "lowernames", "true", "captureAttr", "true", "fmap.a","t_href", "commit", "true" ,"boost.t_href", "100.0" ); assertQ(req("t_href:http"), "//*[@numFound='2']"); assertQ(req("t_href:http"), "//doc[2]/str[.='simple3']"); assertQ(req("+id:simple3 +t_content_type:[* TO *]"), "//*[@numFound='1']");//test lowercase and then uprefix loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "literal.id", "three", "uprefix", "ignored_", "fmap.content", "extractedContent", "fmap.language", "extractedLanguage", "fmap.Last-Modified", "extractedDate" ); assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']"); assertU(commit()); assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']"); loadLocal("extraction/word2003.doc", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "literal.id", "four", "uprefix", "ignored_", "fmap.content", "extractedContent", "fmap.language", "extractedLanguage", "fmap.Last-Modified", "extractedDate" ); assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='0']"); // There is already a PDF file with this content: assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='1']"); assertU(commit()); assertQ(req("title:\"Word 2003 Title\""), "//*[@numFound='1']"); // now 2 of them: assertQ(req("extractedContent:\"This is a test of PDF and Word extraction in Solr, it is only a test\""), "//*[@numFound='2']"); // compressed file loadLocal("extraction/tiny.txt.gz", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "uprefix", "ignored_", "fmap.content", "extractedContent", "fmap.language", "extractedLanguage", "fmap.Last-Modified", "extractedDate", "literal.id", "tiny.txt.gz"); assertU(commit()); assertQ(req("id:tiny.txt.gz") , "//*[@numFound='1']" , "//*/arr[@name='stream_name']/str[.='tiny.txt.gz']" ); // compressed file loadLocal("extraction/open-document.odt", "uprefix", "ignored_", "fmap.content", "extractedContent", "literal.id", "open-document"); assertU(commit()); 
assertQ(req("extractedContent:\"Práctica sobre GnuPG\"") , "//*[@numFound='1']" , "//*/arr[@name='stream_name']/str[.='open-document.odt']" ); } @Test public void testCapture() throws Exception { loadLocal("extraction/simple.html", "literal.id","capture1", "uprefix","t_", "capture","div", "fmap.div", "foo_t", "commit", "true" ); assertQ(req("+id:capture1 +t_content:Solr"), "//*[@numFound='1']"); assertQ(req("+id:capture1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']"); loadLocal("extraction/simple.html", "literal.id", "capture2", "captureAttr", "true", "defaultField", "text", "fmap.div", "div_t", "fmap.a", "anchor_t", "capture", "div", "capture", "a", "commit", "true" ); assertQ(req("+id:capture2 +text:Solr"), "//*[@numFound='1']"); assertQ(req("+id:capture2 +div_t:\"here is some text in a div\""), "//*[@numFound='1']"); assertQ(req("+id:capture2 +anchor_t:http\\://www.apache.org"), "//*[@numFound='1']"); assertQ(req("+id:capture2 +anchor_t:link"), "//*[@numFound='1']"); } @Test public void testDefaultField() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); try { ignoreException("unknown field 'a'"); ignoreException("unknown field 'meta'"); // TODO: should this exception be happening? loadLocal("extraction/simple.html", "literal.id","simple2", "lowernames", "true", "captureAttr", "true", //"fmap.content_type", "abcxyz", "commit", "true" // test immediate commit ); fail("Should throw SolrException"); } catch (SolrException e) { //do nothing } finally { resetExceptionIgnores(); } loadLocal("extraction/simple.html", "literal.id","simple2", ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified "lowernames", "true", "captureAttr", "true", //"fmap.content_type", "abcxyz", "commit", "true" // test immediate commit ); assertQ(req("id:simple2"), "//*[@numFound='1']"); assertQ(req("defaultExtr:http\\:\\/\\/www.apache.org"), "//*[@numFound='1']"); //Test when both uprefix and default are specified. 
loadLocal("extraction/simple.html", "literal.id","simple2", ExtractingParams.DEFAULT_FIELD, "defaultExtr",//test that unmapped fields go to the text field when no uprefix is specified ExtractingParams.UNKNOWN_FIELD_PREFIX, "t_", "lowernames", "true", "captureAttr", "true", "fmap.a","t_href", //"fmap.content_type", "abcxyz", "commit", "true" // test immediate commit ); assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']"); } @Test public void testLiterals() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); //test literal loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "fmap.content", "extractedContent", "literal.id", "one", "uprefix", "ignored_", "fmap.language", "extractedLanguage", "literal.extractionLiteralMV", "one", "literal.extractionLiteralMV", "two", "fmap.Last-Modified", "extractedDate" ); assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']"); assertU(commit()); assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']"); assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']"); assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']"); try { loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "fmap.content", "extractedContent", "literal.id", "two", "fmap.language", "extractedLanguage", "literal.extractionLiteral", "one", "literal.extractionLiteral", "two", "fmap.X-Parsed-By", "ignored_parser", "fmap.Last-Modified", "extractedDate" ); // TODO: original author did not specify why an exception should be thrown... how to fix? 
// assertTrue("Exception should have been thrown", false); } catch (SolrException e) { //nothing to see here, move along } loadLocal("extraction/version_control.xml", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "fmap.content", "extractedContent", "literal.id", "three", "fmap.language", "extractedLanguage", "literal.extractionLiteral", "one", "fmap.X-Parsed-By", "ignored_parser", "fmap.Last-Modified", "extractedDate" ); assertU(commit()); assertQ(req("extractionLiteral:one"), "//*[@numFound='1']"); } public void testLiteralDefaults() throws Exception { // sanity check config loadLocalFromHandler("/update/extract/lit-def", "extraction/simple.html", "literal.id", "lit-def-simple"); assertU(commit()); assertQ(req("q", "id:lit-def-simple") , "//*[@numFound='1']" , "count(//arr[@name='foo_s']/str)=1" , "//arr[@name='foo_s']/str[.='x']" , "count(//arr[@name='bar_s']/str)=1" , "//arr[@name='bar_s']/str[.='y']" , "count(//arr[@name='zot_s']/str)=1" , "//arr[@name='zot_s']/str[.='z']" ); // override the default foo_s loadLocalFromHandler("/update/extract/lit-def", "extraction/simple.html", "literal.foo_s", "1111", "literal.id", "lit-def-simple"); assertU(commit()); assertQ(req("q", "id:lit-def-simple") , "//*[@numFound='1']" , "count(//arr[@name='foo_s']/str)=1" , "//arr[@name='foo_s']/str[.='1111']" , "count(//arr[@name='bar_s']/str)=1" , "//arr[@name='bar_s']/str[.='y']" , "count(//arr[@name='zot_s']/str)=1" , "//arr[@name='zot_s']/str[.='z']" ); // pre-pend the bar_s loadLocalFromHandler("/update/extract/lit-def", "extraction/simple.html", "literal.bar_s", "2222", "literal.id", "lit-def-simple"); assertU(commit()); assertQ(req("q", "id:lit-def-simple") , "//*[@numFound='1']" , "count(//arr[@name='foo_s']/str)=1" , "//arr[@name='foo_s']/str[.='x']" , "count(//arr[@name='bar_s']/str)=2" , "//arr[@name='bar_s']/str[.='2222']" , "//arr[@name='bar_s']/str[.='y']" , "count(//arr[@name='zot_s']/str)=1" , "//arr[@name='zot_s']/str[.='z']" ); // invariant zot_s can not be changed loadLocalFromHandler("/update/extract/lit-def", "extraction/simple.html", "literal.zot_s", "3333", "literal.id", "lit-def-simple"); assertU(commit()); assertQ(req("q", "id:lit-def-simple") , "//*[@numFound='1']" , "count(//arr[@name='foo_s']/str)=1" , "//arr[@name='foo_s']/str[.='x']" , "count(//arr[@name='bar_s']/str)=1" , "//arr[@name='bar_s']/str[.='y']" , "count(//arr[@name='zot_s']/str)=1" , "//arr[@name='zot_s']/str[.='z']" ); } @Test public void testPlainTextSpecifyingMimeType() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); // Load plain text specifying MIME type: loadLocal("extraction/version_control.txt", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Author", "extractedAuthor", "literal.id", "one", "fmap.language", "extractedLanguage", "fmap.X-Parsed-By", "ignored_parser", "fmap.content", "extractedContent", ExtractingParams.STREAM_TYPE, "text/plain" ); assertQ(req("extractedContent:Apache"), "//*[@numFound='0']"); assertU(commit()); assertQ(req("extractedContent:Apache"), "//*[@numFound='1']"); } @Test public void testPlainTextSpecifyingResourceName() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) 
  @Test
  public void testPlainTextSpecifyingResourceName() throws Exception {
    ExtractingRequestHandler handler = (ExtractingRequestHandler)
        h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);

    // Load plain text specifying filename
    loadLocal("extraction/version_control.txt",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Author", "extractedAuthor",
        "literal.id", "one",
        "fmap.language", "extractedLanguage",
        "fmap.X-Parsed-By", "ignored_parser",
        "fmap.content", "extractedContent",
        ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt"
    );
    assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
    assertU(commit());
    assertQ(req("extractedContent:Apache"), "//*[@numFound='1']");
  }

  @Test
  public void testCommitWithin() throws Exception {
    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);

    SolrQueryRequest req = req("literal.id", "one",
        ExtractingParams.RESOURCE_NAME, "extraction/version_control.txt",
        "commitWithin", "200"
    );
    SolrQueryResponse rsp = new SolrQueryResponse();
    BufferingRequestProcessor p = new BufferingRequestProcessor(null);

    ExtractingDocumentLoader loader = (ExtractingDocumentLoader) handler.newLoader(req, p);
    loader.load(req, rsp, new ContentStreamBase.FileStream(getFile("extraction/version_control.txt")), p);

    AddUpdateCommand add = p.addCommands.get(0);
    assertEquals(200, add.commitWithin);

    req.close();
  }

  // Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail.
  // This is because Tika's automatic MIME type detection will fail, and it will default to using an
  // empty-string-returning default parser.

  @Test
  public void testExtractOnly() throws Exception {
    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);

    SolrQueryResponse rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
    assertTrue("rsp is null and it shouldn't be", rsp != null);
    NamedList list = rsp.getValues();

    String extraction = (String) list.get("solr-word.pdf");
    assertTrue("extraction is null and it shouldn't be", extraction != null);
    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);

    NamedList nl = (NamedList) list.get("solr-word.pdf_metadata");
    assertTrue("nl is null and it shouldn't be", nl != null);
    Object title = nl.get("title");
    assertTrue("title is null and it shouldn't be", title != null);
    assertTrue(extraction.indexOf("<?xml") != -1);

    rsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
        ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
    assertTrue("rsp is null and it shouldn't be", rsp != null);
    list = rsp.getValues();
    extraction = (String) list.get("solr-word.pdf");
    assertTrue("extraction is null and it shouldn't be", extraction != null);
    assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
    assertTrue(extraction.indexOf("<?xml") == -1);

    nl = (NamedList) list.get("solr-word.pdf_metadata");
    assertTrue("nl is null and it shouldn't be", nl != null);
    title = nl.get("title");
    assertTrue("title is null and it shouldn't be", title != null);
  }
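
  // For reference, an equivalent extract-only request over HTTP looks roughly like the sketch
  // below (the host, port, and core name are assumptions, not something this test configures):
  //
  //   curl "http://localhost:8983/solr/mycore/update/extract?extractOnly=true&wt=xml" \
  //        -F "myfile=@solr-word.pdf"
  //
  // The response carries the extracted XHTML keyed by stream name plus a "<name>_metadata"
  // NamedList, which is what testExtractOnly() above inspects.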
assertTrue("handler is null and it shouldn't be", handler != null); SolrQueryResponse rsp = loadLocal("extraction/example.html", ExtractingParams.XPATH_EXPRESSION, "/xhtml:html/xhtml:body/xhtml:a/descendant::node()", ExtractingParams.EXTRACT_ONLY, "true" ); assertTrue("rsp is null and it shouldn't be", rsp != null); NamedList list = rsp.getValues(); String val = (String) list.get("example.html"); assertEquals("News", val.trim()); //there is only one matching <a> tag loadLocal("extraction/example.html", "literal.id", "example1", "captureAttr", "true", "defaultField", "text", "capture", "div", "fmap.div", "foo_t", "boost.foo_t", "3", "xpath", "/xhtml:html/xhtml:body/xhtml:div//node()", "commit", "true" ); assertQ(req("+id:example1 +foo_t:\"here is some text in a div\""), "//*[@numFound='1']"); } /** test arabic PDF extraction is functional */ @Test public void testArabicPDF() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); loadLocal("extraction/arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Creation-Date", "extractedDate", "fmap.Author", "extractedAuthor", "uprefix", "ignored_", "fmap.content", "wdf_nocase", "literal.id", "one", "fmap.Last-Modified", "extractedDate"); assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]"); assertU(commit()); assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]"); } @Test public void testTikaExceptionHandling() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); try{ loadLocal("extraction/password-is-solrcell.docx", "literal.id", "one"); fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password."); } catch(Exception expected){} assertU(commit()); assertQ(req("*:*"), "//result[@numFound=0]"); try{ loadLocal("extraction/password-is-solrcell.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Creation-Date", "extractedDate", "uprefix", "ignored_", "fmap.Author", "extractedAuthor", "fmap.content", "wdf_nocase", "literal.id", "one", "ignoreTikaException", "true", // set ignore flag "fmap.Last-Modified", "extractedDate"); } catch(Exception e){ fail("TikaException should be ignored."); } assertU(commit()); assertQ(req("*:*"), "//result[@numFound=1]"); } @Test public void testWrongStreamType() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); try{ // Load plain text specifying another mime type, should fail loadLocal("extraction/version_control.txt", "literal.id", "one", ExtractingParams.STREAM_TYPE, "application/pdf" ); fail("SolrException is expected because wrong parser specified for the file type"); } catch(Exception expected){} try{ // Load plain text specifying non existing mimetype, should fail loadLocal("extraction/version_control.txt", "literal.id", "one", ExtractingParams.STREAM_TYPE, "foo/bar" ); fail("SolrException is expected because nonexsisting parser specified"); } catch(Exception expected){} } public void 
  public void testLiteralsOverride() throws Exception {
    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
    assertTrue("handler is null and it shouldn't be", handler != null);

    assertQ(req("*:*"), "//*[@numFound='0']");

    // Here Tika should parse out a title for this document:
    loadLocal("extraction/solr-word.pdf",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Author", "extractedAuthor",
        "literal.id", "three",
        "fmap.content", "extractedContent",
        "fmap.language", "extractedLanguage",
        "fmap.Creation-Date", "extractedDate",
        "uprefix", "ignored_",
        "fmap.Last-Modified", "extractedDate");

    // Here the literal value should override the Tika-parsed title:
    loadLocal("extraction/solr-word.pdf",
        "literal.title", "wolf-man",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Author", "extractedAuthor",
        "literal.id", "four",
        "fmap.content", "extractedContent",
        "fmap.language", "extractedLanguage",
        "fmap.Creation-Date", "extractedDate",
        "uprefix", "ignored_",
        "fmap.Last-Modified", "extractedDate");

    // Here we mimic the old behaviour where literals are added, not overridden
    loadLocal("extraction/solr-word.pdf",
        "literalsOverride", "false",
        // Trick - we first map the metadata-title to an ignored field before we replace with the literal title
        "fmap.title", "ignored_a",
        "literal.title", "old-behaviour",
        "literal.extractedKeywords", "literalkeyword",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Author", "extractedAuthor",
        "literal.id", "five",
        "fmap.content", "extractedContent",
        "fmap.language", "extractedLanguage",
        "fmap.Creation-Date", "extractedDate",
        "uprefix", "ignored_",
        "fmap.Last-Modified", "extractedDate");

    assertU(commit());
    assertQ(req("title:solr-word"), "//*[@numFound='1']");
    assertQ(req("title:wolf-man"), "//*[@numFound='1']");
    assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
  }

  @Test
  public void testPdfWithImages() throws Exception {
    // Tests the ability to configure the ParseContext (for example, to extract embedded images from a PDF)
    loadLocal("extraction/pdf-with-image.pdf",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Creation-Date", "extractedDate",
        "uprefix", "ignored_",
        "fmap.Author", "extractedAuthor",
        "fmap.content", "wdf_nocase",
        "literal.id", "pdfWithImage",
        "resource.name", "pdf-with-image.pdf",
        "resource.password", "solrRules",
        "fmap.Last-Modified", "extractedDate");

    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
    assertU(commit());
    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
  }

  @Test
  public void testPasswordProtected() throws Exception {
    // PDF, passwords from resource.password
    loadLocal("extraction/encrypted-password-is-solrRules.pdf",
        "fmap.created", "extractedDate",
        "fmap.producer", "extractedProducer",
        "fmap.creator", "extractedCreator",
        "fmap.Keywords", "extractedKeywords",
        "fmap.Creation-Date", "extractedDate",
        "uprefix", "ignored_",
        "fmap.Author", "extractedAuthor",
        "fmap.content", "wdf_nocase",
        "literal.id", "pdfpwliteral",
        "resource.name", "encrypted-password-is-solrRules.pdf",
        "resource.password", "solrRules",
"fmap.Last-Modified", "extractedDate"); // PDF, Passwords from passwords property file loadLocal("extraction/encrypted-password-is-solrRules.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Creation-Date", "extractedDate", "uprefix", "ignored_", "fmap.Author", "extractedAuthor", "fmap.content", "wdf_nocase", "literal.id", "pdfpwfile", "resource.name", "encrypted-password-is-solrRules.pdf", "passwordsFile", "passwordRegex.properties", // Passwords-file "fmap.Last-Modified", "extractedDate"); // DOCX, Explicit password loadLocal("extraction/password-is-Word2010.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Creation-Date", "extractedDate", "fmap.Author", "extractedAuthor", "fmap.content", "wdf_nocase", "uprefix", "ignored_", "literal.id", "docxpwliteral", "resource.name", "password-is-Word2010.docx", "resource.password", "Word2010", // Explicit password "fmap.Last-Modified", "extractedDate"); // DOCX, Passwords from file loadLocal("extraction/password-is-Word2010.docx", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer", "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords", "fmap.Creation-Date", "extractedDate", "uprefix", "ignored_", "fmap.Author", "extractedAuthor", "fmap.content", "wdf_nocase", "literal.id", "docxpwfile", "resource.name", "password-is-Word2010.docx", "passwordsFile", "passwordRegex.properties", // Passwords-file "fmap.Last-Modified", "extractedDate"); assertU(commit()); Thread.sleep(100); assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']"); assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']"); } SolrQueryResponse loadLocalFromHandler(String handler, String filename, String... args) throws Exception { LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args); try { // TODO: stop using locally defined streams once stream.file and // stream.body work everywhere List<ContentStream> cs = new ArrayList<>(); cs.add(new ContentStreamBase.FileStream(getFile(filename))); req.setContentStreams(cs); return h.queryAndResponse(handler, req); } finally { req.close(); } } SolrQueryResponse loadLocal(String filename, String... args) throws Exception { return loadLocalFromHandler("/update/extract", filename, args); } }