/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler; import org.apache.lucene.analysis.MockTokenizer; import org.apache.solr.client.solrj.request.DocumentAnalysisRequest; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.common.util.NamedList; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequestBase; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import java.util.Collections; import java.util.List; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.IOException; import java.io.Reader; import java.nio.charset.StandardCharsets; /** * A test for {@link DocumentAnalysisRequestHandler}. * * * @since solr 1.4 */ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestBase { private DocumentAnalysisRequestHandler handler; @BeforeClass public static void beforeClass() throws Exception { initCore("solrconfig.xml", "schema.xml"); } @Override @Before public void setUp() throws Exception { super.setUp(); handler = new DocumentAnalysisRequestHandler(); handler.init(new NamedList()); } /** * Tests the {@link DocumentAnalysisRequestHandler#resolveAnalysisRequest(org.apache.solr.request.SolrQueryRequest)} */ @Test public void testResolveAnalysisRequest() throws Exception { String docsInput = "<docs>" + "<doc>" + "<field name=\"id\">1</field>" + "<field name=\"whitetok\">The Whitetok</field>" + "<field name=\"text\">The Text</field>" + "</doc>" + "</docs>"; final ContentStream cs = new ContentStreamBase.StringStream(docsInput); ModifiableSolrParams params = new ModifiableSolrParams(); params.add("analysis.query", "The Query String"); params.add("analysis.showmatch", "true"); SolrQueryRequest req = new SolrQueryRequestBase(h.getCore(), params) { @Override public Iterable<ContentStream> getContentStreams() { return Collections.singleton(cs); } }; DocumentAnalysisRequest request = handler.resolveAnalysisRequest(req); assertNotNull(request); assertTrue(request.isShowMatch()); assertNotNull(request.getQuery()); assertEquals("The Query String", request.getQuery()); List<SolrInputDocument> documents = request.getDocuments(); assertNotNull(documents); assertEquals(1, documents.size()); SolrInputDocument document = documents.get(0); SolrInputField field = document.getField("id"); assertNotNull(field); assertEquals("1", field.getFirstValue()); field = document.getField("whitetok"); assertNotNull(field); assertEquals("The Whitetok", field.getFirstValue()); field = document.getField("text"); assertNotNull(field); assertEquals("The Text", field.getFirstValue()); req.close(); } /** A binary-only ContentStream */ static class ByteStream extends ContentStreamBase { private final byte[] bytes; public ByteStream(byte[] bytes, String contentType) { this.bytes = bytes; this.contentType = contentType; name = null; size = Long.valueOf(bytes.length); sourceInfo = "rawBytes"; } @Override public InputStream getStream() throws IOException { return new ByteArrayInputStream(bytes); } @Override public Reader getReader() throws IOException { throw new IOException("This is a byte stream, Readers are not supported."); } } // This test should also test charset detection in UpdateRequestHandler, // but the DocumentAnalysisRequestHandler is simplier to use/check. @Test public void testCharsetInDocument() throws Exception { final byte[] xmlBytes = ( "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\r\n" + "<docs>\r\n" + " <doc>\r\n" + " <field name=\"id\">Müller</field>\r\n" + " </doc>" + "</docs>" ).getBytes(StandardCharsets.ISO_8859_1); // we declare a content stream without charset: final ContentStream cs = new ByteStream(xmlBytes, "application/xml"); ModifiableSolrParams params = new ModifiableSolrParams(); SolrQueryRequest req = new SolrQueryRequestBase(h.getCore(), params) { @Override public Iterable<ContentStream> getContentStreams() { return Collections.singleton(cs); } }; DocumentAnalysisRequest request = handler.resolveAnalysisRequest(req); assertNotNull(request); final List<SolrInputDocument> documents = request.getDocuments(); assertNotNull(documents); assertEquals(1, documents.size()); SolrInputDocument doc = documents.get(0); assertEquals("Müller", doc.getField("id").getValue()); } // This test should also test charset detection in UpdateRequestHandler, // but the DocumentAnalysisRequestHandler is simplier to use/check. @Test public void testCharsetOutsideDocument() throws Exception { final byte[] xmlBytes = ( "<docs>\r\n" + " <doc>\r\n" + " <field name=\"id\">Müller</field>\r\n" + " </doc>" + "</docs>" ).getBytes(StandardCharsets.ISO_8859_1); // we declare a content stream with charset: final ContentStream cs = new ByteStream(xmlBytes, "application/xml; charset=ISO-8859-1"); ModifiableSolrParams params = new ModifiableSolrParams(); SolrQueryRequest req = new SolrQueryRequestBase(h.getCore(), params) { @Override public Iterable<ContentStream> getContentStreams() { return Collections.singleton(cs); } }; DocumentAnalysisRequest request = handler.resolveAnalysisRequest(req); assertNotNull(request); final List<SolrInputDocument> documents = request.getDocuments(); assertNotNull(documents); assertEquals(1, documents.size()); SolrInputDocument doc = documents.get(0); assertEquals("Müller", doc.getField("id").getValue()); } /** * Tests the {@link DocumentAnalysisRequestHandler#handleAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest, * org.apache.solr.schema.IndexSchema)} */ @Test public void testHandleAnalysisRequest() throws Exception { SolrInputDocument document = new SolrInputDocument(); document.addField("id", 1); document.addField("whitetok", "Jumping Jack"); document.addField("text", "The Fox Jumped Over The Dogs"); DocumentAnalysisRequest request = new DocumentAnalysisRequest() .setQuery("JUMPING") .setShowMatch(true) .addDocument(document); NamedList<Object> result = handler.handleAnalysisRequest(request, h.getCore().getLatestSchema()); assertNotNull("result is null and it shouldn't be", result); NamedList<NamedList<NamedList<Object>>> documentResult = (NamedList<NamedList<NamedList<Object>>>) result.get("1"); assertNotNull("An analysis for document with key '1' should be returned", documentResult); // the id field NamedList<NamedList<Object>> idResult = documentResult.get("id"); assertNotNull("an analysis for the 'id' field should be returned", idResult); NamedList<Object> queryResult; List<NamedList> tokenList; NamedList<Object> indexResult; NamedList<List<NamedList>> valueResult; /*** Much of this test seems invalid for a numeric "id" field NamedList<Object> queryResult = idResult.get("query"); assertEquals("Only the default analyzer should be applied", 1, queryResult.size()); String name = queryResult.getName(0); assertTrue("Only the default analyzer should be applied", name.matches("org.apache.solr.schema.FieldType\\$DefaultAnalyzer.*")); List<NamedList> tokenList = (List<NamedList>) queryResult.getVal(0); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, new int[]{1}, null, false)); NamedList<Object> indexResult = idResult.get("index"); assertEquals("The id field has only a single value", 1, indexResult.size()); NamedList<List<NamedList>> valueResult = (NamedList<List<NamedList>>) indexResult.get("1"); assertEquals("Only the default analyzer should be applied", 1, valueResult.size()); name = queryResult.getName(0); assertTrue("Only the default analyzer should be applied", name.matches("org.apache.solr.schema.FieldType\\$DefaultAnalyzer.*")); tokenList = valueResult.getVal(0); assertEquals("The 'id' field value has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("1", null, "word", 0, 1, 1, new int[]{1}, null, false)); ***/ // the name field NamedList<NamedList<Object>> whitetokResult = documentResult.get("whitetok"); assertNotNull("an analysis for the 'whitetok' field should be returned", whitetokResult); queryResult = whitetokResult.get("query"); tokenList = (List<NamedList>) queryResult.get(MockTokenizer.class.getName()); assertNotNull("Expecting the 'MockTokenizer' to be applied on the query for the 'whitetok' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, new int[]{1}, null, false)); indexResult = whitetokResult.get("index"); assertEquals("The 'whitetok' field has only a single value", 1, indexResult.size()); valueResult = (NamedList<List<NamedList>>) indexResult.get("Jumping Jack"); tokenList = valueResult.getVal(0); assertEquals("Expecting 2 tokens to be present", 2, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("Jumping", null, "word", 0, 7, 1, new int[]{1}, null, false)); assertToken(tokenList.get(1), new TokenInfo("Jack", null, "word", 8, 12, 2, new int[]{2}, null, false)); // the text field NamedList<NamedList<Object>> textResult = documentResult.get("text"); assertNotNull("an analysis for the 'text' field should be returned", textResult); queryResult = textResult.get("query"); tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.standard.StandardTokenizer"); assertNotNull("Expecting the 'StandardTokenizer' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[]{1}, null, false)); tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.standard.StandardFilter"); assertNotNull("Expecting the 'StandardFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1}, null, false)); tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.LowerCaseFilter"); assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1}, null, false)); tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.StopFilter"); assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1,1}, null, false)); tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expecting the 'PorterStemFilter' to be applied on the query for the 'text' field", tokenList); assertEquals("Query has only one token", 1, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("jump", null, "<ALPHANUM>", 0, 7, 1, new int[]{1,1,1,1,1}, null, false)); indexResult = textResult.get("index"); assertEquals("The 'text' field has only a single value", 1, indexResult.size()); valueResult = (NamedList<List<NamedList>>) indexResult.get("The Fox Jumped Over The Dogs"); tokenList = valueResult.get("org.apache.lucene.analysis.standard.StandardTokenizer"); assertNotNull("Expecting the 'StandardTokenizer' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 6 tokens", 6, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, new int[]{1}, null, false)); assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2}, null, false)); assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3}, null, false)); assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4}, null, false)); assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[]{5}, null, false)); assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6}, null, false)); tokenList = valueResult.get("org.apache.lucene.analysis.standard.StandardFilter"); assertNotNull("Expecting the 'StandardFilter' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 6 tokens", 6, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1}, null, false)); assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2}, null, false)); assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3}, null, false)); assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4}, null, false)); assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5}, null, false)); assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6}, null, false)); tokenList = valueResult.get("org.apache.lucene.analysis.core.LowerCaseFilter"); assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 6 tokens", 6, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[]{1,1,1}, null, false)); assertToken(tokenList.get(1), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2}, null, false)); assertToken(tokenList.get(2), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3}, null, false)); assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4}, null, false)); assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, new int[]{5,5,5}, null, false)); assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6}, null, false)); tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter"); assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false)); assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3}, null, false)); assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4}, null, false)); assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6}, null, false)); tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList); assertEquals("Expecting 4 tokens", 4, tokenList.size()); assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2,2}, null, false)); assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3,3}, null, true)); assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4,4}, null, false)); assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6,6}, null, false)); } }