/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.queries.mlt;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

public class TestMoreLikeThis extends LuceneTestCase {
  private static final String SHOP_TYPE = "type";
  private static final String FOR_SALE = "weSell";
  private static final String NOT_FOR_SALE = "weDontSell";

  private Directory directory;
  private IndexReader reader;
  private IndexSearcher searcher;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);

    // Add series of docs with specific information for MoreLikeThis
    addDoc(writer, "lucene");
    addDoc(writer, "lucene release");
    addDoc(writer, "apache");
    addDoc(writer, "apache lucene");

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    directory.close();
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String text) throws IOException {
    Document doc = new Document();
    doc.add(newTextField("text", text, Field.Store.YES));
    writer.addDocument(doc);
  }

  private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException {
    Document doc = new Document();
    for (String text : texts) {
      doc.add(newTextField("text", text, Field.Store.YES));
    }
    writer.addDocument(doc);
  }

  public void testBoostFactor() throws Throwable {
    Map<String,Float> originalValues = getOriginalValues();
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);

    // this means that every term boost factor will be multiplied by this number
    float boostFactor = 5;
    mlt.setBoostFactor(boostFactor);

    BooleanQuery query =
        (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
    Collection<BooleanClause> clauses = query.clauses();

    assertEquals("Expected " + originalValues.size() + " clauses.",
        originalValues.size(), clauses.size());

    for (BooleanClause clause : clauses) {
      BoostQuery bq = (BoostQuery) clause.getQuery();
      TermQuery tq = (TermQuery) bq.getQuery();
      Float termBoost = originalValues.get(tq.getTerm().text());
      assertNotNull("Expected term " + tq.getTerm().text(), termBoost);

      float totalBoost = termBoost * boostFactor;
      assertEquals("Expected boost of " + totalBoost + " for term '"
          + tq.getTerm().text() + "' got " + bq.getBoost(),
          totalBoost, bq.getBoost(), 0.0001);
    }
    analyzer.close();
  }

  private Map<String,Float> getOriginalValues() throws IOException {
    Map<String,Float> originalValues = new HashMap<>();
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);

    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
    Collection<BooleanClause> clauses = query.clauses();

    for (BooleanClause clause : clauses) {
      BoostQuery bq = (BoostQuery) clause.getQuery();
      TermQuery tq = (TermQuery) bq.getQuery();
      originalValues.put(tq.getTerm().text(), bq.getBoost());
    }
    analyzer.close();
    return originalValues;
  }

  // LUCENE-3326
  public void testMultiFields() throws Exception {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text", "foobar"});
    mlt.like("foobar", new StringReader("this is a test"));
    analyzer.close();
  }

  // LUCENE-5725
  public void testMultiValues() throws Exception {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});

    BooleanQuery query = (BooleanQuery) mlt.like("text",
        new StringReader("lucene"), new StringReader("lucene release"),
        new StringReader("apache"), new StringReader("apache lucene"));
    Collection<BooleanClause> clauses = query.clauses();
    assertEquals("Expected 2 clauses only!", 2, clauses.size());
    for (BooleanClause clause : clauses) {
      Term term = ((TermQuery) clause.getQuery()).getTerm();
      assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
    }
    analyzer.close();
  }

  // just basic equals/hashcode etc
  public void testMoreLikeThisQuery() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    Query query = new MoreLikeThisQuery("this is a test", new String[] {"text"}, analyzer, "text");
    QueryUtils.check(random(), query, searcher);
    analyzer.close();
  }

  public void testTopN() throws Exception {
    int numDocs = 100;
    int topN = 25;

    // add series of docs with terms of decreasing df
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, generateStrSeq(0, i + 1));
    }
    IndexReader reader = writer.getReader();
    writer.close();

    // setup MLT query
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMaxQueryTerms(topN);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[]{"text"});

    // perform MLT query
    String likeText = "";
    for (String text : generateStrSeq(0, numDocs)) {
      likeText += text + " ";
    }
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));

    // check best terms are topN of highest idf
    Collection<BooleanClause> clauses = query.clauses();
    assertEquals("Expected " + topN + " clauses only!", topN, clauses.size());

    Term[] expectedTerms = new Term[topN];
    int idx = 0;
    for (String text : generateStrSeq(numDocs - topN, topN)) {
      expectedTerms[idx++] = new Term("text", text);
    }
    for (BooleanClause clause : clauses) {
      Term term = ((TermQuery) clause.getQuery()).getTerm();
      assertTrue(Arrays.asList(expectedTerms).contains(term));
    }

    // clean up
    reader.close();
    dir.close();
    analyzer.close();
  }

  private String[] generateStrSeq(int from, int size) {
    String[] generatedStrings = new String[size];
    for (int i = 0; i < generatedStrings.length; i++) {
      generatedStrings[i] = String.valueOf(from + i);
    }
    return generatedStrings;
  }

  private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell,
      String[] weDontSell) throws IOException {
    Document doc = new Document();
    doc.add(newTextField(SHOP_TYPE, type, Field.Store.YES));
    for (String item : weSell) {
      doc.add(newTextField(FOR_SALE, item, Field.Store.YES));
    }
    for (String item : weDontSell) {
      doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES));
    }
    writer.addDocument(doc);
    return writer.numDocs() - 1;
  }

  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-7161")
  public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception {
    IndexReader reader = null;
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    try {
      int maxQueryTerms = 25;

      String[] itShopItemForSale = new String[]{"watch", "ipod", "asrock", "imac",
          "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
      String[] itShopItemNotForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};

      String[] clothesShopItemForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
      String[] clothesShopItemNotForSale = new String[]{"watch", "ipod", "asrock", "imac",
          "macbookpro", "monitor", "keyboard", "mouse", "speakers"};

      // add series of shop docs
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
      for (int i = 0; i < 300; i++) {
        addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale);
      }
      for (int i = 0; i < 300; i++) {
        addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
      }
      // Input Document is a clothes shop
      int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
      reader = writer.getReader();
      writer.close();

      // setup MLT query
      MoreLikeThis mlt = new MoreLikeThis(reader);
      mlt.setAnalyzer(analyzer);
      mlt.setMaxQueryTerms(maxQueryTerms);
      mlt.setMinDocFreq(1);
      mlt.setMinTermFreq(1);
      mlt.setMinWordLen(1);
      mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE});

      // perform MLT query
      BooleanQuery query = (BooleanQuery) mlt.like(inputDocId);
      Collection<BooleanClause> clauses = query.clauses();

      Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>();
      for (String itemForSale : clothesShopItemForSale) {
        BooleanClause booleanClause = new BooleanClause(
            new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD);
        expectedClothesShopClauses.add(booleanClause);
      }
      for (String itemNotForSale
          : clothesShopItemNotForSale) {
        BooleanClause booleanClause = new BooleanClause(
            new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD);
        expectedClothesShopClauses.add(booleanClause);
      }

      for (BooleanClause expectedClause : expectedClothesShopClauses) {
        assertTrue(clauses.contains(expectedClause));
      }
    } finally {
      // clean up
      if (reader != null) {
        reader.close();
      }
      dir.close();
      analyzer.close();
    }
  }

  // TODO: add tests for the MoreLikeThisQuery
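  // A hedged sketch toward the TODO above, not an existing Lucene test: it exercises
  // value equality of MoreLikeThisQuery through QueryUtils.checkEqual/checkUnequal,
  // assuming MoreLikeThisQuery implements equals/hashCode over its constructor
  // arguments. The method name testMoreLikeThisQueryEquality is introduced here.
  public void testMoreLikeThisQueryEquality() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    // two queries built from identical arguments should compare equal
    Query first = new MoreLikeThisQuery("this is a test", new String[] {"text"}, analyzer, "text");
    Query second = new MoreLikeThisQuery("this is a test", new String[] {"text"}, analyzer, "text");
    // a query built from different like-text should not
    Query other = new MoreLikeThisQuery("something else entirely", new String[] {"text"}, analyzer, "text");
    QueryUtils.checkEqual(first, second);
    QueryUtils.checkUnequal(first, other);
    analyzer.close();
  }
}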