/* Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.

Contact:
    SYSTAP, LLC DBA Blazegraph
    2501 Calvert ST NW #106
    Washington, DC 20008
    licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
 * Created on Apr 3, 2008
 */
package com.bigdata.search;

import java.io.StringReader;
import java.util.Iterator;
import java.util.Properties;
import java.util.concurrent.TimeUnit;

import com.bigdata.rdf.lexicon.ITextIndexer.FullTextQuery;

/**
 * Test suite using examples based on <a
 * href="http://www.ec-securehost.com/SIAM/SE17.html"><i>Understanding Search
 * Engines</i></a> by Barry and Browne. I recommend the book as a good
 * overview of search engine basis and an excellent reader for latent semantic
 * indexing (Barry was one of the original people involved with LSI).
 * <p>
 * There is a worksheet <code>src/architecture/search.xls</code> that gives
 * the background for this test suite.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TestSearch extends AbstractSearchTest {

    public TestSearch() {
        super();
    }

    public TestSearch(String name) {
        super(name);
    }

    /**
     * The example documents to be indexed, one string per document.
     * <p>
     * Note: the examples have been modified to only expose the terms that were
     * accepted by the indexer used in the book. Since the authors were using
     * stemming, the examples are pre-stemmed. This just reduces the complexity
     * of the system under test.
     */
    final String[] docs = new String[] {
            /* Infant & Toddler First Aid */
            "Infant Toddler",//
            /* Babies Children's Room (For Your Home) */
            "Bab Child Home",//
            /* Child Safety at Home */
            "Child Safety Home",//
            /* Your Baby's Health and Safety: From Infant to Toddler */
            "Bab Health Safety Infant Toddler",//
            /* Baby Proofing Basics */
            "Bab Proofing",
            /* Your Guide To Easy Rust Proofing */
            "Guide Proofing",
            /* Beanie Babies Collector's Guide */
            "Bab Guide"
    };

    /**
     * Overrides some properties to setup the {@link FullTextIndex} configuration.
     *
     * @return The configuration properties for the test index.
     */
    @Override
    public Properties getProperties() {

        final Properties properties = new Properties(super.getProperties());

        /*
         * TODO Configure and test with various local, global, and document
         * normalization methods.
         */

        return properties;

    }

    /**
     * Indexes the example documents (docIds 1..7, assigned in array order) and
     * verifies the ranked hits for the query "child proofing" against the
     * cosine values worked out in <code>src/architecture/search.xls</code>.
     *
     * @throws InterruptedException
     *             if the search is interrupted.
     */
    public void test_ChildProofing() throws InterruptedException {

        /** all documents are in English. */
        final String languageCode = "EN";

        final boolean prefixMatch = false;
        final double minCosine = .0;
        final double maxCosine = 1.0d;
        final int minRank = 1;
        final int maxRank = Integer.MAX_VALUE;// (was 10000)
        final boolean matchAllTerms = false;
        final long timeout = Long.MAX_VALUE;
        final TimeUnit unit = TimeUnit.MILLISECONDS;
        final String regex = null;

        init();

        {

            /*
             * Index the documents.
             */

            long docId = 1;

            final int fieldId = 0;

            final TokenBuffer<Long> buffer = new TokenBuffer<Long>(docs.length,
                    getNdx());

            for (String s : docs) {

                getNdx().index(buffer, Long.valueOf(docId++), fieldId,
                        languageCode, new StringReader(s));

            }

            // flush index writes to the database.
            buffer.flush();

        }

        // run query and verify results.
        {

            final String query = "child proofing";

            final Hiterator<Hit<Long>> itr = getNdx().search(new FullTextQuery(
                    query, languageCode, prefixMatch, regex, matchAllTerms,
                    false/* matchExact */, minCosine, maxCosine, minRank,
                    maxRank, timeout, unit));

            // Expected hits in rank order (ties on cosine broken by docId).
            assertSameHits(new IHit[] { //
                    new HT<Long>(5L, 0.44194173824159216d),//
                    new HT<Long>(6L, 0.44194173824159216d),//
                    new HT<Long>(2L, 0.35355339059327373d),//
                    new HT<Long>(3L, 0.35355339059327373d),//
            }, itr);

        }

    }

    /**
     * Compares the hit list to the expected hit list.
     * <p>
     * Note: Ties on cosine are broken by sorting the ties into increasing order
     * by docId.
     *
     * @param hits
     *            The expected hits.
     * @param itr
     *            The iterator visiting the actual hits.
     */
    protected void assertSameHits(final IHit[] hits,
            final Iterator<? extends IHit> itr) {

        final int nhits = hits.length;

        for (int i = 0; i < nhits; i++) {

            assertTrue("Iterator exhausted after " + (i) + " hits out of "
                    + nhits, itr.hasNext());

            final IHit expected = hits[i];

            final IHit actual = itr.next();

            if (log.isInfoEnabled())
                log.info("rank=" + (i + 1) + ", expected=" + expected
                        + ", actual: " + actual);

            // first check the document.
            assertEquals("wrong document: rank=" + (i + 1),
                    expected.getDocId(), actual.getDocId());

            /*
             * Verify the cosine.
             *
             * Note: This allows for some variation in the computed cosine. More
             * variation will be present when the local term weights in the
             * index are stored using single precision (versus double precision)
             * values.
             */
            assertEquals("wrong cosine: rank=" + (i + 1),
                    expected.getCosine(), actual.getCosine(), .01d);

        }

        assertFalse("Iterator will visit too many hits - only " + nhits
                + " are expected", itr.hasNext());

    }

    /**
     * A simple (docId, cosine) pair used to express an expected hit. The rank
     * is not modeled ({@link #getRank()} always returns zero) since
     * {@link #assertSameHits(IHit[], Iterator)} derives the expected rank from
     * the array position.
     */
    private static class HT<V extends Comparable<V>> implements IHit<V> {

        final private V docId;

        final private double cosine;

        public HT(final V docId, final double cosine) {

            if (docId == null)
                throw new IllegalArgumentException();

            this.docId = docId;

            this.cosine = cosine;

        }

        @Override
        public int getRank() {
            // not modeled for the expected hits.
            return 0;
        }

        @Override
        public double getCosine() {
            return cosine;
        }

        @Override
        public V getDocId() {
            return docId;
        }

        @Override
        public String toString() {
            return "{docId=" + docId + ",cosine=" + cosine + "}";
        }

    }

}