/* Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.

Contact:
    SYSTAP, LLC DBA Blazegraph
    2501 Calvert ST NW #106
    Washington, DC 20008
    licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
 * Created on Apr 3, 2008
 */
package com.bigdata.search;

import java.io.StringReader;
import java.util.Iterator;
import java.util.Properties;
import java.util.concurrent.TimeUnit;

import com.bigdata.rdf.lexicon.ITextIndexer.FullTextQuery;

/**
 * Test suite using examples based on <a
 * href="http://www.ec-securehost.com/SIAM/SE17.html"><i>Understanding Search
 * Engines</i></a> by Barry and Browne. I recommend the book as a good
 * overview of search engine basis and an excellent reader for latent semantic
 * indexing (Barry was one of the original people involved with LSI).
 * <p>
 * There is a worksheet <code>src/architecture/search.xls</code> that gives
 * the background for this test suite.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TestSearch extends AbstractSearchTest {

    public TestSearch() {
        super();
    }

    public TestSearch(String name) {
        super(name);
    }

    /**
     * The example documents to be indexed, one string per document.
     * <p>
     * Note: the examples have been modified to only expose the terms that were
     * accepted by the indexer used in the book. Since the authors were using
     * stemming, the examples are pre-stemmed. This just reduces the complexity
     * of the system under test.
     */
    final String[] docs = new String[] {
            /* Infant & Toddler First Aid */
            "Infant Toddler",//
            /* Babies Children's Room (For Your Home) */
            "Bab Child Home",//
            /* Child Safety at Home */
            "Child Safety Home",//
            /* Your Baby's Health and Safety: From Infant to Toddler */
            "Bab Health Safety Infant Toddler",//
            /* Baby Proofing Basics */
            "Bab Proofing",
            /* Your Guide To Easy Rust Proofing */
            "Guide Proofing",
            /* Beanie Babies Collector's Guide */
            "Bab Guide"
    };

    /**
     * Overrides some properties to setup the {@link FullTextIndex} configuration.
     *
     * @return The configuration properties for the test index.
     */
    @Override
    public Properties getProperties() {

        final Properties properties = new Properties(super.getProperties());

        /*
         * TODO Configure and test with various local, global, and document
         * normalization methods.
         */

        return properties;

    }

    /**
     * Indexes the example documents (docIds 1..7, assigned in array order) and
     * verifies the ranked hits for the query "child proofing" against the
     * cosine values worked out in <code>src/architecture/search.xls</code>.
     *
     * @throws InterruptedException
     *             if the search is interrupted.
     */
    public void test_ChildProofing() throws InterruptedException {

        /** all documents are in English. */
        final String languageCode = "EN";

        final boolean prefixMatch = false;
        final double minCosine = .0;
        final double maxCosine = 1.0d;
        final int minRank = 1;
        final int maxRank = Integer.MAX_VALUE;// (was 10000)
        final boolean matchAllTerms = false;
        final long timeout = Long.MAX_VALUE;
        final TimeUnit unit = TimeUnit.MILLISECONDS;
        final String regex = null;

        init();

        {

            /*
             * Index the documents.
             */

            long docId = 1;

            final int fieldId = 0;

            final TokenBuffer<Long> buffer = new TokenBuffer<Long>(docs.length,
                    getNdx());

            for (String s : docs) {

                getNdx().index(buffer, Long.valueOf(docId++), fieldId,
                        languageCode, new StringReader(s));

            }

            // flush index writes to the database.
            buffer.flush();

        }

        // run query and verify results.
        {

            final String query = "child proofing";

            final Hiterator<Hit<Long>> itr = getNdx().search(new FullTextQuery(
                    query, languageCode, prefixMatch, regex, matchAllTerms,
                    false/* matchExact */, minCosine, maxCosine, minRank,
                    maxRank, timeout, unit));

            // Expected hits in rank order (ties on cosine broken by docId).
            assertSameHits(new IHit[] { //
                    new HT<Long>(5L, 0.44194173824159216d),//
                    new HT<Long>(6L, 0.44194173824159216d),//
                    new HT<Long>(2L, 0.35355339059327373d),//
                    new HT<Long>(3L, 0.35355339059327373d),//
            }, itr);

        }

    }

    /**
     * Compares the hit list to the expected hit list.
     * <p>
     * Note: Ties on cosine are broken by sorting the ties into increasing order
     * by docId.
     *
     * @param hits
     *            The expected hits.
     * @param itr
     *            The iterator visiting the actual hits.
     */
    protected void assertSameHits(final IHit[] hits,
            final Iterator<? extends IHit> itr) {

        final int nhits = hits.length;

        for (int i = 0; i < nhits; i++) {

            assertTrue("Iterator exhausted after " + (i) + " hits out of "
                    + nhits, itr.hasNext());

            final IHit expected = hits[i];

            final IHit actual = itr.next();

            if (log.isInfoEnabled())
                log.info("rank=" + (i + 1) + ", expected=" + expected
                        + ", actual: " + actual);

            // first check the document.
            assertEquals("wrong document: rank=" + (i + 1),
                    expected.getDocId(), actual.getDocId());

            /*
             * Verify the cosine.
             *
             * Note: This allows for some variation in the computed cosine. More
             * variation will be present when the local term weights in the
             * index are stored using single precision (versus double precision)
             * values.
             */
            assertEquals("wrong cosine: rank=" + (i + 1),
                    expected.getCosine(), actual.getCosine(), .01d);

        }

        assertFalse("Iterator will visit too many hits - only " + nhits
                + " are expected", itr.hasNext());

    }

    /**
     * A simple (docId, cosine) pair used to express an expected hit. The rank
     * is not modeled ({@link #getRank()} always returns zero) since
     * {@link #assertSameHits(IHit[], Iterator)} derives the expected rank from
     * the array position.
     */
    private static class HT<V extends Comparable<V>> implements IHit<V> {

        final private V docId;

        final private double cosine;

        public HT(final V docId, final double cosine) {

            if (docId == null)
                throw new IllegalArgumentException();

            this.docId = docId;

            this.cosine = cosine;

        }

        @Override
        public int getRank() {
            // not modeled for the expected hits.
            return 0;
        }

        @Override
        public double getCosine() {
            return cosine;
        }

        @Override
        public V getDocId() {
            return docId;
        }

        @Override
        public String toString() {
            return "{docId=" + docId + ",cosine=" + cosine + "}";
        }

    }

}