TestRerankTextSearch.java example

Explorer

lire-master
- samples
  - classifier
    - src
      - net
        semanticmetadata
        lire
        classifiers
        ClassifierTest.java
        HashingSearchBasedClassifierMod.java
  - liredemo
    - src
      - main
        java
        edu
        uniklu
        itec
        mosaix
        ImageFunctions.java
        engine
        Engine.java
        EngineObserver.java
        Experimental.java
        LeastUsedWeightingStrategy.java
        Logging.java
        ProportionWeightingStrategy.java
        RandomWeightingStrategy.java
        SimpleWeightingData.java
        SimpleWeightingDataFactory.java
        WeightingData.java
        WeightingDataFactory.java
        WeightingStrategy.java
        liredemo
        ImagePanel.java
        IndexingThread.java
        LireDemoFrame.java
        Main.java
        ProgressMonitor.java
        SearchResultsTableModel.java
        flickr
        FlickrDownloadThread.java
        FlickrIndexingThread.java
        FlickrPhoto.java
        FlickrPhotoGrabber.java
        indexing
        MetadataBuilder.java
        ParallelIndexer.java
      - test
        java
        liredemo
        flickr
        FlickrPhotoGrabberTest.java
  - simpleapplication
    - src
      - main
        java
        net
        semanticmetadata
        lire
        sampleapp
        CreateARFFFile.java
        ExtractFeatures.java
        ExtractMultipleFeatures.java
        ExtractSingleFeature.java
        Indexer.java
        IndexingAndSearchWithLocalFeatures.java
        ParallelIndexing.java
        Searcher.java
  - teaching
    - src
      - main
        java
        samples
        Indexing.java
        Search.java
- src

/*
 * This file is part of the LIRE project: http://www.semanticmetadata.net/lire
 * LIRE is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * LIRE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LIRE; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * We kindly ask you to refer to any one or more of the following publications in
 * any publication mentioning or employing Lire:
 *
 * Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval –
 * An Extensible Java CBIR Library. In proceedings of the 16th ACM International
 * Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
 * URL: http://doi.acm.org/10.1145/1459359.1459577
 *
 * Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
 * 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
 * Arizona, USA, 2011
 * URL: http://dl.acm.org/citation.cfm?id=2072432
 *
 * Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE
 * Morgan & Claypool, 2013
 * URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2013 by Mathias Lux (mathias@juggle.at)
 *     http://www.semanticmetadata.net/lire, http://www.lire-project.net
 */

package net.semanticmetadata.lire.lucene;

import junit.framework.TestCase;
import net.semanticmetadata.lire.DocumentBuilder;
import net.semanticmetadata.lire.DocumentBuilderFactory;
import net.semanticmetadata.lire.ImageSearchHits;
import net.semanticmetadata.lire.filter.RerankFilter;
import net.semanticmetadata.lire.imageanalysis.FCTH;
import net.semanticmetadata.lire.utils.FileUtils;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * A test class for image based re-ranking of results from a text search with Lucene.
 * It's based on JUnit (it's either included in your IDE or just google for the jar).
 * The test data employed is in the LIRE SVN at
 * https://code.google.com/p/lire/source/checkout
 * <p>
 * {@link #testIndexing()} builds the index (image features plus a "tags" text field) and
 * {@link #testSearch()} queries it, so the index has to exist before the search test is run.
 *
 * @author Mathias Lux mathias@juggle.at
 *
 */
public class TestRerankTextSearch extends TestCase {
    // name of the Lucene field holding the textual tags of an image.
    private static final String TAGS_FIELD = "tags";
    // Lucene document id of the document used as query. Document ids are zero-based,
    // so this is the second document in the index.
    private static final int QUERY_DOC_ID = 1;

    // that's where we put the index for testing:
    private final File testIndex = new File("textindextest");

    /**
     * Indexes all images below <code>testdata/ferrari</code>: each Lucene document gets the FCTH
     * feature (via the LIRE DocumentBuilder) and the tags read from the XML file lying next to the image.
     *
     * @throws IOException                  if an image, an XML file or the index cannot be read or written.
     * @throws ParserConfigurationException if no SAX parser can be created.
     * @throws SAXException                 if one of the XML files cannot be parsed.
     */
    public void testIndexing() throws IOException, ParserConfigurationException, SAXException {
        IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_42, new SimpleAnalyzer(Version.LUCENE_42));
        // if you want to append the index to a pre-existing one use the next line. Note that it has to
        // be called before the IndexWriter is created, later changes to the config are not picked up.
        // iwConf.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        // create a LIRE DocumentBuilder for extracting FCTH (just an example, every other feature will do).
        DocumentBuilder builder = DocumentBuilderFactory.getFCTHDocumentBuilder();
        ArrayList<File> files = FileUtils.getAllImageFiles(new File("testdata/ferrari"), true);
        // NOTE(review): getAllImageFiles presumably returns null if nothing is found -- fail with a
        // readable message instead of a NullPointerException in that case.
        assertNotNull("No test images found in testdata/ferrari", files);
        // for handling the XML of the test data set
        SAXParserFactory spf = SAXParserFactory.newInstance();
        spf.setNamespaceAware(true);
        SAXParser saxParser = spf.newSAXParser();
        XMLReader xmlReader = saxParser.getXMLReader();

        FSDirectory directory = FSDirectory.open(testIndex);
        try {
            IndexWriter iw = new IndexWriter(directory, iwConf);
            try {
                for (File img : files) {
                    String path = img.getCanonicalPath();
                    // create the document with the LIRE DocumentBuilder, this adds the image features to the document.
                    Document d;
                    FileInputStream imageStream = new FileInputStream(img);
                    try {
                        d = builder.createDocument(imageStream, path);
                    } finally {
                        // we opened the stream, so we are the ones to close it.
                        imageStream.close();
                    }
                    // handling the XML of the test data set: same file name, but with extension .xml
                    String xmlPath = path.substring(0, path.lastIndexOf('.')) + ".xml";
                    TagHandler handler = new TagHandler();
                    xmlReader.setContentHandler(handler);
                    xmlReader.parse(new InputSource(new File(xmlPath).toURI().toString()));
                    // add the text to the document ...
                    d.add(new TextField(TAGS_FIELD, handler.getTags(), Field.Store.YES));
                    // don't forget to add the document to the index.
                    iw.addDocument(d);
                }
            } finally {
                // always close the writer, otherwise the write lock on the index stays behind.
                iw.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Runs a text search with the tags of one indexed document as query, prints the hits, re-ranks
     * them based on the FCTH feature of the same document and prints the re-ranked list. Both result
     * lists are also written to HTML and opened in a browser.
     *
     * @throws IOException    if the index cannot be read.
     * @throws ParseException if the tags cannot be parsed to a query.
     */
    public void testSearch() throws IOException, ParseException {
        FSDirectory directory = FSDirectory.open(testIndex);
        try {
            // create a Lucene IndexReader and the according IndexSearcher:
            IndexReader reader = DirectoryReader.open(directory);
            try {
                assertTrue("The index needs at least " + (QUERY_DOC_ID + 1) + " documents, run testIndexing() first.",
                        reader.maxDoc() > QUERY_DOC_ID);
                IndexSearcher searcher = new IndexSearcher(reader);
                // the query document is needed several times, so we load it just once.
                Document queryDoc = reader.document(QUERY_DOC_ID);
                String queryIdentifier = queryDoc.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
                // The QueryParser takes a String and creates a query out of it. Make sure you use the same field
                // as for indexing, in this case "tags"
                QueryParser q = new QueryParser(Version.LUCENE_42, TAGS_FIELD, new SimpleAnalyzer(Version.LUCENE_42));
                // let's just take the tags of the query document. They are plain text and not a query
                // expression, so characters with a special meaning for the query parser are escaped.
                Query query = q.parse(QueryParser.escape(queryDoc.getValues(TAGS_FIELD)[0]));
                // now that's the actual search:
                // NOTE: The number of results here is critical. The less documents are returned here, the
                // less the image re-ranking can mess up. However, the recall (the absolute number of relevant
                // documents returned) is also influenced by this. Best to try several values like 10, 100, 200, 500, ...
                TopDocs results = searcher.search(query, 10);
                // here we print the results of the text search, just for the win.
                System.out.println("-----------> SEARCH RESULTS ...");
                for (int i = 0; i < results.scoreDocs.length; i++) {
                    ScoreDoc scoreDoc = results.scoreDocs[i];
                    Document hit = reader.document(scoreDoc.doc);
                    System.out.print(scoreDoc.score + "\t: ");
                    // the field DocumentBuilder.FIELD_NAME_IDENTIFIER gets you the actual image file path.
                    // LIRE manages all needed field names as static Strings in DocumentBuilder ...
                    System.out.print(hit.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " -> ");
                    System.out.println(hit.getValues(TAGS_FIELD)[0]);
                }
                // just for a visual example ... this will pop up a browser window
                FileUtils.browseUri(FileUtils.saveImageResultsToHtml("text", results, reader, queryIdentifier));

                // and now for the re-ranking:
                // make sure to use a low level feature that has been indexed -- check the DocumentBuilder in above method.
                RerankFilter rerank = new RerankFilter(FCTH.class, DocumentBuilder.FIELD_NAME_FCTH);
                // note that you need the document here, it contains the low level feature ...
                // if you don't have it but just the image you need to create a new one with the
                // appropriate DocumentBuilder -- check the DocumentBuilder in above method.
                ImageSearchHits hitsReranked = rerank.filter(results, reader, queryDoc);
                // and here we print the re-ranked hits:
                System.out.println("-----------> RERANKED ...");
                for (int i = 0; i < hitsReranked.length(); i++) {
                    Document hit = hitsReranked.doc(i);
                    System.out.print(hitsReranked.score(i) + "\t: ");
                    System.out.print(hit.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " -> ");
                    System.out.println(hit.getValues(TAGS_FIELD)[0]);
                }
                // just for a visual example ... this will pop up a browser window.
                FileUtils.browseUri(FileUtils.saveImageResultsToHtml("reranked", hitsReranked, queryIdentifier));
            } finally {
                reader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * SAX handler for the XML of the test data set, we just want the tags: the text of every element
     * whose local name starts with "tag" is collected, separated by blanks.
     * No need for that if you get the text elsewhere.
     * It is static as it does not use anything of the enclosing test instance.
     */
    static class TagHandler extends DefaultHandler {
        final StringBuilder sb = new StringBuilder(1024);
        // true while the parser is in between the start and the end of a tag element.
        boolean inTag = false;

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            // localName is only set if the parser is namespace aware, see testIndexing().
            if (localName.startsWith("tag")) inTag = true;
            super.startElement(uri, localName, qName, attributes);
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (localName.startsWith("tag")) {
                inTag = false;
                // separate the tag just finished from the next one.
                sb.append(' ');
            }
            super.endElement(uri, localName, qName);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // the text of one element may be delivered in several chunks, so we append.
            if (inTag) {
                sb.append(ch, start, length);
            }
        }

        /**
         * @return all tags collected so far as a single String separated by blanks, without
         *         leading or trailing white space.
         */
        public String getTags() {
            return sb.toString().trim();
        }
    }
}