/*
* This file is part of the LIRE project: http://www.semanticmetadata.net/lire
* LIRE is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* LIRE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with LIRE; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
 * We kindly ask you to cite one or more of the following publications in
 * any publication mentioning or employing Lire:
*
* Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval –
* An Extensible Java CBIR Library. In proceedings of the 16th ACM International
* Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
* URL: http://doi.acm.org/10.1145/1459359.1459577
*
* Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
* 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
* Arizona, USA, 2011
* URL: http://dl.acm.org/citation.cfm?id=2072432
*
* Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE
* Morgan & Claypool, 2013
* URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
*
* Copyright statement:
* ====================
* (c) 2002-2013 by Mathias Lux (mathias@juggle.at)
* http://www.semanticmetadata.net/lire, http://www.lire-project.net
*
* Updated: 04.05.13 11:18
*/
package net.semanticmetadata.lire.indexing;
import net.semanticmetadata.lire.DocumentBuilder;
import net.semanticmetadata.lire.ImageSearchHits;
import net.semanticmetadata.lire.ImageSearcher;
import net.semanticmetadata.lire.imageanalysis.CEDD;
import net.semanticmetadata.lire.imageanalysis.LireFeature;
import net.semanticmetadata.lire.impl.GenericImageSearcher;
import net.semanticmetadata.lire.utils.LuceneUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
/**
* This class provides an indexing approach for approximate search based on the work of G. Amato
* (giuseppe.amato@isti.cnr.it). See also his paper "Approximate Similarity Search in Metric Spaces
* using Inverted Files"
* Date: 14.05.2009
* Time: 14:22:03
*
* @author Mathias Lux, mathias@juggle.at
*/
public class MetricSpacesInvertedListIndexing {
    /** Number of reference objects sampled from the original index into the "-ro" index. */
    public static int numReferenceObjects = 500;
    /** Number of nearest reference objects used to describe (and query) each document. */
    public static int numReferenceObjectsUsed = 50;
    private static MetricSpacesInvertedListIndexing msili = new MetricSpacesInvertedListIndexing(CEDD.class, DocumentBuilder.FIELD_NAME_CEDD);
    private Class<? extends LireFeature> featureClass;
    private String featureFieldName;
    private int numHits = 100;
    private ProgressIndicator progress;

    /** Coarse processing states reported through the {@link ProgressIndicator}. */
    public enum State {
        RoSelection, RoIndexing, Indexing, Idle
    }

    /**
     * @param featureClass     the feature being used for this new index (e.g. CEDD)
     * @param featureFieldName the field hashFunctionsFileName where to find the feature.
     */
    public MetricSpacesInvertedListIndexing(Class<? extends LireFeature> featureClass, String featureFieldName) {
        this.featureClass = featureClass;
        this.featureFieldName = featureFieldName;
        progress = new ProgressIndicator();
    }

    /**
     * Returns the shared default instance, configured for the CEDD feature.
     *
     * @return the default, CEDD-based instance
     */
    public static MetricSpacesInvertedListIndexing getDefaultInstance() {
        return msili;
    }

    /**
     * Creates a set of reference objects and stores it in a new index (hashFunctionsFileName "&lt;indexPath&gt;-ro"). Then creates ordered
     * lists of reference object positions for each data item in the index with given feature.
     * Finally a new index (hashFunctionsFileName "&lt;indexPath&gt;-ms") is created where all the original documents as well as the new data
     * are stored.
     *
     * @param indexPath the path to the original index
     * @throws IOException in case the original or the newly created index cannot be accessed
     */
    public void createIndex(String indexPath) throws IOException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
        try {
            int numDocs = reader.numDocs();
            if (numDocs < numReferenceObjects) {
                throw new UnsupportedOperationException("Too few documents in index.");
            }
            // progress report
            progress.setNumDocsAll(numDocs);
            progress.setCurrentState(State.RoSelection);
            if (reader.hasDeletions()) {
                // NOTE(review): with deletions, doc ids above numDocs() are never considered
                // (ids range up to maxDoc()); optimizing the index first avoids this.
                System.err.println("WARNING: There are deleted docs in your index. You should " +
                        "optimize your index before using this method.");
            }
            // Needed to check whether a document is deleted; null if the index has no deletions.
            Bits liveDocs = MultiFields.getLiveDocs(reader);
            // init reference objects: draw numReferenceObjects distinct random doc ids.
            IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
            HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);
            double numDocsDouble = (double) numDocs;
            while (referenceObjsIds.size() < numReferenceObjects) {
                referenceObjsIds.add((int) (numDocsDouble * Math.random()));
            }
            // progress report
            progress.setCurrentState(State.RoIndexing);
            // find the selected documents and put them into a separate "-ro" index,
            // tagging each with a 1-based "ro-id" used later for the ordered lists.
            int count = 0;
            for (int i : referenceObjsIds) {
                count++;
                Document document = reader.document(i);
                document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
                iw.addDocument(document);
            }
            iw.commit();
            iw.close();
            // progress report
            progress.setCurrentState(State.Indexing);
            // now find the nearest reference objects for each entry ;)
            IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
            try {
                ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
                // "ro-order" must be tokenized on whitespace only, so the ro-ids survive analysis intact.
                Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
                analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
                PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);
                // CREATE rewrites the index; the open reader still sees the old segments while we copy.
                iw = new IndexWriter(FSDirectory.open(new File(indexPath)), new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
                StringBuilder sb = new StringBuilder(256);
                for (int i = 0; i < numDocs; i++) {
                    if (liveDocs != null && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
                    Document document = reader.document(i);
                    ImageSearchHits hits = searcher.search(document, readerRo);
                    sb.delete(0, sb.length());
                    for (int j = 0; j < numReferenceObjectsUsed; j++) {
                        sb.append(hits.doc(j).getValues("ro-id")[0]);
                        sb.append(' ');
                    }
                    document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
                    iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
                    // progress report
                    progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
                }
                iw.commit();
                iw.close();
            } finally {
                readerRo.close();
            }
        } finally {
            reader.close();
        }
        // progress report
        progress.setCurrentState(State.Idle);
    }

    /**
     * We assume that the initial indexing has been done and a set of reference objects has been
     * found and indexed in the separate fileList. However further documents were added and they
     * now need to get a ranked list of reference objects. So we (i) get all these new documents
     * missing the field "ro-order" and (ii) add this field.
     *
     * @param indexPath the index to update
     * @throws IOException in case the index cannot be accessed
     */
    public void updateIndex(String indexPath) throws IOException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
        IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
        try {
            int numDocs = reader.numDocs();
            int countUpdated = 0;
            ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
            Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
            perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
            PerFieldAnalyzerWrapper aWrapper =
                    new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);
            // Bug fix: OpenMode.CREATE wiped the index, silently dropping every document that
            // already carried "ro-order" (those are skipped below and never re-added). An update
            // must append to the existing index.
            IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)), new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper).setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND));
            StringBuilder sb = new StringBuilder(256);
            // Needed to check whether a document is deleted; null if the index has no deletions.
            Bits liveDocs = MultiFields.getLiveDocs(reader);
            for (int i = 0; i < numDocs; i++) {
                if (liveDocs != null && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
                Document document = reader.document(i);
                if (document.getField("ro-order") == null) { // if the field is not here we create it.
                    ImageSearchHits hits = searcher.search(document, readerRo);
                    sb.delete(0, sb.length());
                    for (int j = 0; j < numReferenceObjectsUsed; j++) {
                        sb.append(hits.doc(j).getValues("ro-id")[0]);
                        sb.append(' ');
                    }
                    document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
                    iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
                    countUpdated++;
                }
                // progress report
                progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
            }
            // debug (moved out of the loop: one summary line instead of one line per document):
            System.out.println("countUpdated = " + countUpdated);
            iw.commit();
            iw.close();
        } finally {
            readerRo.close();
            reader.close();
        }
    }

    /**
     * Provides basic search functions: extracts the feature from the image, ranks the reference
     * objects for it and scores the main index against that ranked list.
     *
     * @param img       the query image
     * @param indexPath the path to the main index (the "-ro" suffix is appended internally)
     * @return the ranked results as {@link TopDocs}
     * @throws IOException in case the index cannot be accessed
     */
    public TopDocs search(BufferedImage img, String indexPath) throws IOException {
        ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
        IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
        try {
            ImageSearchHits hits = searcher.search(img, readerRo);
            StringBuilder sb = new StringBuilder(numReferenceObjectsUsed * 4);
            for (int j = 0; j < numReferenceObjectsUsed; j++) {
                sb.append(hits.doc(j).getValues("ro-id")[0]);
                sb.append(' ');
            }
            IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
            try {
                return scoreDocs(sb.toString(), reader);
            } finally {
                reader.close();
            }
        } finally {
            readerRo.close();
        }
    }

    /**
     * Provides basic search functions: uses the document's stored "ro-order" field if present,
     * otherwise computes the ranked reference object list on the fly.
     *
     * @param d         the query document
     * @param indexPath the path to the main index (the "-ro" suffix is appended internally)
     * @return the ranked results as {@link TopDocs}
     * @throws IOException in case the index cannot be accessed
     */
    public TopDocs search(Document d, String indexPath) throws IOException {
        String roOrder;
        if (d.getField("ro-order") != null) { // if the document already contains the information on reference object neighbourhood
            roOrder = d.getValues("ro-order")[0];
        } else { // if not we just create it :)
            ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
            IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
            try {
                ImageSearchHits hits = searcher.search(d, readerRo);
                StringBuilder sb = new StringBuilder(numReferenceObjectsUsed * 4);
                for (int j = 0; j < numReferenceObjectsUsed; j++) {
                    sb.append(hits.doc(j).getValues("ro-id")[0]);
                    sb.append(' ');
                }
                roOrder = sb.toString();
            } finally {
                readerRo.close();
            }
        }
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
        try {
            return scoreDocs(roOrder, reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Scoring function based on the footrule distance.
     * <p>
     * NOTE(review): not yet ported from Lucene 3.x; the previous implementation is retained below
     * as a reference for the port (it relied on {@code TermPositions}, removed in Lucene 4.0).
     *
     * @param queryString the space-separated ranked list of reference object ids
     * @param reader      the reader for the index holding the "ro-order" fields
     * @return the ranked results as {@link TopDocs}
     * @throws IOException in case the index cannot be accessed
     * @throws UnsupportedOperationException always, until the Lucene 4 port is done
     */
    protected TopDocs scoreDocs(String queryString, IndexReader reader) throws IOException {
        /*
        // TODO: optimize here ;) Perhaps focus on the most promising results
        StringTokenizer st = new StringTokenizer(queryString);
        int position = 0;
        HashMap<Integer, Integer> doc2score = new HashMap<Integer, Integer>(1000);
        HashMap<Integer, Integer> doc2count = new HashMap<Integer, Integer>(1000);
        int currDoc = 0;
        while (st.hasMoreTokens()) {
            TermPositions tp = reader.termPositions(new Term("ro-order", st.nextToken()));
            while (tp.next()) {
                currDoc = tp.doc();
                // System.out.println(tp.doc() + ": " + tp.nextPosition());
                if (doc2score.get(currDoc) == null) {
                    doc2score.put(currDoc, Math.abs(tp.nextPosition() - position));
                    doc2count.put(currDoc, 1);
                } else {
                    doc2score.put(currDoc, doc2score.get(currDoc) + Math.abs(tp.nextPosition() - position));
                    doc2count.put(currDoc, doc2count.get(currDoc) + 1);
                }
            }
            position++;
        }
        int currdocscore = 0;
        int maxScore = 0, minScore = (position - 1) * position;
        TreeSet<ScoreDoc> results = new TreeSet<ScoreDoc>(new ScoreDocComparator());
        for (Iterator<Integer> iterator = doc2count.keySet().iterator(); iterator.hasNext(); ) {
            currDoc = iterator.next();
            currdocscore = (position - 1) * position - // max score ... minus actual distance.
                    (doc2score.get(currDoc) + (position - doc2count.get(currDoc)) * (position - 1));
            maxScore = Math.max(maxScore, currdocscore);
            minScore = Math.min(minScore, currdocscore);
            if (results.size() < numHits || currdocscore >= minScore) {
                results.add(new ScoreDoc(currDoc, currdocscore));
            }
        }
        while (results.size() > numHits) results.pollLast();
        return new TopDocs(Math.min(results.size(), numHits), (ScoreDoc[]) results.toArray(new ScoreDoc[results.size()]), maxScore);
        */
        throw new UnsupportedOperationException("Not supported currently in Lucene 4.0");
    }

    /** @return the maximum number of hits returned by {@link #scoreDocs(String, IndexReader)} */
    public int getNumHits() {
        return numHits;
    }

    /** @param numHits the maximum number of hits to return */
    public void setNumHits(int numHits) {
        this.numHits = numHits;
    }

    /**
     * Returns a reader for the index consisting the documents with the approximate search information.
     * The caller is responsible for closing the returned reader.
     *
     * @param indexPath the path to the index
     * @return an open {@link IndexReader} on that index
     * @throws IOException in case the index cannot be accessed
     */
    public IndexReader getIndexReader(String indexPath) throws IOException {
        return DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    }

    public ProgressIndicator getProgress() {
        return progress;
    }

    public void setProgress(ProgressIndicator progress) {
        this.progress = progress;
    }

    // ******************************************************************************
    // ** Inner class ...
    // ******************************************************************************

    /** Orders {@link ScoreDoc}s by descending score. */
    private static class ScoreDocComparator implements Comparator<ScoreDoc> {
        public int compare(ScoreDoc o1, ScoreDoc o2) {
            // Float.compare avoids the truncation/NaN pitfalls of (int) Math.signum(...)
            return Float.compare(o2.score, o1.score);
        }
    }
}