GlobalDocumentBuilder.java example

Explorer

lire-master
- samples
  - classifier
    - src
      - net
        semanticmetadata
        lire
        classifiers
        ClassifierTest.java
        HashingSearchBasedClassifierMod.java
  - liredemo
    - src
      - main
        java
        edu
        uniklu
        itec
        mosaix
        ImageFunctions.java
        engine
        Engine.java
        EngineObserver.java
        Experimental.java
        LeastUsedWeightingStrategy.java
        Logging.java
        ProportionWeightingStrategy.java
        RandomWeightingStrategy.java
        SimpleWeightingData.java
        SimpleWeightingDataFactory.java
        WeightingData.java
        WeightingDataFactory.java
        WeightingStrategy.java
        liredemo
        ImagePanel.java
        IndexingThread.java
        LireDemoFrame.java
        Main.java
        ProgressMonitor.java
        SearchResultsTableModel.java
        flickr
        FlickrDownloadThread.java
        FlickrIndexingThread.java
        FlickrPhoto.java
        FlickrPhotoGrabber.java
        indexing
        MetadataBuilder.java
        ParallelIndexer.java
      - test
        java
        liredemo
        flickr
        FlickrPhotoGrabberTest.java
  - simpleapplication
    - src
      - main
        java
        net
        semanticmetadata
        lire
        sampleapp
        CreateARFFFile.java
        ExtractFeatures.java
        ExtractMultipleFeatures.java
        ExtractSingleFeature.java
        Indexer.java
        IndexingAndSearchWithLocalFeatures.java
        ParallelIndexing.java
        Searcher.java
  - teaching
    - src
      - main
        java
        samples
        Indexing.java
        Search.java
- src

/*
 * This file is part of the LIRE project: http://lire-project.net
 * LIRE is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * LIRE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with LIRE; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * We kindly ask you to refer to any one of the following publications in
 * any publication mentioning or employing Lire:
 *
 * Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval -
 * An Extensible Java CBIR Library. In proceedings of the 16th ACM International
 * Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
 * URL: http://doi.acm.org/10.1145/1459359.1459577
 *
 * Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
 * 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
 * Arizona, USA, 2011
 * URL: http://dl.acm.org/citation.cfm?id=2072432
 *
 * Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE
 * Morgan & Claypool, 2013
 * URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
 */

package net.semanticmetadata.lire.builders;

import net.semanticmetadata.lire.imageanalysis.features.GlobalFeature;
import net.semanticmetadata.lire.indexers.hashing.BitSampling;
import net.semanticmetadata.lire.indexers.hashing.LocalitySensitiveHashing;
import net.semanticmetadata.lire.indexers.hashing.MetricSpaces;
import net.semanticmetadata.lire.indexers.parallel.ExtractorItem;
import net.semanticmetadata.lire.utils.ImageUtils;
import net.semanticmetadata.lire.utils.SerializationUtils;
import org.apache.lucene.document.*;
import org.apache.lucene.util.BytesRef;

import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * This class creates Lucene Documents from images using one or multiple Global Features.
 * Can also be used only for extraction.
 * <p>
 * Extractors may be added until the first call to {@link #createDescriptorFields(BufferedImage)}
 * (directly or through {@link #createDocument(BufferedImage, String)}); after that
 * {@link #addExtractor(ExtractorItem)} throws an {@link UnsupportedOperationException}.
 *
 * @author Nektarios Anagnostopoulos, nek.anag@gmail.com (c) 2015 by Nektarios Anagnostopoulos
 * @author Mathias Lux, mathias@juggle.at
 */
public class GlobalDocumentBuilder implements DocumentBuilder {

    /**
     * Upper bound on the feature vector length for which hashes are generated. Longer vectors are
     * indexed without a hash field and a message is written to {@code System.err}.
     */
    private static final int MAX_HASHABLE_VECTOR_LENGTH = 3100;

    public enum HashingMode {BitSampling, LSH, MetricSpaces, None}

    private boolean useDocValues = false;
    private HashingMode hashingMode = HashingMode.BitSampling;
    private boolean hashingEnabled = false;

    // Maps each extractor to {feature field name, hash field name}.
    private final Map<ExtractorItem, String[]> extractorItems = new HashMap<ExtractorItem, String[]>(10);
    private boolean docsCreated = false;

    /**
     * Creates a builder without extractors, with hashing disabled and feature vectors kept in stored fields.
     */
    public GlobalDocumentBuilder() {
    }

    /**
     * Creates a builder without extractors that uses the default hashing mode (BitSampling).
     *
     * @param hashing true if you want hashing to be applied.
     */
    public GlobalDocumentBuilder(boolean hashing) {
        this(hashing, HashingMode.BitSampling, false);
    }

    /**
     * Creates a builder without extractors that uses the default hashing mode (BitSampling).
     *
     * @param hashing      true if you want hashing to be applied.
     * @param useDocValues set to true if you want to use DocValues instead of Lucene fields.
     */
    public GlobalDocumentBuilder(boolean hashing, boolean useDocValues) {
        this(hashing, HashingMode.BitSampling, useDocValues);
    }

    /**
     * Creates a GlobalDocumentBuilder with the specific hashing mode. Please note that you have to take care of the
     * initialization of the hashing subsystem yourself.
     *
     * @param hashing     true if you want hashing to be applied.
     * @param hashingMode the actual mode, eg. BitSampling or MetricSpaces.
     */
    public GlobalDocumentBuilder(boolean hashing, HashingMode hashingMode) {
        this(hashing, hashingMode, false);
    }

    /**
     * Creates a GlobalDocumentBuilder with the specific hashing mode. Please note that you have to take care of the
     * initialization of the hashing subsystem yourself. Optionally use DocValues instead of TextField implementations
     * for storing the feature vector. Note that this cannot be read by ordinary linear searchers, but must be
     * implemented in a different way.
     *
     * @param hashing      true if you want hashing to be applied.
     * @param hashingMode  the actual mode, eg. BitSampling or MetricSpaces.
     * @param useDocValues set to true if you want to use DocValues instead of Lucene fields.
     */
    public GlobalDocumentBuilder(boolean hashing, HashingMode hashingMode, boolean useDocValues) {
        this.hashingEnabled = hashing;
        this.hashingMode = hashingMode;
        this.useDocValues = useDocValues;
        if (hashingEnabled) {
            testHashes();
        }
    }

    /**
     * Creates a builder with a single extractor, hashing disabled.
     *
     * @param globalFeatureClass the global feature to extract.
     */
    public GlobalDocumentBuilder(Class<? extends GlobalFeature> globalFeatureClass) {
        this(globalFeatureClass, false, false);
    }

    /**
     * Creates a builder with a single extractor that uses the default hashing mode (BitSampling).
     *
     * @param globalFeatureClass the global feature to extract.
     * @param hashing            set to true if hashing should be performed.
     */
    public GlobalDocumentBuilder(Class<? extends GlobalFeature> globalFeatureClass, boolean hashing) {
        this(globalFeatureClass, hashing, false);
    }

    /**
     * Use DocValues instead of TextField implementations for storing the feature vector. Note that this cannot be
     * read by ordinary linear searchers, but must be implemented in a different way.
     *
     * @param globalFeatureClass the global feature to extract.
     * @param hashing            set to true if hashing should be performed.
     * @param useDocValues       set to true if you want to use DocValues instead of Lucene fields.
     */
    public GlobalDocumentBuilder(Class<? extends GlobalFeature> globalFeatureClass, boolean hashing, boolean useDocValues) {
        addExtractor(globalFeatureClass);
        this.useDocValues = useDocValues;
        this.hashingEnabled = hashing;
        if (hashingEnabled) {
            testHashes();
        }
    }

    /**
     * Creates a builder with a single extractor, hashing disabled.
     *
     * @param extractorItem the extractor to use; must wrap a global feature.
     */
    public GlobalDocumentBuilder(ExtractorItem extractorItem) {
        addExtractor(extractorItem);
    }

    /**
     * Can be used to add global extractors.
     *
     * @param globalFeatureClass the global feature class, wrapped into a new {@link ExtractorItem}.
     * @throws UnsupportedOperationException see {@link #addExtractor(ExtractorItem)}.
     */
    public void addExtractor(Class<? extends GlobalFeature> globalFeatureClass) {
        addExtractor(new ExtractorItem(globalFeatureClass));
    }

    /**
     * Can be used to add global extractors. The extractor is registered together with its feature field name
     * and the derived hash field name (field name plus {@link DocumentBuilder#HASH_FIELD_SUFFIX}).
     *
     * @param extractorItem the extractor to add.
     * @throws UnsupportedOperationException if documents have already been created with this builder, or if
     *                                       the item does not report itself as global.
     */
    public void addExtractor(ExtractorItem extractorItem) {
        if (docsCreated) {
            throw new UnsupportedOperationException("Cannot modify builder after documents have been created!");
        }
        if (!extractorItem.isGlobal()) {
            throw new UnsupportedOperationException("ExtractorItem must contain GlobalFeature");
        }

        String fieldName = extractorItem.getFieldName();
        extractorItems.put(extractorItem, new String[]{fieldName, fieldName + DocumentBuilder.HASH_FIELD_SUFFIX});
    }

    /**
     * Eagerly reads the BitSampling hash functions so that later hashing does not have to care about it.
     * This is done regardless of the configured {@link HashingMode}; the LSH hash functions are not read here.
     * A failure is reported on {@code System.err} and otherwise ignored.
     */
    private static void testHashes() {
        try {
            BitSampling.readHashFunctions();
        } catch (Exception e) {
            System.err.println("Could not read BitSampling hashes from file when first creating a GlobalDocumentBuilder instance.");
            e.printStackTrace();
        }
    }

    /**
     * Images are resized so as not to exceed the {@link DocumentBuilder#MAX_IMAGE_DIMENSION}, after that
     * the feature is extracted using the given globalFeature.
     *
     * @param image         is the image, must not be null.
     * @param globalFeature selected global feature
     * @return the input globalFeature
     * @throws NullPointerException if image is null.
     */
    public GlobalFeature extractGlobalFeature(BufferedImage image, GlobalFeature globalFeature) {
        // Explicit check instead of an assert: asserts are disabled by default, which left callers with a
        // message-less NullPointerException from image.getHeight().
        if (image == null) {
            throw new NullPointerException("image must not be null");
        }
        // Scaling image is especially with the correlogram features very important!
        // All images are scaled to guarantee a certain upper limit for indexing.
        BufferedImage scaled = image;
        if (Math.max(image.getHeight(), image.getWidth()) > DocumentBuilder.MAX_IMAGE_DIMENSION) {
            scaled = ImageUtils.scaleImage(image, DocumentBuilder.MAX_IMAGE_DIMENSION);
        }

        globalFeature.extract(scaled);
        return globalFeature;
    }

    /**
     * Extracts the global feature and returns the Lucene Fields for the selected image.
     *
     * @param image         is the selected image.
     * @param extractorItem is the extractor to be used to extract the features.
     * @param fieldNames    the names registered for the extractor: index 0 is the feature field,
     *                      index 1 the hash field.
     * @return the feature field, followed by the hash field if one was created.
     */
    private Field[] getGlobalDescriptorFields(BufferedImage image, ExtractorItem extractorItem, String[] fieldNames) {
        GlobalFeature globalFeature = extractGlobalFeature(image, (GlobalFeature) extractorItem.getExtractorInstance());
        BytesRef serializedFeature = new BytesRef(globalFeature.getByteArrayRepresentation());

        Field vector;
        if (useDocValues) {
            // Alternative: The DocValues field. It's extremely fast to read, but it's all in RAM most likely.
            vector = new BinaryDocValuesField(fieldNames[0], serializedFeature);
        } else {
            // TODO: Stored field is compressed and upon search decompression takes a lot of time (> 50% with a small index with 50k images). Find something else ...
            vector = new StoredField(fieldNames[0], serializedFeature);
        }

        Field hash = hashingEnabled ? createHashField(globalFeature, fieldNames[1]) : null;
        if (hash == null) {
            return new Field[]{vector};
        }
        return new Field[]{vector, hash};
    }

    /**
     * Creates the hash field for an already extracted feature according to the configured hashing mode.
     *
     * @param globalFeature the feature holding the extracted data.
     * @param hashFieldName the name of the hash field; it is set at "addExtractor" time.
     * @return the hash field, or null if no hash is created: the feature vector is too long, the mode is
     * MetricSpaces and the feature is not supported, or the mode is none of BitSampling, LSH, MetricSpaces.
     */
    private Field createHashField(GlobalFeature globalFeature, String hashFieldName) {
        int vectorLength = globalFeature.getFeatureVector().length;
        if (vectorLength > MAX_HASHABLE_VECTOR_LENGTH) {
            System.err.println("Could not create hashes, feature vector too long: " + vectorLength + " (" + globalFeature.getClass().getName() + ")");
            return null;
        }

        // TODO: check eventually if there is a more compressed string version of the integers. i.e. the hex string
        if (hashingMode == HashingMode.BitSampling) {
            int[] hashes = BitSampling.generateHashes(globalFeature.getFeatureVector());
            return new TextField(hashFieldName, SerializationUtils.arrayToString(hashes), Field.Store.YES);
        }
        if (hashingMode == HashingMode.LSH) {
            int[] hashes = LocalitySensitiveHashing.generateHashes(globalFeature.getFeatureVector());
            return new TextField(hashFieldName, SerializationUtils.arrayToString(hashes), Field.Store.YES);
        }
        if (hashingMode == HashingMode.MetricSpaces && MetricSpaces.supportsFeature(globalFeature)) {
            return new TextField(hashFieldName, MetricSpaces.generateHashString(globalFeature), Field.Store.YES);
        }
        return null;
    }

    /**
     * Runs every registered extractor on the image and collects the resulting fields. Calling this method
     * marks the builder as used, so no further extractors can be added afterwards.
     *
     * @param image the image to analyze.
     * @return Lucene Fields; an empty array if no extractor has been registered.
     */
    @Override
    public Field[] createDescriptorFields(BufferedImage image) {
        docsCreated = true;
        // Each extractor contributes at most two fields (feature vector and hash).
        List<Field> resultList = new ArrayList<Field>(extractorItems.size() * 2);
        for (Map.Entry<ExtractorItem, String[]> extractorItemEntry : extractorItems.entrySet()) {
            Field[] fields = getGlobalDescriptorFields(image, extractorItemEntry.getKey(), extractorItemEntry.getValue());
            Collections.addAll(resultList, fields);
        }

        return resultList.toArray(new Field[resultList.size()]);
    }

    /**
     * Creates a Lucene Document holding the descriptor fields of the image and, if given, the identifier
     * as a stored {@link StringField} named {@link DocumentBuilder#FIELD_NAME_IDENTIFIER}.
     *
     * @param image      the image to index. Cannot be NULL.
     * @param identifier an id for the image, for instance the filename or a URL. Can be NULL.
     * @return a Lucene Document.
     */
    @Override
    public Document createDocument(BufferedImage image, String identifier) {
        Document doc = new Document();

        if (identifier != null) {
            doc.add(new StringField(DocumentBuilder.FIELD_NAME_IDENTIFIER, identifier, Field.Store.YES));
        }

        for (Field field : createDescriptorFields(image)) {
            doc.add(field);
        }

        return doc;
    }
}