LuceneIndexWriter.java example

Explorer

lire-master
- samples
  - classifier
    - src
      - net
        semanticmetadata
        lire
        classifiers
        ClassifierTest.java
        HashingSearchBasedClassifierMod.java
  - liredemo
    - src
      - main
        java
        edu
        uniklu
        itec
        mosaix
        ImageFunctions.java
        engine
        Engine.java
        EngineObserver.java
        Experimental.java
        LeastUsedWeightingStrategy.java
        Logging.java
        ProportionWeightingStrategy.java
        RandomWeightingStrategy.java
        SimpleWeightingData.java
        SimpleWeightingDataFactory.java
        WeightingData.java
        WeightingDataFactory.java
        WeightingStrategy.java
        liredemo
        ImagePanel.java
        IndexingThread.java
        LireDemoFrame.java
        Main.java
        ProgressMonitor.java
        SearchResultsTableModel.java
        flickr
        FlickrDownloadThread.java
        FlickrIndexingThread.java
        FlickrPhoto.java
        FlickrPhotoGrabber.java
        indexing
        MetadataBuilder.java
        ParallelIndexer.java
      - test
        java
        liredemo
        flickr
        FlickrPhotoGrabberTest.java
  - simpleapplication
    - src
      - main
        java
        net
        semanticmetadata
        lire
        sampleapp
        CreateARFFFile.java
        ExtractFeatures.java
        ExtractMultipleFeatures.java
        ExtractSingleFeature.java
        Indexer.java
        IndexingAndSearchWithLocalFeatures.java
        ParallelIndexing.java
        Searcher.java
  - teaching
    - src
      - main
        java
        samples
        Indexing.java
        Search.java
- src

package net.semanticmetadata.lire.indexers.tools.text;

import net.semanticmetadata.lire.builders.DocumentBuilder;
import net.semanticmetadata.lire.imageanalysis.features.GlobalFeature;
import net.semanticmetadata.lire.indexers.hashing.BitSampling;
import net.semanticmetadata.lire.indexers.hashing.MetricSpaces;
import net.semanticmetadata.lire.indexers.parallel.WorkItem;
import net.semanticmetadata.lire.utils.CommandLineUtils;
import net.semanticmetadata.lire.utils.LuceneUtils;
import net.semanticmetadata.lire.utils.SerializationUtils;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;

import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Reading a file from {@link ParallelExtraction} and writing it to a lucene index.
 */
public class LuceneIndexWriter extends AbstractDocumentWriter {
    // -------------< static >------------------------
    private static String helpMessage = "Usage of LuceneIndexWriter\n" +
            "==========================\n" +
            "\n" +
            "$> java LuceneIndexWriter -i <file> -o <index-directory> [-hb] [-hm]\n" +
            "\n" +
            "-i  ... path to the input file\n" +
            "-o  ... path to the Lucene index for output\n" +
            "-d  ... use DocValues\n" +
            "-hb ... employ BitSampling Hashing (overrules MetricSpaces, loads all *.mds files from current directory)\n" +
            "-hm ... employ MetricSpaces Indexing";

    // -------------< instance >------------------------
    private IndexWriter iw;
    protected LinkedBlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>(500);
    List<Thread> threads;
    private int numThreads = 8;

    public LuceneIndexWriter(File infile, File indexDirectory, boolean doHashingBitSampling, boolean doMetricSpaceIndexing, boolean useDocValues) throws IOException {
        super(infile, useDocValues, doHashingBitSampling, doMetricSpaceIndexing);
        this.iw = LuceneUtils.createIndexWriter(indexDirectory.getPath(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    }

    public static void main(String[] args) {
        Properties cmdLine = CommandLineUtils.getProperties(args, helpMessage, new String[]{"-i", "-o"});
        File inputFile = new File(cmdLine.getProperty("-i"));
        File outputFile = new File(cmdLine.getProperty("-o"));
        if (!inputFile.exists()) {
            System.err.println("Input file does not exist.");
            System.out.println(helpMessage);
            System.exit(1);
        } else {
            try {
                LuceneIndexWriter liw = new LuceneIndexWriter(inputFile, outputFile, cmdLine.containsKey("-hb"), cmdLine.containsKey("-hm"), cmdLine.containsKey("-d"));
                Thread t = new Thread(liw);
                t.start();
                t.join();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

    }

    /**
     * Called after the last line is read.
     */
    protected void finishWriting() {
        try {
            for (int i = 0; i < 20; i++) {
                queue.put(new QueueItem(null, null));
            }
            for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext(); ) {
                iterator.next().join();
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        try {
            iw.commit();
            iw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void startWriting() {
        threads = new LinkedList<>();
        for (int i = 0; i < numThreads; i++) {
            Thread t = new Thread(new Consumer());
            t.start();
            threads.add(t);
        }
    }

    /**
     * Called for each line in the file.
     *
     * @param fileName       the content of the first column
     * @param listOfFeatures the 2nd to nth column values already parsed.
     */
    protected void write(String fileName, ArrayList<GlobalFeature> listOfFeatures) {
        // clone the features first:
        ArrayList<GlobalFeature> tmp = new ArrayList<>(listOfFeatures.size());
        try {
            for (Iterator<GlobalFeature> iterator = listOfFeatures.iterator(); iterator.hasNext(); ) {
                GlobalFeature f = iterator.next();
                GlobalFeature n = (GlobalFeature) f.getClass().newInstance();
                n.setByteArrayRepresentation(f.getByteArrayRepresentation());
                tmp.add(n);
            }
            queue.put(new QueueItem(fileName, tmp));
        } catch (InstantiationException e) {
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    class QueueItem {
        String id;
        List<GlobalFeature> features;

        public QueueItem(String id, List<GlobalFeature> features) {
            this.id = id;
            this.features = features;
        }
    }

    class Consumer implements Runnable {
        HashMap<String, Object> document = new HashMap<>();

        @Override
        public void run() {
            try {
                QueueItem data = queue.take();
                while (data.id != null) {
                    document.clear();
                    document.put(DocumentBuilder.FIELD_NAME_IDENTIFIER, data.id);
                    document.put("title", data.id);
                    for (Iterator<GlobalFeature> iterator = data.features.iterator(); iterator.hasNext(); ) {
                        GlobalFeature f = iterator.next();
                        document.put(f.getFieldName(), f.getByteArrayRepresentation());
                        if (doHashingBitSampling) {
                            document.put(f.getFieldName() + DocumentBuilder.HASH_FIELD_SUFFIX,
                                    SerializationUtils.arrayToString((BitSampling.generateHashes(f.getFeatureVector()))));

                        } else if (doMetricSpaceIndexing) {
                            if (MetricSpaces.supportsFeature(f)) {
                                document.put(f.getFieldName() + DocumentBuilder.HASH_FIELD_SUFFIX,
                                        MetricSpaces.generateHashString(f));
                            }

                        }
                    }
                    output(document);
                    data = queue.take();
                }
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }

        private void output(HashMap<String, Object> document) {
            StringBuilder sb = new StringBuilder();
            sb.append("<doc>");
            for (Iterator<String> iterator = document.keySet().iterator(); iterator.hasNext(); ) {
                String fieldName = iterator.next();
                sb.append("<field name=\"" + fieldName + "\">");
                sb.append(document.get(fieldName));
                sb.append("</field>");
            }
            sb.append("</doc>\n");
            LinkedList<IndexableField> fields = new LinkedList<>();
            // add all the features.
            for (Iterator<String> iterator = document.keySet().iterator(); iterator.hasNext(); ) {
                String fieldName = iterator.next();
                if (fieldName.startsWith("id") || fieldName.startsWith("title") || fieldName.startsWith(DocumentBuilder.FIELD_NAME_IDENTIFIER) ) {
                    fields.add(new StringField(fieldName, (String) document.get(fieldName), Field.Store.YES));
                } else if (fieldName.endsWith(DocumentBuilder.HASH_FIELD_SUFFIX)) {
                    fields.add(new TextField(fieldName, (String) document.get(fieldName), Field.Store.NO));
                } else {
                    if (!useDocValues) {
                        fields.add(new StoredField(fieldName, (byte[]) document.get(fieldName)));
                    } else {
                        // Alternative: The DocValues field. It's extremely fast to read, but it's all in RAM most likely.
                        fields.add(new BinaryDocValuesField(fieldName, new BytesRef((byte[]) document.get(fieldName))));
                    }
                }
            }
            try {
                synchronized (iw) {
                    iw.addDocument(fields);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
/*
Usage of LuceneIndexWriter
==========================

$> java LuceneIndexWriter -i <file> -o <index-directory> [-hb] [-hm]

-i  ... path to the input file
-o  ... path to the Lucene index for output
-d  ... use DocValues
-hb ... employ BitSampling Hashing (overrules MetricSpaces, loads all *.mds files from current directory)
-hm ... employ MetricSpaces Indexing
 */