package net.semanticmetadata.lire.indexers.tools.text; import net.semanticmetadata.lire.builders.DocumentBuilder; import net.semanticmetadata.lire.imageanalysis.features.GlobalFeature; import net.semanticmetadata.lire.indexers.hashing.BitSampling; import net.semanticmetadata.lire.indexers.hashing.MetricSpaces; import net.semanticmetadata.lire.indexers.parallel.WorkItem; import net.semanticmetadata.lire.utils.CommandLineUtils; import net.semanticmetadata.lire.utils.LuceneUtils; import net.semanticmetadata.lire.utils.SerializationUtils; import org.apache.lucene.document.*; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexableField; import org.apache.lucene.util.BytesRef; import java.io.File; import java.io.IOException; import java.util.*; import java.util.concurrent.LinkedBlockingQueue; /** * Reading a file from {@link ParallelExtraction} and writing it to a lucene index. */ public class LuceneIndexWriter extends AbstractDocumentWriter { // -------------< static >------------------------ private static String helpMessage = "Usage of LuceneIndexWriter\n" + "==========================\n" + "\n" + "$> java LuceneIndexWriter -i <file> -o <index-directory> [-hb] [-hm]\n" + "\n" + "-i ... path to the input file\n" + "-o ... path to the Lucene index for output\n" + "-d ... use DocValues\n" + "-hb ... employ BitSampling Hashing (overrules MetricSpaces, loads all *.mds files from current directory)\n" + "-hm ... employ MetricSpaces Indexing"; // -------------< instance >------------------------ private IndexWriter iw; protected LinkedBlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>(500); List<Thread> threads; private int numThreads = 8; public LuceneIndexWriter(File infile, File indexDirectory, boolean doHashingBitSampling, boolean doMetricSpaceIndexing, boolean useDocValues) throws IOException { super(infile, useDocValues, doHashingBitSampling, doMetricSpaceIndexing); this.iw = LuceneUtils.createIndexWriter(indexDirectory.getPath(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer); } public static void main(String[] args) { Properties cmdLine = CommandLineUtils.getProperties(args, helpMessage, new String[]{"-i", "-o"}); File inputFile = new File(cmdLine.getProperty("-i")); File outputFile = new File(cmdLine.getProperty("-o")); if (!inputFile.exists()) { System.err.println("Input file does not exist."); System.out.println(helpMessage); System.exit(1); } else { try { LuceneIndexWriter liw = new LuceneIndexWriter(inputFile, outputFile, cmdLine.containsKey("-hb"), cmdLine.containsKey("-hm"), cmdLine.containsKey("-d")); Thread t = new Thread(liw); t.start(); t.join(); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } } /** * Called after the last line is read. */ protected void finishWriting() { try { for (int i = 0; i < 20; i++) { queue.put(new QueueItem(null, null)); } for (Iterator<Thread> iterator = threads.iterator(); iterator.hasNext(); ) { iterator.next().join(); } } catch (InterruptedException e) { e.printStackTrace(); } try { iw.commit(); iw.close(); } catch (IOException e) { e.printStackTrace(); } } @Override protected void startWriting() { threads = new LinkedList<>(); for (int i = 0; i < numThreads; i++) { Thread t = new Thread(new Consumer()); t.start(); threads.add(t); } } /** * Called for each line in the file. * * @param fileName the content of the first column * @param listOfFeatures the 2nd to nth column values already parsed. */ protected void write(String fileName, ArrayList<GlobalFeature> listOfFeatures) { // clone the features first: ArrayList<GlobalFeature> tmp = new ArrayList<>(listOfFeatures.size()); try { for (Iterator<GlobalFeature> iterator = listOfFeatures.iterator(); iterator.hasNext(); ) { GlobalFeature f = iterator.next(); GlobalFeature n = (GlobalFeature) f.getClass().newInstance(); n.setByteArrayRepresentation(f.getByteArrayRepresentation()); tmp.add(n); } queue.put(new QueueItem(fileName, tmp)); } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } class QueueItem { String id; List<GlobalFeature> features; public QueueItem(String id, List<GlobalFeature> features) { this.id = id; this.features = features; } } class Consumer implements Runnable { HashMap<String, Object> document = new HashMap<>(); @Override public void run() { try { QueueItem data = queue.take(); while (data.id != null) { document.clear(); document.put(DocumentBuilder.FIELD_NAME_IDENTIFIER, data.id); document.put("title", data.id); for (Iterator<GlobalFeature> iterator = data.features.iterator(); iterator.hasNext(); ) { GlobalFeature f = iterator.next(); document.put(f.getFieldName(), f.getByteArrayRepresentation()); if (doHashingBitSampling) { document.put(f.getFieldName() + DocumentBuilder.HASH_FIELD_SUFFIX, SerializationUtils.arrayToString((BitSampling.generateHashes(f.getFeatureVector())))); } else if (doMetricSpaceIndexing) { if (MetricSpaces.supportsFeature(f)) { document.put(f.getFieldName() + DocumentBuilder.HASH_FIELD_SUFFIX, MetricSpaces.generateHashString(f)); } } } output(document); data = queue.take(); } } catch (InterruptedException e) { e.printStackTrace(); } } private void output(HashMap<String, Object> document) { StringBuilder sb = new StringBuilder(); sb.append("<doc>"); for (Iterator<String> iterator = document.keySet().iterator(); iterator.hasNext(); ) { String fieldName = iterator.next(); sb.append("<field name=\"" + fieldName + "\">"); sb.append(document.get(fieldName)); sb.append("</field>"); } sb.append("</doc>\n"); LinkedList<IndexableField> fields = new LinkedList<>(); // add all the features. for (Iterator<String> iterator = document.keySet().iterator(); iterator.hasNext(); ) { String fieldName = iterator.next(); if (fieldName.startsWith("id") || fieldName.startsWith("title") || fieldName.startsWith(DocumentBuilder.FIELD_NAME_IDENTIFIER) ) { fields.add(new StringField(fieldName, (String) document.get(fieldName), Field.Store.YES)); } else if (fieldName.endsWith(DocumentBuilder.HASH_FIELD_SUFFIX)) { fields.add(new TextField(fieldName, (String) document.get(fieldName), Field.Store.NO)); } else { if (!useDocValues) { fields.add(new StoredField(fieldName, (byte[]) document.get(fieldName))); } else { // Alternative: The DocValues field. It's extremely fast to read, but it's all in RAM most likely. fields.add(new BinaryDocValuesField(fieldName, new BytesRef((byte[]) document.get(fieldName)))); } } } try { synchronized (iw) { iw.addDocument(fields); } } catch (IOException e) { e.printStackTrace(); } } } } /* Usage of LuceneIndexWriter ========================== $> java LuceneIndexWriter -i <file> -o <index-directory> [-hb] [-hm] -i ... path to the input file -o ... path to the Lucene index for output -d ... use DocValues -hb ... employ BitSampling Hashing (overrules MetricSpaces, loads all *.mds files from current directory) -hm ... employ MetricSpaces Indexing */