package com.senseidb.clue.commands;

import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

import com.senseidb.clue.ClueContext;

public class DocSetInfoCommand extends ClueCommand {

  private static final int DEFAULT_BUCKET_SIZE = 1000;

  private static final double[] PERCENTILES = new double[] { 50.0, 75.0, 90.0, 95.0, 99.0 };

  public DocSetInfoCommand(ClueContext ctx) {
    super(ctx);
  }

  @Override
  public String getName() {
    return "docsetinfo";
  }

  @Override
  public String help() {
    return "doc id set info and stats";
  }

  @Override
  public void execute(String[] args, PrintStream out) throws Exception {
    String field = null;
    String termVal = null;
    int bucketSize = DEFAULT_BUCKET_SIZE;

    if (args.length > 0) {
      field = args[0];
    }
    if (args.length > 1) {
      try {
        bucketSize = Integer.parseInt(args[1]);
      } catch (NumberFormatException e) {
        // keep the default bucket size if the argument is not a number
      }
    }

    // the first argument is expected in the form field:term
    if (field != null) {
      String[] parts = field.split(":");
      if (parts.length > 1) {
        field = parts[0];
        termVal = parts[1];
      }
    }

    if (field == null || termVal == null) {
      out.println("usage: field:term");
      out.flush();
      return;
    }

    IndexReader reader = ctx.getIndexReader();
    List<LeafReaderContext> leaves = reader.leaves();

    PostingsEnum postingsEnum = null;
    for (LeafReaderContext leaf : leaves) {
      LeafReader atomicReader = leaf.reader();
      Terms terms = atomicReader.terms(field);
      if (terms == null) {
        continue;
      }

      TermsEnum te = terms.iterator();
      if (te.seekExact(new BytesRef(termVal))) {
        postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);

        int docFreq = te.docFreq();

        int minDocId = -1, maxDocId = -1;
        int doc, count = 0;

        int[] percentDocs = new int[PERCENTILES.length];
        int percentileIdx = 0;

        // first pass: track min/max doc ids and the doc id reached at each percentile of the posting list
        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          maxDocId = doc;
          if (minDocId == -1) {
            minDocId = doc;
          }
          count++;

          double perDocs = (double) count / (double) docFreq * 100.0;
          while (percentileIdx < percentDocs.length) {
            if (perDocs > PERCENTILES[percentileIdx]) {
              percentDocs[percentileIdx] = doc;
              percentileIdx++;
            } else {
              break;
            }
          }
        }

        // second pass: histogram of doc id counts per bucket
        int[] buckets = null;
        if (maxDocId > 0) {
          buckets = new int[maxDocId / bucketSize + 1];
          postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
          while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int bucketIdx = doc / bucketSize;
            buckets[bucketIdx]++;
          }
        }

        double density = (double) docFreq / (double) (maxDocId - minDocId);

        out.println(String.format("min: %d, max: %d, count: %d, density: %.2f",
            minDocId, maxDocId, docFreq, density));
        out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
        out.println("histogram: (bucketsize=" + bucketSize + ")");
        out.println(Arrays.toString(buckets));
      }
    }
  }
}