package com.senseidb.clue.commands;

import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

import com.senseidb.clue.ClueContext;

public class DocSetInfoCommand extends ClueCommand {

  private static final int DEFAULT_BUCKET_SIZE = 1000;

  private static final double[] PERCENTILES = new double[] { 50.0, 75.0, 90.0, 95.0, 99.0 };

  public DocSetInfoCommand(ClueContext ctx) {
    super(ctx);
  }

  @Override
  public String getName() {
    return "docsetinfo";
  }

  @Override
  public String help() {
    return "doc id set info and stats";
  }

  @Override
  public void execute(String[] args, PrintStream out) throws Exception {
    String field = null;
    String termVal = null;
    int bucketSize = DEFAULT_BUCKET_SIZE;

    if (args.length > 0) {
      field = args[0];
    }
    if (args.length > 1) {
      try {
        bucketSize = Integer.parseInt(args[1]);
      } catch (NumberFormatException e) {
        // keep the default bucket size if the argument is not a number
      }
    }

    // the first argument is expected in the form field:term
    if (field != null) {
      String[] parts = field.split(":");
      if (parts.length > 1) {
        field = parts[0];
        termVal = parts[1];
      }
    }

    if (field == null || termVal == null) {
      out.println("usage: field:term");
      out.flush();
      return;
    }

    IndexReader reader = ctx.getIndexReader();
    List<LeafReaderContext> leaves = reader.leaves();

    PostingsEnum postingsEnum = null;
    for (LeafReaderContext leaf : leaves) {
      LeafReader atomicReader = leaf.reader();
      Terms terms = atomicReader.terms(field);
      if (terms == null) {
        continue;
      }

      TermsEnum te = terms.iterator();
      if (te.seekExact(new BytesRef(termVal))) {
        postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);

        int docFreq = te.docFreq();

        int minDocId = -1, maxDocId = -1;
        int doc, count = 0;

        int[] percentDocs = new int[PERCENTILES.length];
        int percentileIdx = 0;

        // first pass: track min/max doc ids and the doc id reached at each percentile of the posting list
        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          maxDocId = doc;
          if (minDocId == -1) {
            minDocId = doc;
          }
          count++;

          double perDocs = (double) count / (double) docFreq * 100.0;
          while (percentileIdx < percentDocs.length) {
            if (perDocs > PERCENTILES[percentileIdx]) {
              percentDocs[percentileIdx] = doc;
              percentileIdx++;
            } else {
              break;
            }
          }
        }

        // second pass: histogram of doc id counts per bucket
        int[] buckets = null;
        if (maxDocId > 0) {
          buckets = new int[maxDocId / bucketSize + 1];
          postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
          while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int bucketIdx = doc / bucketSize;
            buckets[bucketIdx]++;
          }
        }

        double density = (double) docFreq / (double) (maxDocId - minDocId);

        out.println(String.format("min: %d, max: %d, count: %d, density: %.2f",
            minDocId, maxDocId, docFreq, density));
        out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
        out.println("histogram: (bucketsize=" + bucketSize + ")");
        out.println(Arrays.toString(buckets));
      }
    }
  }
}