FieldCache.java example

Explorer
heliosearch-master
- lucene
- solr
package org.apache.lucene.search;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.PrintStream;

import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Expert: Maintains caches of term values.
 *
 * <p>Created: May 19, 2004 11:13:14 AM
 *
 * @since   lucene 1.4
 * @see org.apache.lucene.util.FieldCacheSanityChecker
 *
 * @lucene.internal
 */
public interface FieldCache {

  /** Per-document field values exposed as 32-bit signed integers. */
  public static abstract class Ints {
    /** Shared instance that reports {@code 0} for every document. */
    public static final Ints EMPTY = new Ints() {
      @Override
      public int get(int unusedDocID) {
        // The argument is deliberately ignored: every document maps to zero.
        return 0;
      }
    };

    /**
     * Returns the integer representation of this field's value.
     *
     * @param docID the document whose value is requested
     */
    public abstract int get(int docID);
  }

  /** Per-document field values exposed as 64-bit signed long integers. */
  public static abstract class Longs {
    /** Shared instance that reports {@code 0} for every document. */
    public static final Longs EMPTY = new Longs() {
      @Override
      public long get(int unusedDocID) {
        // The argument is deliberately ignored: every document maps to zero.
        return 0L;
      }
    };

    /**
     * Returns the long representation of this field's value.
     *
     * @param docID the document whose value is requested
     */
    public abstract long get(int docID);
  }

  /** Per-document field values exposed as 32-bit floats. */
  public static abstract class Floats {
    /** Shared instance that reports {@code 0} for every document. */
    public static final Floats EMPTY = new Floats() {
      @Override
      public float get(int unusedDocID) {
        // The argument is deliberately ignored: every document maps to zero.
        return 0.0f;
      }
    };

    /**
     * Returns the float representation of this field's value.
     *
     * @param docID the document whose value is requested
     */
    public abstract float get(int docID);
  }

  /** Per-document field values exposed as 64-bit doubles. */
  public static abstract class Doubles {
    /** Shared instance that reports {@code 0} for every document. */
    public static final Doubles EMPTY = new Doubles() {
      @Override
      public double get(int unusedDocID) {
        // The argument is deliberately ignored: every document maps to zero.
        return 0.0d;
      }
    };

    /**
     * Returns the double representation of this field's value.
     *
     * @param docID the document whose value is requested
     */
    public abstract double get(int docID);
  }

  /**
   * Placeholder indicating creation of this cache is currently in-progress.
   *
   * <p>NOTE(review): presumably an instance stands in for a cache entry while
   * the entry is being computed and {@code value} is filled in afterwards; the
   * code that does so is not part of this file -- confirm against the cache
   * implementation.
   */
  public static final class CreationPlaceholder {
    // Package-private and mutable. Nothing is assigned here, so it starts
    // out as null.
    Object value;
  }

  /**
   * Common super-interface of all parsers ({@link IntParser},
   * {@link FloatParser}, {@link LongParser} and {@link DoubleParser}). It
   * is used to specify a custom parser to {@link
   * SortField#SortField(String, FieldCache.Parser)}.
   */
  public interface Parser {

    /**
     * Pulls a {@link TermsEnum} from the given {@link Terms}. This method allows certain parsers
     * to filter the actual TermsEnum before the field cache is filled.
     *
     * @param terms the {@link Terms} instance to create the {@link TermsEnum} from.
     * @return a possibly filtered {@link TermsEnum} instance; this method must not return <code>null</code>.
     * @throws IOException if an {@link IOException} occurs
     */
    public TermsEnum termsEnum(Terms terms) throws IOException;
  }

  /** Interface to parse ints from document fields.
   * @see FieldCache#getInts(AtomicReader, String, FieldCache.IntParser, boolean)
   */
  public interface IntParser extends Parser {
    /**
     * Returns an integer representation of this field's value.
     *
     * @param term the term to convert
     */
    public int parseInt(BytesRef term);
  }

  /** Interface to parse floats from document fields.
   * @see FieldCache#getFloats(AtomicReader, String, FieldCache.FloatParser, boolean)
   */
  public interface FloatParser extends Parser {
    /**
     * Returns a float representation of this field's value.
     *
     * @param term the term to convert
     */
    public float parseFloat(BytesRef term);
  }

  /** Interface to parse longs from document fields.
   * @see FieldCache#getLongs(AtomicReader, String, FieldCache.LongParser, boolean)
   */
  public interface LongParser extends Parser {
    /**
     * Returns a long representation of this field's value.
     *
     * @param term the term to convert
     */
    public long parseLong(BytesRef term);
  }

  /** Interface to parse doubles from document fields.
   * @see FieldCache#getDoubles(AtomicReader, String, FieldCache.DoubleParser, boolean)
   */
  public interface DoubleParser extends Parser {
    /**
     * Returns a double representation of this field's value.
     *
     * @param term the term to convert
     */
    public double parseDouble(BytesRef term);
  }

  /**
   * Expert: The cache used internally by sorting and range query classes.
   *
   * <p>Declared {@code final} explicitly, like the parser constants of this
   * interface. Interface fields are implicitly {@code public static final},
   * so the modifier does not change the field; it only makes visible that
   * the reference cannot be reassigned.
   */
  public static final FieldCache DEFAULT = new FieldCacheImpl();

  /**
   * Parser for int values that were encoded with {@link NumericUtils}, for example
   * fields indexed through {@link IntField} or {@link NumericTokenStream}.
   */
  public static final IntParser NUMERIC_UTILS_INT_PARSER = new IntParser() {
    // Passes the field's terms through NumericUtils.filterPrefixCodedInts
    // before the cache is filled.
    @Override
    public TermsEnum termsEnum(Terms terms) throws IOException {
      final TermsEnum unfiltered = terms.iterator(null);
      return NumericUtils.filterPrefixCodedInts(unfiltered);
    }

    // Decodes a prefix-coded term into its int value.
    @Override
    public int parseInt(BytesRef term) {
      return NumericUtils.prefixCodedToInt(term);
    }

    // Names this parser after the constant that holds it.
    @Override
    public String toString() {
      final String owner = FieldCache.class.getName();
      return owner + ".NUMERIC_UTILS_INT_PARSER";
    }
  };

  /**
   * Parser for float values that were encoded with {@link NumericUtils}, for example
   * fields indexed through {@link FloatField} or {@link NumericTokenStream}.
   */
  public static final FloatParser NUMERIC_UTILS_FLOAT_PARSER = new FloatParser() {
    // Passes the field's terms through NumericUtils.filterPrefixCodedInts
    // before the cache is filled.
    @Override
    public TermsEnum termsEnum(Terms terms) throws IOException {
      final TermsEnum unfiltered = terms.iterator(null);
      return NumericUtils.filterPrefixCodedInts(unfiltered);
    }

    // Two-step decode: prefix-coded term -> sortable int -> float.
    @Override
    public float parseFloat(BytesRef term) {
      final int sortableBits = NumericUtils.prefixCodedToInt(term);
      return NumericUtils.sortableIntToFloat(sortableBits);
    }

    // Names this parser after the constant that holds it.
    @Override
    public String toString() {
      final String owner = FieldCache.class.getName();
      return owner + ".NUMERIC_UTILS_FLOAT_PARSER";
    }
  };

  /**
   * Parser for long values that were encoded with {@link NumericUtils}, for example
   * fields indexed through {@link LongField} or {@link NumericTokenStream}.
   */
  public static final LongParser NUMERIC_UTILS_LONG_PARSER = new LongParser() {
    // Passes the field's terms through NumericUtils.filterPrefixCodedLongs
    // before the cache is filled.
    @Override
    public TermsEnum termsEnum(Terms terms) throws IOException {
      final TermsEnum unfiltered = terms.iterator(null);
      return NumericUtils.filterPrefixCodedLongs(unfiltered);
    }

    // Decodes a prefix-coded term into its long value.
    @Override
    public long parseLong(BytesRef term) {
      return NumericUtils.prefixCodedToLong(term);
    }

    // Names this parser after the constant that holds it.
    @Override
    public String toString() {
      final String owner = FieldCache.class.getName();
      return owner + ".NUMERIC_UTILS_LONG_PARSER";
    }
  };

  /**
   * Parser for double values that were encoded with {@link NumericUtils}, for example
   * fields indexed through {@link DoubleField} or {@link NumericTokenStream}.
   */
  public static final DoubleParser NUMERIC_UTILS_DOUBLE_PARSER = new DoubleParser() {
    // Passes the field's terms through NumericUtils.filterPrefixCodedLongs
    // before the cache is filled.
    @Override
    public TermsEnum termsEnum(Terms terms) throws IOException {
      final TermsEnum unfiltered = terms.iterator(null);
      return NumericUtils.filterPrefixCodedLongs(unfiltered);
    }

    // Two-step decode: prefix-coded term -> sortable long -> double.
    @Override
    public double parseDouble(BytesRef term) {
      final long sortableBits = NumericUtils.prefixCodedToLong(term);
      return NumericUtils.sortableLongToDouble(sortableBits);
    }

    // Names this parser after the constant that holds it.
    @Override
    public String toString() {
      final String owner = FieldCache.class.getName();
      return owner + ".NUMERIC_UTILS_DOUBLE_PARSER";
    }
  };
  
  /** Checks the internal cache for an appropriate entry, and if none is found,
   *  reads the terms in <code>field</code> and returns a bit set the size of
   *  <code>reader.maxDoc()</code>, with a bit turned on for each docid that
   *  does have a value for this field.
   *
   * @param reader  Used to get field values.
   * @param field   Which field to look for values in.
   * @return a {@link Bits} marking the documents that have a value for the field.
   * @throws IOException  If any error occurs.
   */
  public Bits getDocsWithField(AtomicReader reader, String field) throws IOException;

  /**
   * Returns an {@link Ints} over the values found in documents in the given
   * field. This overload takes no parser argument.
   *
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the ints.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @throws IOException
   *           If any error occurs.
   * @see #getInts(AtomicReader, String, IntParser, boolean)
   */
  public Ints getInts(AtomicReader reader, String field, boolean setDocsWithField) throws IOException;

  /**
   * Returns an {@link Ints} over the values found in documents in the given
   * field. If the field was indexed as {@link NumericDocValuesField}, it simply
   * uses {@link AtomicReader#getNumericDocValues(String)} to read the values.
   * Otherwise, it checks the internal cache for an appropriate entry, and if
   * none is found, reads the terms in <code>field</code> as ints and returns
   * an {@link Ints} giving the value each of the <code>reader.maxDoc()</code>
   * documents has in the given field.
   * 
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the ints.
   * @param parser
   *          Computes int for string values. May be {@code null} if the
   *          requested field was indexed as {@link NumericDocValuesField} or
   *          {@link IntField}.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @return The values in the given field for each document.
   * @throws IOException
   *           If any error occurs.
   */
  public Ints getInts(AtomicReader reader, String field, IntParser parser, boolean setDocsWithField) throws IOException;

  /**
   * Returns a {@link Floats} over the values found in documents in the given
   * field. This overload takes no parser argument.
   *
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the floats.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @throws IOException
   *           If any error occurs.
   * @see #getFloats(AtomicReader, String, FloatParser, boolean)
   */
  public Floats getFloats(AtomicReader reader, String field, boolean setDocsWithField) throws IOException;

  /**
   * Returns a {@link Floats} over the values found in documents in the given
   * field. If the field was indexed as {@link NumericDocValuesField}, it simply
   * uses {@link AtomicReader#getNumericDocValues(String)} to read the values.
   * Otherwise, it checks the internal cache for an appropriate entry, and if
   * none is found, reads the terms in <code>field</code> as floats and returns
   * a {@link Floats} giving the value each of the <code>reader.maxDoc()</code>
   * documents has in the given field.
   * 
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the floats.
   * @param parser
   *          Computes float for string values. May be {@code null} if the
   *          requested field was indexed as {@link NumericDocValuesField} or
   *          {@link FloatField}.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @return The values in the given field for each document.
   * @throws IOException
   *           If any error occurs.
   */
  public Floats getFloats(AtomicReader reader, String field, FloatParser parser, boolean setDocsWithField) throws IOException;

  /**
   * Returns a {@link Longs} over the values found in documents in the given
   * field. This overload takes no parser argument.
   *
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the longs.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @throws IOException
   *           If any error occurs.
   * @see #getLongs(AtomicReader, String, LongParser, boolean)
   */
  public Longs getLongs(AtomicReader reader, String field, boolean setDocsWithField) throws IOException;

  /**
   * Returns a {@link Longs} over the values found in documents in the given
   * field. If the field was indexed as {@link NumericDocValuesField}, it simply
   * uses {@link AtomicReader#getNumericDocValues(String)} to read the values.
   * Otherwise, it checks the internal cache for an appropriate entry, and if
   * none is found, reads the terms in <code>field</code> as longs and returns
   * a {@link Longs} giving the value each of the <code>reader.maxDoc()</code>
   * documents has in the given field.
   * 
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the longs.
   * @param parser
   *          Computes long for string values. May be {@code null} if the
   *          requested field was indexed as {@link NumericDocValuesField} or
   *          {@link LongField}.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @return The values in the given field for each document.
   * @throws IOException
   *           If any error occurs.
   */
  public Longs getLongs(AtomicReader reader, String field, LongParser parser, boolean setDocsWithField) throws IOException;

  /**
   * Returns a {@link Doubles} over the values found in documents in the given
   * field. This overload takes no parser argument.
   *
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the doubles.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @throws IOException
   *           If any error occurs.
   * @see #getDoubles(AtomicReader, String, DoubleParser, boolean)
   */
  public Doubles getDoubles(AtomicReader reader, String field, boolean setDocsWithField) throws IOException;

  /**
   * Returns a {@link Doubles} over the values found in documents in the given
   * field. If the field was indexed as {@link NumericDocValuesField}, it simply
   * uses {@link AtomicReader#getNumericDocValues(String)} to read the values.
   * Otherwise, it checks the internal cache for an appropriate entry, and if
   * none is found, reads the terms in <code>field</code> as doubles and returns
   * a {@link Doubles} giving the value each of the <code>reader.maxDoc()</code>
   * documents has in the given field.
   * 
   * @param reader
   *          Used to get field values.
   * @param field
   *          Which field contains the doubles.
   * @param parser
   *          Computes double for string values. May be {@code null} if the
   *          requested field was indexed as {@link NumericDocValuesField} or
   *          {@link DoubleField}.
   * @param setDocsWithField
   *          If true then {@link #getDocsWithField} will also be computed and
   *          stored in the FieldCache.
   * @return The values in the given field for each document.
   * @throws IOException
   *           If any error occurs.
   */
  public Doubles getDoubles(AtomicReader reader, String field, DoubleParser parser, boolean setDocsWithField) throws IOException;

  /** Checks the internal cache for an appropriate entry, and if none
   * is found, reads the term values in <code>field</code>
   * and returns a {@link BinaryDocValues} instance, providing a
   * method to retrieve the term (as a {@link BytesRef}) per document.
   * @param reader  Used to get field values.
   * @param field   Which field contains the strings.
   * @param setDocsWithField  If true then {@link #getDocsWithField} will
   *        also be computed and stored in the FieldCache.
   * @return The values in the given field for each document.
   * @throws IOException  If any error occurs.
   */
  public BinaryDocValues getTerms(AtomicReader reader, String field, boolean setDocsWithField) throws IOException;

  /** Expert: just like {@link #getTerms(AtomicReader,String,boolean)},
   *  but you can specify, via <code>acceptableOverheadRatio</code>, how much
   *  more RAM may be consumed in exchange for faster lookups.  Note that the
   *  first call for a given reader and field "wins",
   *  subsequent calls will share the same cache entry.
   *
   *  <p>NOTE(review): this comment used to state 'default is "true"', which
   *  does not fit a <code>float</code> parameter. The ratio applied by the
   *  three-argument overload is not visible in this file -- confirm against
   *  the implementation. */
  public BinaryDocValues getTerms(AtomicReader reader, String field, boolean setDocsWithField, float acceptableOverheadRatio) throws IOException;

  /** Checks the internal cache for an appropriate entry, and if none
   * is found, reads the term values in <code>field</code>
   * and returns a {@link SortedDocValues} instance,
   * providing methods to retrieve sort ordinals and terms
   * (as a {@link BytesRef}) per document.
   * @param reader  Used to get field values.
   * @param field   Which field contains the strings.
   * @return The values in the given field for each document.
   * @throws IOException  If any error occurs.
   */
  public SortedDocValues getTermsIndex(AtomicReader reader, String field) throws IOException;

  /** Expert: just like {@link
   *  #getTermsIndex(AtomicReader,String)}, but you can specify,
   *  via <code>acceptableOverheadRatio</code>, how much more RAM
   *  may be consumed in exchange for faster lookups.  Note that the
   *  first call for a given reader and field "wins",
   *  subsequent calls will share the same cache entry.
   *
   *  <p>NOTE(review): this comment used to state 'default is "true"', which
   *  does not fit a <code>float</code> parameter. The ratio applied by the
   *  two-argument overload is not visible in this file -- confirm against
   *  the implementation. */
  public SortedDocValues getTermsIndex(AtomicReader reader, String field, float acceptableOverheadRatio) throws IOException;

  /**
   * Checks the internal cache for an appropriate entry, and if none is found, reads the term values
   * in <code>field</code> and returns a {@link SortedSetDocValues} instance, providing a method to retrieve
   * the terms (as ords) per document.
   *
   * <p>NOTE(review): the method name and the earlier wording of this comment suggest the result is
   * backed by a {@link DocTermOrds}; that is not visible in this file -- confirm against the
   * implementation.
   *
   * @param reader  Used to read the term values.
   * @param field   Which field contains the strings.
   * @return a {@link SortedSetDocValues} instance
   * @throws IOException  If any error occurs.
   */
  public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field) throws IOException;

  /**
   * EXPERT: A unique Identifier/Description for each item in the FieldCache. 
   * Can be useful for logging/debugging.
   *
   * <p>All descriptive fields are set once by the constructor; only the size
   * estimate changes, and only when {@link #estimateSize()} is called.
   * @lucene.experimental
   */
  public final class CacheEntry {

    private final Object readerKey;
    private final String fieldName;
    private final Class<?> cacheType;
    private final Object custom;
    private final Object value;
    // Human-readable size estimate; stays null until estimateSize() runs.
    private String size;

    /**
     * Creates a description of one cached item. The arguments are stored
     * as given, without validation or copying.
     *
     * @param readerKey the key of the reader the item is cached for
     * @param fieldName the name of the cached field
     * @param cacheType the type of cache the item lives in
     * @param custom    an additional, cache-specific object
     * @param value     the cached value itself
     */
    public CacheEntry(Object readerKey, String fieldName,
                      Class<?> cacheType,
                      Object custom,
                      Object value) {
      this.readerKey = readerKey;
      this.fieldName = fieldName;
      this.cacheType = cacheType;
      this.custom = custom;
      this.value = value;
    }

    /** Returns the reader key passed to the constructor. */
    public Object getReaderKey() {
      return readerKey;
    }

    /** Returns the field name passed to the constructor. */
    public String getFieldName() {
      return fieldName;
    }

    /** Returns the cache type passed to the constructor. */
    public Class<?> getCacheType() {
      return cacheType;
    }

    /** Returns the custom object passed to the constructor. */
    public Object getCustom() {
      return custom;
    }

    /** Returns the cached value passed to the constructor. */
    public Object getValue() {
      return value;
    }

    /** 
     * Computes (and stores) the estimated size of the cache Value 
     * @see #getEstimatedSize
     */
    public void estimateSize() {
      long bytesUsed = RamUsageEstimator.sizeOf(getValue());
      size = RamUsageEstimator.humanReadableUnits(bytesUsed);
    }

    /**
     * The most recently estimated size of the value, null unless 
     * estimateSize has been called.
     */
    public String getEstimatedSize() {
      return size;
    }
    
    /**
     * Renders the entry as
     * {@code 'readerKey'=>'fieldName',cacheType,custom=>valueClass#identityHash},
     * followed by {@code  (size =~ ...)} once a size estimate is available.
     * A null value is rendered as {@code null#0} instead of throwing.
     */
    @Override
    public String toString() {
      // The constructor accepts a null value, so the value must not be
      // dereferenced unguarded: a debugging aid should never throw.
      final Object v = getValue();
      final String valueType = (v == null) ? "null" : v.getClass().getName();

      StringBuilder b = new StringBuilder();
      b.append("'").append(getReaderKey()).append("'=>");
      b.append("'").append(getFieldName()).append("',");
      b.append(getCacheType()).append(",").append(getCustom());
      b.append("=>").append(valueType).append("#");
      // System.identityHashCode returns 0 for a null reference.
      b.append(System.identityHashCode(v));
      
      String s = getEstimatedSize();
      if(null != s) {
        b.append(" (size =~ ").append(s).append(')');
      }

      return b.toString();
    }
  }
  
  /**
   * EXPERT: Generates an array of CacheEntry objects representing all items 
   * currently in the FieldCache.
   * <p>
   * NOTE: These CacheEntry objects maintain a strong reference to the 
   * Cached Values.  Holding on to a CacheEntry after the {@link AtomicReader}
   * associated with it has been garbage collected will prevent the Value itself
   * from being garbage collected when the Cache drops the WeakReference.
   * </p>
   * @return one {@link CacheEntry} per item currently in the FieldCache.
   * @lucene.experimental
   */
  public CacheEntry[] getCacheEntries();

  /**
   * <p>
   * EXPERT: Instructs the FieldCache to forcibly expunge all entries 
   * from the underlying caches.  This is intended only to be used for 
   * test methods as a way to ensure a known base state of the Cache 
   * (without needing to rely on GC to free WeakReferences).  
   * It should not be relied on for "Cache maintenance" in general 
   * application code.
   * </p>
   * @lucene.experimental
   */
  public void purgeAllCaches();

  /**
   * Expert: drops all cache entries associated with this
   * reader core cache key (see {@link IndexReader#getCoreCacheKey}).
   * NOTE: this cache key must
   * precisely match the reader that the cache entry is
   * keyed on. If you pass a top-level reader, it usually
   * will have no effect as Lucene now caches at the segment
   * reader level.
   *
   * @param coreCacheKey the core cache key whose entries should be dropped.
   */
  public void purgeByCacheKey(Object coreCacheKey);

  /**
   * If non-null, FieldCacheImpl will warn whenever
   * entries are created that are not sane according to
   * {@link org.apache.lucene.util.FieldCacheSanityChecker}.
   *
   * @param stream where the warnings are written; the wording above implies
   *        that <code>null</code> turns the warnings off.
   */
  public void setInfoStream(PrintStream stream);

  /**
   * Counterpart of {@link #setInfoStream(PrintStream)}.
   *
   * @return presumably the stream last passed to
   *         {@link #setInfoStream(PrintStream)}; this interface does not
   *         pin that down -- confirm against the implementation.
   */
  public PrintStream getInfoStream();
}