LuceneIterator.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.lucene;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;

import com.google.common.base.Preconditions;
import com.google.common.collect.AbstractIterator;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermFreqVector;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.Bump125;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the
 * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it.
 *
 * <p>Documents that have no term vector for the field are skipped and counted; once the count reaches
 * the configured limit, iteration fails with an {@link IllegalStateException}.</p>
 */
public final class LuceneIterator extends AbstractIterator<Vector> {

  private static final Logger log = LoggerFactory.getLogger(LuceneIterator.class);

  private final IndexReader indexReader;
  private final String field;
  private final String idField;
  private final FieldSelector idFieldSelector;
  private final VectorMapper mapper;
  private final double normPower;
  private final TermDocs termDocs;
  /** Number of documents without a term vector at which iteration is aborted. */
  private final int maxErrorDocs;

  /** Number of documents seen so far that had no term vector for {@link #field}. */
  private int numErrorDocs = 0;
  // Warning throttle: a warning is only logged when numErrorDocs reaches the next value handed out
  // by bump.increment(). NOTE(review): presumably a 1, 2, 5, 10, ... progression - confirm in Bump125.
  private final Bump125 bump = new Bump125();
  private long nextLogRecord = bump.increment();
  /** Error documents that were not logged since the last warning was emitted. */
  private int skippedErrorMessages = 0;

  /**
   * Creates a LuceneIterator that builds a Vector per document and optionally normalizes it.
   * No document without a term vector is tolerated (the error-document ratio is 0.0).
   *
   * @param indexReader {@link IndexReader} to read the documents from.
   * @param idField field containing the id. May be null, in which case the Lucene document number
   *                is used as the vector name.
   * @param field  field to use for the Vector
   * @param mapper {@link VectorMapper} for creating {@link Vector}s from Lucene's TermVectors.
   * @param normPower the normalization value. Must be nonnegative, or {@link LuceneIterable#NO_NORMALIZING}
   * @throws IOException if the document enumeration cannot be opened on the reader
   */
  public LuceneIterator(IndexReader indexReader,
                        String idField,
                        String field,
                        VectorMapper mapper,
                        double normPower) throws IOException {
    this(indexReader, idField, field, mapper, normPower, 0.0);
  }

  /**
   * @see #LuceneIterator(IndexReader, String, String, VectorMapper, double)
   * @param maxPercentErrorDocs fraction of the reader's documents, in [0,1], used to derive the limit on
   *                            documents without a term freq vector. The limit is
   *                            {@code (int) (maxPercentErrorDocs * indexReader.numDocs())} and iteration
   *                            fails as soon as the number of such documents reaches it.
   * @throws IllegalArgumentException if {@code normPower} or {@code maxPercentErrorDocs} is out of range
   * @throws IOException if the document enumeration cannot be opened on the reader
   */
  public LuceneIterator(IndexReader indexReader,
                        String idField,
                        String field,
                        VectorMapper mapper,
                        double normPower,
                        double maxPercentErrorDocs) throws IOException {
    Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0,
                                "If specified normPower must be nonnegative", normPower);
    Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0,
                                "maxPercentErrorDocs must be in [0,1]: %s", maxPercentErrorDocs);
    idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet());
    this.indexReader = indexReader;
    this.idField = idField;
    this.field = field;
    this.mapper = mapper;
    this.normPower = normPower;
    // term docs(null) is a better way of iterating all the docs in Lucene
    this.termDocs = indexReader.termDocs(null);
    this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
  }

  /**
   * Advances to the next document that has a term vector for the field and converts it to a
   * {@link NamedVector}, normalized unless {@code normPower} is {@link LuceneIterable#NO_NORMALIZING}.
   *
   * @return the next vector, {@code null} if the mapper produced no vector for the document, or the
   *         end-of-data marker once all documents have been visited
   * @throws IllegalStateException if too many documents lack a term vector, or if the index cannot be read
   */
  @Override
  protected Vector computeNext() {
    try {
      int doc;
      // Iterate (rather than recurse and drop the result) past documents without a term vector, so
      // that such a document is never handed to the mapper and no following document is lost.
      while (true) {
        if (!termDocs.next()) {
          // Enumeration exhausted: release it; AbstractIterator will not call computeNext() again.
          termDocs.close();
          return endOfData();
        }
        doc = termDocs.doc();
        TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, field);
        if (termFreqVector != null) {
          break;
        }
        recordMissingTermVector(doc);
      }

      indexReader.getTermFreqVector(doc, field, mapper);
      mapper.setDocumentNumber(doc);
      Vector result = mapper.getVector();
      if (result == null) {
        // TODO is this right? last version would produce null in the iteration in this case, though it
        // seems like that may not be desirable
        return null;
      }
      String name = vectorName(doc);
      if (normPower == LuceneIterable.NO_NORMALIZING) {
        result = new NamedVector(result, name);
      } else {
        result = new NamedVector(result.normalize(normPower), name);
      }
      return result;
    } catch (IOException ioe) {
      throw new IllegalStateException(ioe);
    }
  }

  /**
   * Counts a document that has no term vector, aborts once the limit is reached and otherwise emits a
   * throttled warning.
   */
  private void recordMissingTermVector(int doc) throws IOException {
    numErrorDocs++;
    if (numErrorDocs >= maxErrorDocs) {
      log.error("There are too many documents that do not have a term vector for {}", field);
      throw new IllegalStateException("There are too many documents that do not have a term vector for " + field);
    }
    if (numErrorDocs >= nextLogRecord) {
      if (skippedErrorMessages == 0) {
        log.warn("{} does not have a term vector for {}", vectorName(doc), field);
      } else {
        log.warn("{} documents do not have a term vector for {}", numErrorDocs, field);
      }
      nextLogRecord = bump.increment();
      skippedErrorMessages = 0;
    } else {
      skippedErrorMessages++;
    }
  }

  /**
   * Returns the name for a document: the stored value of the id field (loading only that field), or
   * the document number when no id field was configured.
   */
  private String vectorName(int doc) throws IOException {
    if (idField == null) {
      return String.valueOf(doc);
    }
    return indexReader.document(doc, idFieldSelector).get(idField);
  }

}