/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.utils.vectors.lucene; import java.io.IOException; import java.util.Collections; import java.util.Iterator; import com.google.common.base.Preconditions; import com.google.common.collect.AbstractIterator; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermFreqVector; import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.Vector; import org.apache.mahout.utils.Bump125; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it. */ public final class LuceneIterator extends AbstractIterator<Vector> { private static final Logger log = LoggerFactory.getLogger(LuceneIterator.class); private final IndexReader indexReader; private final String field; private final String idField; private final FieldSelector idFieldSelector; private final VectorMapper mapper; private final double normPower; private final TermDocs termDocs; private int numErrorDocs = 0; private int maxErrorDocs = 0; private final Bump125 bump = new Bump125(); private long nextLogRecord = bump.increment(); private int skippedErrorMessages = 0; /** * Produce a LuceneIterable that can create the Vector plus normalize it. * * @param indexReader {@link IndexReader} to read the documents from. * @param idField field containing the id. May be null. * @param field field to use for the Vector * @param mapper {@link VectorMapper} for creating {@link Vector}s from Lucene's TermVectors. * @param normPower the normalization value. Must be nonnegative, or {@link LuceneIterable#NO_NORMALIZING} */ public LuceneIterator(IndexReader indexReader, String idField, String field, VectorMapper mapper, double normPower) throws IOException { this(indexReader, idField, field, mapper, normPower, 0.0); } /** * @see #LuceneIterator(IndexReader, String, String, VectorMapper, double) * @param maxPercentErrorDocs most documents that will be tolerated without a term freq vector. In [0,1]. */ public LuceneIterator(IndexReader indexReader, String idField, String field, VectorMapper mapper, double normPower, double maxPercentErrorDocs) throws IOException { // term docs(null) is a better way of iterating all the docs in Lucene Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || normPower >= 0, "If specified normPower must be nonnegative", normPower); Preconditions.checkArgument(maxPercentErrorDocs >= 0.0 && maxPercentErrorDocs <= 1.0); idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet()); this.indexReader = indexReader; this.idField = idField; this.field = field; this.mapper = mapper; this.normPower = normPower; // term docs(null) is a better way of iterating all the docs in Lucene this.termDocs = indexReader.termDocs(null); this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs()); } @Override protected Vector computeNext() { try { if (!termDocs.next()) { return endOfData(); } int doc = termDocs.doc(); TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, field); if (termFreqVector == null) { numErrorDocs++; if (numErrorDocs >= maxErrorDocs) { log.error("There are too many documents that do not have a term vector for {}", field); throw new IllegalStateException("There are too many documents that do not have a term vector for " + field); } if (numErrorDocs >= nextLogRecord) { if (skippedErrorMessages == 0) { log.warn("{} does not have a term vector for {}", indexReader.document(doc).get(idField), field); } else { log.warn("{} documents do not have a term vector for {}", numErrorDocs, field); } nextLogRecord = bump.increment(); skippedErrorMessages = 0; } else { skippedErrorMessages++; } computeNext(); } indexReader.getTermFreqVector(doc, field, mapper); mapper.setDocumentNumber(doc); Vector result = mapper.getVector(); if (result == null) { // TODO is this right? last version would produce null in the iteration in this case, though it // seems like that may not be desirable return null; } String name; if (idField != null) { name = indexReader.document(doc, idFieldSelector).get(idField); } else { name = String.valueOf(doc); } if (normPower == LuceneIterable.NO_NORMALIZING) { result = new NamedVector(result, name); } else { result = new NamedVector(result.normalize(normPower), name); } return result; } catch (IOException ioe) { throw new IllegalStateException(ioe); } } }