/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.utils.vectors.lucene; import com.google.common.collect.AbstractIterator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.Vector; import org.apache.mahout.utils.Bump125; import org.apache.mahout.utils.vectors.TermInfo; import org.apache.mahout.vectorizer.Weight; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; /** * Iterate over a Lucene index, extracting term vectors. * Subclasses define how much information to retrieve from the Lucene index. */ public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> { private static final Logger log = LoggerFactory.getLogger(LuceneIterator.class); protected final IndexReader indexReader; protected final String field; protected final TermInfo terminfo; protected final double normPower; protected final Weight weight; protected final Bump125 bump = new Bump125(); protected int nextDocId; protected int maxErrorDocs; protected int numErrorDocs; protected long nextLogRecord = bump.increment(); protected int skippedErrorMessages; public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight, double maxPercentErrorDocs, String field) { this.terminfo = terminfo; this.normPower = normPower; this.indexReader = indexReader; this.weight = weight; this.nextDocId = 0; this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs()); this.field = field; } /** * Given the document name, derive a name for the vector. This may involve * reading the document from Lucene and setting up any other state that the * subclass wants. This will be called once for each document that the * iterator processes. * @param documentIndex the lucene document index. * @return the name to store in the vector. */ protected abstract String getVectorName(int documentIndex) throws IOException; @Override protected Vector computeNext() { try { int doc; Terms termFreqVector; String name; do { doc = this.nextDocId; nextDocId++; if (doc >= indexReader.maxDoc()) { return endOfData(); } termFreqVector = indexReader.getTermVector(doc, field); name = getVectorName(doc); if (termFreqVector == null) { numErrorDocs++; if (numErrorDocs >= maxErrorDocs) { log.error("There are too many documents that do not have a term vector for {}", field); throw new IllegalStateException("There are too many documents that do not have a term vector for " + field); } if (numErrorDocs >= nextLogRecord) { if (skippedErrorMessages == 0) { log.warn("{} does not have a term vector for {}", name, field); } else { log.warn("{} documents do not have a term vector for {}", numErrorDocs, field); } nextLogRecord = bump.increment(); skippedErrorMessages = 0; } else { skippedErrorMessages++; } } } while (termFreqVector == null); // The loop exits with termFreqVector and name set. TermsEnum te = termFreqVector.iterator(null); BytesRef term; TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo); mapper.setExpectations(field, termFreqVector.size()); while ((term = te.next()) != null) { mapper.map(term, (int) te.totalTermFreq()); } Vector result = mapper.getVector(); if (result == null) { // TODO is this right? last version would produce null in the iteration in this case, though it // seems like that may not be desirable return null; } if (normPower == LuceneIterable.NO_NORMALIZING) { result = new NamedVector(result, name); } else { result = new NamedVector(result.normalize(normPower), name); } return result; } catch (IOException ioe) { throw new IllegalStateException(ioe); } } }