/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.vectors.lucene;

import com.google.common.io.Closeables;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.MahoutTestCase;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.vectorizer.TFIDF;
import org.apache.mahout.vectorizer.Weight;
import org.junit.Test;

import java.io.IOException;
import java.util.Iterator;

/**
 * Tests {@link LuceneIterable} against small in-memory Lucene indexes built from {@link #DOCS}.
 *
 * <p>Every document gets a stored {@code id} field of the form {@code doc_<n>} and an analyzed
 * {@code content} field. Whether the {@code content} field carries a term vector is chosen per
 * test, because the tests below expect iteration to fail with {@link IllegalStateException} on
 * documents that were indexed without one.</p>
 */
public final class LuceneIterableTest extends MahoutTestCase {

  /** Text of the documents written by each call to {@code createTestIndex}. */
  private static final String[] DOCS = {
    "The quick red fox jumped over the lazy brown dogs.",
    "Mary had a little lamb whose fleece was white as snow.",
    "Moby Dick is a story of a whale and a man obsessed.",
    "The robber wore a black fleece jacket and a baseball cap.",
    "The English Springer Spaniel is the best of all dogs."
  };

  /** Index in which every document has a term vector; rebuilt before each test. */
  private RAMDirectory directory;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = createTestIndex(Field.TermVector.YES);
  }

  /**
   * Iterates an index where all documents have term vectors, once with the four-argument
   * constructor and once with the extra fifth argument {@code 3}, and checks every vector.
   */
  @Test
  public void testIterable() throws Exception {
    IndexReader reader = IndexReader.open(directory, true);
    try {
      VectorMapper mapper = createMapper(reader);

      //TODO: do something more meaningful here
      assertNamedDocVectors(new LuceneIterable(reader, "id", "content", mapper));

      //TODO: do something more meaningful here
      assertNamedDocVectors(new LuceneIterable(reader, "id", "content", mapper, 3));
    } finally {
      // Release the reader even when an assertion fails.
      Closeables.closeQuietly(reader);
    }
  }

  /**
   * Expects an {@link IllegalStateException} when the very first document has no term vector
   * and the iterable is built with the four-argument constructor.
   */
  @Test(expected = IllegalStateException.class)
  public void testIterable_noTermVectors() throws IOException {
    RAMDirectory noTermVectorIndex = createTestIndex(Field.TermVector.NO);
    IndexReader reader = IndexReader.open(noTermVectorIndex, true);
    try {
      VectorMapper mapper = createMapper(reader);
      LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper);

      Iterator<Vector> iterator = iterable.iterator();
      iterator.hasNext();
      iterator.next();
    } finally {
      // The expected exception still propagates; the reader is merely released first.
      Closeables.closeQuietly(reader);
    }
  }

  /**
   * Iterates a mixed index (first {@code DOCS.length} documents with term vectors, followed by
   * {@code DOCS.length} documents without) using three different values for the last
   * constructor argument, labelled 0, 100 and 50 percent tolerance respectively.
   */
  @Test
  public void testIterable_someNoiseTermVectors() throws IOException {
    // First batch: documents indexed WITH term vectors (ids doc_0 .. doc_4).
    RAMDirectory mixedIndex = createTestIndex(Field.TermVector.YES, new RAMDirectory(), true, 0);
    // Second batch, appended to the same index: documents WITHOUT term vectors (ids doc_5 .. doc_9).
    createTestIndex(Field.TermVector.NO, mixedIndex, false, DOCS.length);

    IndexReader reader = IndexReader.open(mixedIndex, true);
    try {
      VectorMapper mapper = createMapper(reader);

      //0 percent tolerance
      LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper);
      assertTrue(failsWithIllegalState(iterable));

      //100 percent tolerance
      iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 1.0);
      assertFalse(failsWithIllegalState(iterable));

      //50 percent tolerance
      iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 0.5);
      Iterator<Vector> iterator = iterable.iterator();
      // Step over the first batch outside the try block: a failure here must fail the test
      // instead of being counted as the expected exception.
      for (int i = 0; i < DOCS.length; i++) {
        iterator.next();
      }

      boolean exceptionThrown;
      try {
        while (iterator.hasNext()) {
          iterator.next();
        }
        exceptionThrown = false;
      } catch (IllegalStateException ise) {
        exceptionThrown = true;
      }
      assertTrue(exceptionThrown);
    } finally {
      Closeables.closeQuietly(reader);
    }
  }

  /**
   * Builds the TF-IDF based mapper shared by all tests for the {@code content} field.
   *
   * @param reader reader over the index under test
   * @return a {@link TFDFMapper} backed by a {@link CachedTermInfo} created with arguments
   *         {@code 1} and {@code 100}
   * @throws IOException declared because the term info is built from the index reader
   */
  private static VectorMapper createMapper(IndexReader reader) throws IOException {
    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
    return new TFDFMapper(reader, weight, termInfo);
  }

  /**
   * Asserts that every vector is a non-empty {@link NamedVector} whose name starts with
   * {@code doc_}.
   *
   * @param vectors the vectors to check
   */
  private static void assertNamedDocVectors(Iterable<Vector> vectors) {
    for (Vector vector : vectors) {
      assertNotNull(vector);
      assertTrue("vector is not an instanceof " + NamedVector.class, vector instanceof NamedVector);
      assertTrue("vector Size: " + vector.size() + " is not greater than: " + 0, vector.size() > 0);
      assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
    }
  }

  /**
   * Fully iterates the given vectors and reports whether doing so raised an
   * {@link IllegalStateException}.
   *
   * @param vectors the vectors to iterate
   * @return {@code true} if an {@link IllegalStateException} was thrown, {@code false} if the
   *         iteration completed normally
   */
  private static boolean failsWithIllegalState(Iterable<Vector> vectors) {
    try {
      for (Vector ignored : vectors) {
        // Only the act of iterating matters here.
      }
      return false;
    } catch (IllegalStateException ise) {
      return true;
    }
  }

  /**
   * Creates a fresh in-memory index containing {@link #DOCS} with ids starting at {@code doc_0}.
   *
   * @param termVector term-vector setting applied to the {@code content} field
   * @return the newly created directory
   * @throws IOException if writing the index fails
   */
  private static RAMDirectory createTestIndex(Field.TermVector termVector) throws IOException {
    return createTestIndex(termVector, new RAMDirectory(), true, 0);
  }

  /**
   * Writes one document per entry of {@link #DOCS} into the given directory.
   *
   * @param termVector term-vector setting applied to the {@code content} field
   * @param directory  directory to write into
   * @param createNew  passed to the {@link IndexWriter} constructor as its {@code create} flag
   * @param startingId offset added to the document position to form the {@code doc_<n>} id
   * @return the same {@code directory} instance that was passed in
   * @throws IOException if writing the index fails
   */
  private static RAMDirectory createTestIndex(Field.TermVector termVector,
                                              RAMDirectory directory,
                                              boolean createNew,
                                              int startingId) throws IOException {
    IndexWriter writer = new IndexWriter(
        directory,
        new StandardAnalyzer(Version.LUCENE_31),
        createNew,
        IndexWriter.MaxFieldLength.UNLIMITED);
    try {
      for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();

        Fieldable id = new Field("id", "doc_" + (i + startingId),
            Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        doc.add(id);

        // The text is analyzed but not stored; term-vector storage is decided by the caller.
        Fieldable text = new Field("content", DOCS[i],
            Field.Store.NO, Field.Index.ANALYZED, termVector);
        doc.add(text);

        writer.addDocument(doc);
      }
    } finally {
      Closeables.closeQuietly(writer);
    }
    return directory;
  }
}