/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.flamdex.simple; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.primitives.Longs; import com.indeed.util.io.Files; import com.indeed.flamdex.api.DocIdStream; import com.indeed.flamdex.api.FlamdexOutOfMemoryException; import com.indeed.flamdex.api.IntTermIterator; import com.indeed.flamdex.api.IntValueLookup; import com.indeed.flamdex.api.RawStringTermDocIterator; import com.indeed.flamdex.api.StringTermIterator; import com.indeed.flamdex.writer.IntFieldWriter; import com.indeed.flamdex.writer.StringFieldWriter; import junit.framework.TestCase; import org.junit.Test; import java.io.IOException; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Random; /** * @author jsgroth */ public class SimpleFlamdexTest extends TestCase { private final Random rand = new Random(); @Test public void testEmptyFields() throws IOException { final String dir = Files.getTempDirectory("flamdex-test", "foo"); try { SimpleFlamdexWriter w = new SimpleFlamdexWriter(dir, 5L, true); w.getIntFieldWriter("if1").close(); w.getStringFieldWriter("sf1").close(); w.close(); SimpleFlamdexReader r = SimpleFlamdexReader.open(dir); IntTermIterator it = r.getIntTermIterator("if1"); assertFalse(it.next()); StringTermIterator sit = r.getStringTermIterator("sf1"); assertFalse(sit.next()); it.close(); sit.close(); r.close(); } finally { Files.delete(dir); } } @Test public void testIt() throws IOException { final String dir = Files.getTempDirectory("flamdex-test", "foo"); try { writeAndRead(dir); } finally { Files.delete(dir); } } @Test public void testGetMetric() throws IOException, FlamdexOutOfMemoryException { final String dir = Files.getTempDirectory("flamdex-test", "foo"); try { internalTestGetMetric(dir); } finally { Files.delete(dir); } } private void internalTestGetMetric(String dir) throws IOException, FlamdexOutOfMemoryException { getMetricCase(dir, 2); getMetricCase(dir, 256); getMetricCase(dir, 65536); getMetricCase(dir, Integer.MAX_VALUE); } private void getMetricCase(String dir, int maxTermVal) throws IOException, FlamdexOutOfMemoryException { for (int i = 0; i < 10; ++i) { long[] cache = writeGetMetricIndex(dir, maxTermVal); SimpleFlamdexReader r = SimpleFlamdexReader.open(dir); // do it multiple times because these methods update internal state, make sure nothing unexpectedly weird happens for (int j = 0; j < 3; ++j) { long memReq = r.memoryRequired("if1"); IntValueLookup ivl = r.getMetric("if1"); assertEquals(memReq, ivl.memoryUsed()); int[] docIds = new int[r.getNumDocs()]; long[] values = new long[r.getNumDocs()]; for (int doc = 0; doc < docIds.length; ++doc) docIds[doc] = doc; ivl.lookup(docIds, values, r.getNumDocs()); assertEquals(Longs.asList(cache), Longs.asList(values)); } } } private long[] writeGetMetricIndex(String dir, int maxTermVal) throws IOException { SimpleFlamdexWriter w = new SimpleFlamdexWriter(dir, 10L, true); IntFieldWriter ifw = w.getIntFieldWriter("if1"); List<Integer> docs = Lists.newArrayList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9); long[] cache = new long[10]; if (maxTermVal < docs.size()) { int term = rand.nextInt(maxTermVal); ifw.nextTerm(term); for (int doc : docs) { ifw.nextDoc(doc); cache[doc] = term; } } else { Map<Integer, List<Integer>> map = Maps.newTreeMap(); while (!docs.isEmpty()) { int term = rand.nextInt(maxTermVal); if (map.containsKey(term)) continue; int numDocs = docs.size() > 1 ? rand.nextInt(docs.size() - 1) + 1 : 1; List<Integer> selectedDocs = Lists.newArrayList(); for (int i = 0; i < numDocs; ++i) { selectedDocs.add(docs.remove(rand.nextInt(docs.size()))); } Collections.sort(selectedDocs); map.put(term, selectedDocs); } for (int term : map.keySet()) { ifw.nextTerm(term); List<Integer> selectedDocs = map.get(term); for (int doc : selectedDocs) { ifw.nextDoc(doc); cache[doc] = term; } } } ifw.close(); w.close(); return cache; } public void writeAndRead(String dir) throws IOException { writeIndex(dir); readCase1(dir); readCase2(dir); readCase3(dir); } private void readCase3(String dir) throws IOException { final SimpleFlamdexReader r = SimpleFlamdexReader.open(dir); final RawStringTermDocIterator it = r.getStringTermDocIterator("f2"); final int[] docBuffer = new int[20]; assertTrue(it.nextTerm()); assertEquals(it.term(), ""); assertEquals(it.termStringLength(), 0); assertEquals(it.docFreq(), 2); assertEquals(it.fillDocIdBuffer(docBuffer), 2); assertEquals(docBuffer[0], 2); assertEquals(docBuffer[1], 5); assertTrue(it.nextTerm()); assertEquals(it.term(), "a"); assertEquals(it.termStringLength(), 1); assertEquals(it.docFreq(), 2); assertEquals(it.fillDocIdBuffer(docBuffer), 2); assertEquals(docBuffer[0], 4); assertEquals(docBuffer[1], 7); assertTrue(it.nextTerm()); assertEquals(it.term(), "ffffffffff"); assertEquals(it.termStringLength(), 10); assertEquals(it.docFreq(), 3); assertEquals(it.fillDocIdBuffer(docBuffer), 3); assertEquals(docBuffer[0], 2); assertEquals(docBuffer[1], 5); assertEquals(docBuffer[2], 9); assertTrue(it.nextTerm()); assertEquals(it.term(), "lollerskates"); assertEquals(it.termStringLength(), 12); assertEquals(it.docFreq(), 2); assertEquals(it.fillDocIdBuffer(docBuffer), 2); assertEquals(docBuffer[0], 7); assertEquals(docBuffer[1], 8); assertFalse(it.nextTerm()); } private void readCase2(String dir) throws IOException { final SimpleFlamdexReader r = SimpleFlamdexReader.open(dir); final DocIdStream dis = r.getDocIdStream(); final int[] docIdBuf = new int[2]; final StringTermIterator strItr = r.getStringTermIterator("f2"); strItr.reset("ffffffffff"); assertTrue(strItr.next()); assertEquals("ffffffffff", strItr.term()); assertEquals(3, strItr.docFreq()); dis.reset(strItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(2, docIdBuf[0]); assertEquals(5, docIdBuf[1]); assertEquals(1, dis.fillDocIdBuffer(docIdBuf)); assertEquals(9, docIdBuf[0]); assertTrue(strItr.next()); assertEquals("lollerskates", strItr.term()); assertEquals(2, strItr.docFreq()); dis.reset(strItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(7, docIdBuf[0]); assertEquals(8, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertFalse(strItr.next()); strItr.reset("zzzzzzzzzzzzz"); assertFalse(strItr.next()); final IntTermIterator intItr = r.getIntTermIterator("f1"); intItr.reset(9000); assertTrue(intItr.next()); assertEquals(9000, intItr.term()); assertEquals(4, intItr.docFreq()); dis.reset(intItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(3, docIdBuf[0]); assertEquals(7, docIdBuf[1]); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(8, docIdBuf[0]); assertEquals(9, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertFalse(intItr.next()); intItr.reset(999999999); assertFalse(intItr.next()); } private void readCase1(String dir) throws IOException { final SimpleFlamdexReader reader = SimpleFlamdexReader.open(dir); assertEquals(1, reader.getIntFields().size()); assertEquals("f1", reader.getIntFields().iterator().next()); assertEquals(1, reader.getStringFields().size()); assertEquals("f2", reader.getStringFields().iterator().next()); assertEquals(10, reader.getNumDocs()); final DocIdStream dis = reader.getDocIdStream(); final int[] docIdBuf = new int[2]; final SimpleIntTermIterator intItr = reader.getIntTermIterator("f1"); assertTrue(intItr.next()); assertEquals(2, intItr.term()); assertEquals(3, intItr.docFreq()); assertEquals(0L, intItr.getOffset()); dis.reset(intItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(0, docIdBuf[0]); assertEquals(4, docIdBuf[1]); assertEquals(1, dis.fillDocIdBuffer(docIdBuf)); assertEquals(9, docIdBuf[0]); assertTrue(intItr.next()); assertEquals(99, intItr.term()); assertEquals(2, intItr.docFreq()); dis.reset(intItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(5, docIdBuf[0]); assertEquals(6, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertTrue(intItr.next()); assertEquals(101, intItr.term()); assertEquals(3, intItr.docFreq()); dis.reset(intItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(0, docIdBuf[0]); assertEquals(1, docIdBuf[1]); assertEquals(1, dis.fillDocIdBuffer(docIdBuf)); assertEquals(2, docIdBuf[0]); assertTrue(intItr.next()); assertEquals(9000, intItr.term()); assertEquals(4, intItr.docFreq()); dis.reset(intItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(3, docIdBuf[0]); assertEquals(7, docIdBuf[1]); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(8, docIdBuf[0]); assertEquals(9, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertFalse(intItr.next()); final SimpleStringTermIterator strItr = reader.getStringTermIterator("f2"); assertTrue(strItr.next()); assertEquals("", strItr.term()); assertEquals(2, strItr.docFreq()); dis.reset(strItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(2, docIdBuf[0]); assertEquals(5, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertTrue(strItr.next()); assertEquals("a", strItr.term()); assertEquals(2, strItr.docFreq()); dis.reset(strItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(4, docIdBuf[0]); assertEquals(7, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertTrue(strItr.next()); assertEquals("ffffffffff", strItr.term()); assertEquals(3, strItr.docFreq()); dis.reset(strItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(2, docIdBuf[0]); assertEquals(5, docIdBuf[1]); assertEquals(1, dis.fillDocIdBuffer(docIdBuf)); assertEquals(9, docIdBuf[0]); assertTrue(strItr.next()); assertEquals("lollerskates", strItr.term()); assertEquals(2, strItr.docFreq()); dis.reset(strItr); assertEquals(2, dis.fillDocIdBuffer(docIdBuf)); assertEquals(7, docIdBuf[0]); assertEquals(8, docIdBuf[1]); assertEquals(0, dis.fillDocIdBuffer(docIdBuf)); assertFalse(strItr.next()); } private void writeIndex(String dir) throws IOException { final SimpleFlamdexWriter writer = new SimpleFlamdexWriter(dir, 10); final IntFieldWriter ifw = writer.getIntFieldWriter("f1"); ifw.nextTerm(2); ifw.nextDoc(0); ifw.nextDoc(4); ifw.nextDoc(9); try { ifw.nextDoc(10); // doc >= numDocs assertTrue(false); } catch (IllegalArgumentException e) { // pass } try { ifw.nextDoc(5); // doc out of order assertTrue(false); } catch (IllegalArgumentException e) { // pass } ifw.nextTerm(3); // should not be written because docFreq == 0 ifw.nextTerm(99); ifw.nextDoc(5); ifw.nextDoc(6); ifw.nextTerm(101); ifw.nextDoc(0); ifw.nextDoc(1); ifw.nextDoc(2); try { ifw.nextTerm(10); // term out of order assertTrue(false); } catch (IllegalArgumentException e) { // pass } ifw.nextTerm(9000); ifw.nextDoc(3); ifw.nextDoc(7); ifw.nextDoc(8); ifw.nextDoc(9); ifw.close(); final StringFieldWriter sfw = writer.getStringFieldWriter("f2"); sfw.nextTerm(""); sfw.nextDoc(2); sfw.nextDoc(5); sfw.nextTerm("a"); sfw.nextDoc(4); sfw.nextDoc(7); sfw.nextTerm("d"); // should not be written because docFreq == 0 sfw.nextTerm("ffffffffff"); sfw.nextDoc(2); sfw.nextDoc(5); sfw.nextDoc(9); try { sfw.nextTerm("eeeeeeeeeeeeeeeeeeeeeeeee"); // term out of order assertTrue(false); } catch (IllegalArgumentException e) { // pass } sfw.nextTerm("lollerskates"); sfw.nextDoc(7); sfw.nextDoc(8); sfw.close(); writer.close(); } }