/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.flamdex;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import com.indeed.util.core.shell.PosixFileOperations;
import com.indeed.flamdex.api.DocIdStream;
import com.indeed.flamdex.api.FlamdexReader;
import com.indeed.flamdex.api.IntTermIterator;
import com.indeed.flamdex.api.StringTermIterator;
import com.indeed.flamdex.simple.SimpleFlamdexReader;
import com.indeed.flamdex.simple.SimpleFlamdexWriter;
import com.indeed.flamdex.writer.FlamdexDocument;
import com.indeed.flamdex.writer.IntFieldWriter;
import com.indeed.flamdex.writer.StringFieldWriter;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Random;
import static org.junit.Assert.*;
/**
* @author jsgroth
*/
public class TestMemoryFlamdex {
/** Scratch directory created before each test by {@code setUp}; tests write on-disk indexes beneath it. */
File tmpDir;

/**
 * Creates a uniquely named, empty scratch directory under the current working directory.
 *
 * {@link File#createTempFile} is used only to reserve a unique name: the placeholder file it
 * creates is deleted and a directory of the same name is created in its place.
 *
 * @throws IOException if the placeholder cannot be deleted or the directory cannot be created
 */
@Before
public void setUp() throws Exception {
    tmpDir = File.createTempFile("tmp", "", new File("."));
    // File.delete()/mkdirs() report failure through their return value, not an exception;
    // fail fast here instead of letting a test trip over a missing directory later.
    if (!tmpDir.delete()) {
        throw new IOException("unable to delete temp file placeholder " + tmpDir.getAbsolutePath());
    }
    if (!tmpDir.mkdirs()) {
        throw new IOException("unable to create temp directory " + tmpDir.getAbsolutePath());
    }
}
/**
 * Recursively removes the scratch directory referenced by {@code tmpDir}.
 *
 * @throws Exception if the recursive delete fails
 */
@After
public void tearDown() throws Exception {
    // JUnit runs @After methods even when a @Before method threw, so tmpDir may never
    // have been assigned; skip cleanup in that case rather than adding a second failure.
    if (tmpDir != null) {
        PosixFileOperations.rmrf(tmpDir);
    }
}
/**
 * Writes a single int term equal to {@link Integer#MAX_VALUE} for doc 0 of field "if1",
 * serializes the index, and runs {@code innerTestIntMaxValue} against both a deserialized
 * {@code MemoryFlamdex} and the reader returned by {@code MemoryFlamdex.streamer}.
 */
@Test
public void testIntMaxValue() throws IOException {
    final MemoryFlamdex source = new MemoryFlamdex().setNumDocs(1);

    final IntFieldWriter fieldWriter = source.getIntFieldWriter("if1");
    fieldWriter.nextTerm(Integer.MAX_VALUE);
    fieldWriter.nextDoc(0);
    fieldWriter.close();
    source.close();

    final ByteArrayDataOutput serialized = ByteStreams.newDataOutput();
    source.write(serialized);

    // Check the fully deserialized copy first ...
    final MemoryFlamdex restored = new MemoryFlamdex();
    restored.readFields(ByteStreams.newDataInput(serialized.toByteArray()));
    innerTestIntMaxValue(restored);

    // ... then the streaming reader built over a fresh view of the same bytes.
    final FlamdexReader streamed =
            MemoryFlamdex.streamer(ByteStreams.newDataInput(serialized.toByteArray()));
    innerTestIntMaxValue(streamed);
}
/**
 * Asserts that field "if1" of the given reader holds exactly one term,
 * {@link Integer#MAX_VALUE}, whose doc id stream yields the single doc id 0.
 *
 * @param fdx2 reader under test
 */
private void innerTestIntMaxValue(FlamdexReader fdx2) {
    final int[] docIds = new int[2];
    final IntTermIterator terms = fdx2.getIntTermIterator("if1");
    final DocIdStream docs = fdx2.getDocIdStream();

    assertTrue(terms.next());
    assertEquals(Integer.MAX_VALUE, terms.term());

    docs.reset(terms);
    final int filled = docs.fillDocIdBuffer(docIds);
    assertEquals(1, filled);
    assertEquals(0, docIds[0]);

    // no further terms expected
    assertFalse(terms.next());
}
/**
 * Adds the same document (int field "if1" = 5) five times, serializes the index, and checks
 * that the streamer's doc id stream hands back doc ids 0..4 across successive fills of a
 * two-element buffer (2, 2, then 1 ids).
 */
@Test
public void testStreamerDocIdStream() throws IOException {
    final MemoryFlamdex source = new MemoryFlamdex();
    final FlamdexDocument template = new FlamdexDocument();
    template.setIntField("if1", 5);
    int remaining = 5;
    while (remaining > 0) {
        source.addDocument(template);
        --remaining;
    }
    source.close();

    final ByteArrayDataOutput serialized = ByteStreams.newDataOutput();
    source.write(serialized);

    final FlamdexReader streamed =
            MemoryFlamdex.streamer(ByteStreams.newDataInput(serialized.toByteArray()));
    final IntTermIterator terms = streamed.getIntTermIterator("if1");
    assertTrue(terms.next());
    assertEquals(5, terms.term());

    final DocIdStream docs = streamed.getDocIdStream();
    docs.reset(terms);

    // The buffer holds two ids, so the five docs arrive as two full pages and one partial page.
    final int[] page = new int[2];
    final int[][] fullPages = {{0, 1}, {2, 3}};
    for (final int[] expectedPage : fullPages) {
        assertEquals(2, docs.fillDocIdBuffer(page));
        assertArrayEquals(expectedPage, page);
    }
    assertEquals(1, docs.fillDocIdBuffer(page));
    assertEquals(4, page[0]);
}
/**
 * Adds two documents whose "if1" value arrays contain repeated terms, then runs
 * {@code innerTestBadDoc} against the in-memory index and against a copy that went through
 * {@code write}/{@code readFields}.
 */
@Test
public void testDocWithRepeatingTerms() throws IOException {
    final long[] firstDocTerms = {1, 1, 1, 3, 3};
    final long[] secondDocTerms = {1, 2, 2, 2, 4};

    final MemoryFlamdex source = new MemoryFlamdex();
    // The same FlamdexDocument instance is reused for both adds, as in the scenario under test.
    final FlamdexDocument reused = new FlamdexDocument();
    reused.setIntField("if1", firstDocTerms);
    source.addDocument(reused);
    reused.setIntField("if1", secondDocTerms);
    source.addDocument(reused);
    source.close();
    innerTestBadDoc(source);

    final ByteArrayDataOutput serialized = ByteStreams.newDataOutput();
    source.write(serialized);

    final MemoryFlamdex restored = new MemoryFlamdex();
    restored.readFields(ByteStreams.newDataInput(serialized.toByteArray()));
    innerTestBadDoc(restored);
}
/**
 * Asserts that the index has two docs and that field "if1" contains exactly the terms
 * 1, 2, 3, 4 with doc lists {0,1}, {1}, {0}, {1} respectively, each with a doc frequency
 * equal to the length of its doc list.
 *
 * @param fdx index to check
 */
private void innerTestBadDoc(MemoryFlamdex fdx) {
    final long[] expectedTerms = {1, 2, 3, 4};
    final int[][] expectedDocs = {{0, 1}, {1}, {0}, {1}};

    assertEquals(2, fdx.getNumDocs());
    final IntTermIterator terms = fdx.getIntTermIterator("if1");
    final DocIdStream docs = fdx.getDocIdStream();
    final int[] scratch = new int[64];

    for (int i = 0; i < expectedTerms.length; ++i) {
        final int[] wantDocs = expectedDocs[i];
        assertTrue(terms.next());
        assertEquals(expectedTerms[i], terms.term());
        assertEquals(wantDocs.length, terms.docFreq());
        docs.reset(terms);
        assertEquals(wantDocs.length, docs.fillDocIdBuffer(scratch));
        // only the filled prefix of the scratch buffer is meaningful
        assertArrayEquals(wantDocs, Arrays.copyOf(scratch, wantDocs.length));
    }
    assertFalse(terms.next());

    docs.close();
    terms.close();
}
/**
 * Builds a 10-doc index through the term-oriented writer API (one int field, one string
 * field, each including a term that is declared but given no docs), then runs
 * {@code verify} against the deserialized copy, the original, and a shallow copy of each.
 */
@Test
public void testTermWrite() throws IOException {
    final MemoryFlamdex source = new MemoryFlamdex().setNumDocs(10);

    final IntFieldWriter intWriter = source.getIntFieldWriter("if1");
    intWriter.nextTerm(5);
    for (final int docId : new int[]{0, 3, 4, 5}) {
        intWriter.nextDoc(docId);
    }
    intWriter.nextTerm(6); // declared with no docs
    intWriter.nextTerm(99);
    for (final int docId : new int[]{1, 2, 7}) {
        intWriter.nextDoc(docId);
    }
    intWriter.close();

    final StringFieldWriter stringWriter = source.getStringFieldWriter("sf1");
    stringWriter.nextTerm("a");
    for (final int docId : new int[]{2, 8, 9}) {
        stringWriter.nextDoc(docId);
    }
    stringWriter.nextTerm("b");
    for (final int docId : new int[]{1, 8}) {
        stringWriter.nextDoc(docId);
    }
    stringWriter.nextTerm("c"); // declared with no docs
    stringWriter.nextTerm("d");
    for (final int docId : new int[]{5, 6}) {
        stringWriter.nextDoc(docId);
    }
    stringWriter.close();
    source.close();

    final ByteArrayDataOutput serialized = ByteStreams.newDataOutput();
    source.write(serialized);
    final MemoryFlamdex restored = new MemoryFlamdex();
    restored.readFields(ByteStreams.newDataInput(serialized.toByteArray()));

    verify(restored);
    verify(source);
    verify(restored.shallowCopy());
    verify(source.shallowCopy());
}
/**
 * Asserts the contents expected of the index built in {@code testTermWrite}:
 * 10 docs; int field "if1" iterating terms 5 -> {0,3,4,5} and 99 -> {1,2,7}; string field
 * "sf1" iterating terms "a" -> {2,8,9}, "b" -> {1,8} and "d" -> {5,6}. Also asserts that
 * after {@code reset(6)} / {@code reset("c")} the next term returned is 99 / "d".
 *
 * @param fdx reader to check
 */
private void verify(FlamdexReader fdx) {
    assertEquals(10, fdx.getNumDocs());
    final DocIdStream docs = fdx.getDocIdStream();
    final int[] scratch = new int[64];

    // ---- int field ----
    final long[] intTerms = {5, 99};
    final int[][] intTermDocs = {{0, 3, 4, 5}, {1, 2, 7}};
    final IntTermIterator ints = fdx.getIntTermIterator("if1");
    for (int i = 0; i < intTerms.length; ++i) {
        final int[] wantDocs = intTermDocs[i];
        assertTrue(ints.next());
        assertEquals(intTerms[i], ints.term());
        assertEquals(wantDocs.length, ints.docFreq());
        docs.reset(ints);
        assertEquals(wantDocs.length, docs.fillDocIdBuffer(scratch));
        assertArrayEquals(wantDocs, Arrays.copyOf(scratch, wantDocs.length));
    }
    assertFalse(ints.next());

    // Resetting to 6 is expected to leave the iterator positioned so that 99 comes next.
    ints.reset(6);
    assertTrue(ints.next());
    assertEquals(99, ints.term());
    docs.reset(ints);
    assertEquals(3, docs.fillDocIdBuffer(scratch));
    assertArrayEquals(new int[]{1, 2, 7}, Arrays.copyOf(scratch, 3));
    assertFalse(ints.next());
    ints.close();

    // ---- string field ----
    final String[] stringTerms = {"a", "b", "d"};
    final int[][] stringTermDocs = {{2, 8, 9}, {1, 8}, {5, 6}};
    final StringTermIterator strings = fdx.getStringTermIterator("sf1");
    for (int i = 0; i < stringTerms.length; ++i) {
        final int[] wantDocs = stringTermDocs[i];
        assertTrue(strings.next());
        assertEquals(stringTerms[i], strings.term());
        assertEquals(wantDocs.length, strings.docFreq());
        docs.reset(strings);
        assertEquals(wantDocs.length, docs.fillDocIdBuffer(scratch));
        assertArrayEquals(wantDocs, Arrays.copyOf(scratch, wantDocs.length));
    }
    assertFalse(strings.next());

    // Resetting to "c" is expected to leave the iterator positioned so that "d" comes next.
    strings.reset("c");
    assertTrue(strings.next());
    assertEquals("d", strings.term());
    docs.reset(strings);
    assertEquals(2, docs.fillDocIdBuffer(scratch));
    assertArrayEquals(new int[]{5, 6}, Arrays.copyOf(scratch, 2));
    assertFalse(strings.next());
    strings.close();

    docs.close();
}
/**
 * Round-trip test for {@code MemoryFlamdex} serialization and merging.
 *
 * <ol>
 * <li>Writes a small two-document index and reads it back, both into a fresh instance and
 *     into the instance it was written from, checking the doc count each time.</li>
 * <li>Generates 300 random documents, bucketed under one of three keys, and checks that
 *     (a) write/readFields, (b) merging the per-bucket indexes via
 *     {@code SimpleFlamdexWriter.merge}, and (c) writing the merged index to disk and
 *     re-opening it with {@code SimpleFlamdexReader} all compare equal to the original
 *     under {@code FlamdexCompare.unorderedEquals}.</li>
 * </ol>
 *
 * The random seed is included in every assertion message of the randomized part so that a
 * failing run can be reproduced by substituting that seed.
 */
@Test
public void testSerial() throws IOException {
    final MemoryFlamdex fdx = new MemoryFlamdex();
    final FlamdexDocument doc0 = new FlamdexDocument();
    doc0.setIntField("if1", 0);
    doc0.setIntField("if2", new long[]{5, 6, 7});
    doc0.setStringField("sf1", "asdf");
    fdx.addDocument(doc0);
    final FlamdexDocument doc1 = new FlamdexDocument();
    doc1.setIntField("if2", new long[]{3, 6});
    fdx.addDocument(doc1);
    final ByteArrayDataOutput out = ByteStreams.newDataOutput();
    fdx.write(out);
    final MemoryFlamdex fdx2 = new MemoryFlamdex();
    fdx2.readFields(ByteStreams.newDataInput(out.toByteArray()));
    assertEquals(2, fdx2.getNumDocs());
    // reading into an already-populated instance must also yield exactly the serialized docs
    fdx.readFields(ByteStreams.newDataInput(out.toByteArray()));
    assertEquals(2, fdx.getNumDocs());

    // Capture the seed so a failure of the randomized section is reproducible.
    final long seed = System.nanoTime();
    final String seedMsg = "random seed = " + seed;
    final Random r = new Random(seed);

    final Map<String, List<FlamdexDocument>> ret = Maps.newHashMap();
    final StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 300; ++i) {
        final Map<String, List<Long>> intFields = randomIntFields(r);
        final Map<String, List<String>> stringFields = randomStringFields(r, sb);
        final String key = "foo" + (r.nextInt(3) + 1);
        if (!ret.containsKey(key)) {
            ret.put(key, new ArrayList<FlamdexDocument>());
        }
        // Route the int terms through MockDoc's string representation and back again.
        final MockDoc doc = new MockDoc(Maps.transformValues(intFields, new Function<List<Long>, List<String>>() {
            @Nullable
            public List<String> apply(@Nullable final List<Long> input) {
                return Lists.transform(input, new Function<Long, String>() {
                    @Nullable
                    public String apply(@Nullable final Long input) {
                        return String.valueOf(input);
                    }
                });
            }
        }), stringFields);
        ret.get(key).add(doc.convert());
    }

    // (a) serialization round trip of one index holding every document
    final MemoryFlamdex original = new MemoryFlamdex();
    for (final List<FlamdexDocument> docs : ret.values()) {
        for (final FlamdexDocument doc : docs) {
            original.addDocument(doc);
        }
    }
    final MemoryFlamdex copy = new MemoryFlamdex();
    final ByteArrayDataOutput out2 = ByteStreams.newDataOutput();
    original.write(out2);
    copy.readFields(ByteStreams.newDataInput(out2.toByteArray()));
    assertTrue(seedMsg, FlamdexCompare.unorderedEquals(original, copy));

    // (b) one index per bucket, merged in the same bucket order used to build 'original'
    int numDocs = 0;
    final List<MemoryFlamdex> flamdexes = new ArrayList<MemoryFlamdex>();
    for (final List<FlamdexDocument> docs : ret.values()) {
        final MemoryFlamdex flamdex = new MemoryFlamdex();
        for (final FlamdexDocument doc : docs) {
            flamdex.addDocument(doc);
        }
        numDocs += flamdex.getNumDocs();
        flamdexes.add(flamdex);
    }
    final MemoryFlamdex merged = new MemoryFlamdex();
    merged.setNumDocs(numDocs);
    SimpleFlamdexWriter.merge(flamdexes, merged);
    assertTrue(seedMsg, FlamdexCompare.unorderedEquals(merged, original));

    // (c) on-disk round trip of the merged index
    final File tmpFlamdexDir = new File(tmpDir, "tmpfdx");
    final SimpleFlamdexWriter writer = new SimpleFlamdexWriter(tmpFlamdexDir.getPath(), numDocs, true);
    SimpleFlamdexWriter.writeFlamdex(merged, writer);
    writer.close();
    final SimpleFlamdexReader reader = SimpleFlamdexReader.open(tmpFlamdexDir.getPath());
    try {
        assertTrue(seedMsg, FlamdexCompare.unorderedEquals(reader, original));
    } finally {
        // release the reader before tearDown deletes the directory underneath it
        reader.close();
    }
}

/** Generates 5-9 int fields named "if0".."ifN", each with 50-99 random long terms. */
private static Map<String, List<Long>> randomIntFields(final Random r) {
    final Map<String, List<Long>> intFields = Maps.newHashMap();
    final int numFields = r.nextInt(5) + 5;
    for (int j = 0; j < numFields; ++j) {
        final int numTerms = r.nextInt(50) + 50;
        final List<Long> terms = Lists.newArrayList();
        for (int k = 0; k < numTerms; ++k) {
            terms.add(r.nextLong());
        }
        intFields.put("if" + j, terms);
    }
    return intFields;
}

/**
 * Generates 5-9 string fields named "sf0".."sfN", each with 10-19 random terms of
 * 10-19 lowercase ASCII letters. {@code sb} is a scratch buffer reused across calls.
 */
private static Map<String, List<String>> randomStringFields(final Random r, final StringBuilder sb) {
    final Map<String, List<String>> stringFields = Maps.newHashMap();
    final int numFields = r.nextInt(5) + 5;
    for (int j = 0; j < numFields; ++j) {
        final int numTerms = r.nextInt(10) + 10;
        final List<String> terms = Lists.newArrayList();
        for (int k = 0; k < numTerms; ++k) {
            final int termLen = r.nextInt(10) + 10;
            sb.setLength(0);
            for (int l = 0; l < termLen; ++l) {
                sb.append((char) (r.nextInt(26) + 'a'));
            }
            terms.add(sb.toString());
        }
        stringFields.put("sf" + j, terms);
    }
    return stringFields;
}
/**
 * Document stand-in whose int and string field values are both held as strings.
 * {@link #convert()} turns it into a {@code FlamdexDocument}, parsing the int field values
 * back into longs.
 */
public class MockDoc {
    /** Int field name -> term values, each encoded as a decimal string. */
    public Map<String, List<String>> intFields;
    /** String field name -> term values. */
    public Map<String, List<String>> stringFields;

    /** Leaves both field maps unset; callers must assign them before calling {@link #convert()}. */
    public MockDoc() {}

    public MockDoc(Map<String, List<String>> intFields, Map<String, List<String>> stringFields) {
        this.intFields = intFields;
        this.stringFields = stringFields;
    }

    /**
     * Builds a {@code FlamdexDocument} from this mock's fields.
     *
     * @return a new document carrying every int field (values parsed with
     *         {@link Long#parseLong(String)}) and every string field
     */
    public FlamdexDocument convert() {
        // One parser instance serves every int field; Lists.transform applies it lazily.
        final Function<String, Long> toLong = new Function<String, Long>() {
            @Nullable
            public Long apply(@Nullable final String input) {
                return Long.parseLong(input);
            }
        };

        final FlamdexDocument result = new FlamdexDocument();
        for (final Map.Entry<String, List<String>> intField : intFields.entrySet()) {
            final List<Long> parsed = Lists.transform(intField.getValue(), toLong);
            result.setIntField(intField.getKey(), parsed);
        }
        for (final Map.Entry<String, List<String>> stringField : stringFields.entrySet()) {
            result.setStringField(stringField.getKey(), stringField.getValue());
        }
        return result;
    }
}
/**
 * Indexes a single document with int field "clicked" = 1 and checks that the field iterates
 * as exactly one term (1) with doc frequency 1, both on the in-memory index and on a copy
 * that went through {@code write}/{@code readFields}.
 */
@Test
public void test2() throws IOException {
    final MemoryFlamdex source = new MemoryFlamdex();
    final FlamdexDocument clickedDoc = new FlamdexDocument();
    clickedDoc.setIntField("clicked", 1);
    source.addDocument(clickedDoc);
    source.close();
    assertSingleClickedTerm(source.getIntTermIterator("clicked"));

    final ByteArrayDataOutput serialized = ByteStreams.newDataOutput();
    source.write(serialized);
    final MemoryFlamdex restored = new MemoryFlamdex();
    restored.readFields(ByteStreams.newDataInput(serialized.toByteArray()));
    assertSingleClickedTerm(restored.getIntTermIterator("clicked"));
}

/** Asserts the iterator yields exactly one term, 1, with a doc frequency of 1. */
private static void assertSingleClickedTerm(final IntTermIterator terms) {
    assertTrue(terms.next());
    assertEquals(1, terms.term());
    assertEquals(1, terms.docFreq());
    assertFalse(terms.next());
}
}