/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.flamdex.simple;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import com.indeed.util.io.Files;
import com.indeed.flamdex.api.DocIdStream;
import com.indeed.flamdex.api.FlamdexReader;
import com.indeed.flamdex.api.IntTermIterator;
import com.indeed.flamdex.utils.FlamdexReinverter;
import com.indeed.flamdex.writer.FlamdexDocWriter;
import com.indeed.flamdex.writer.FlamdexDocument;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Layout;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.apache.log4j.varia.LevelRangeFilter;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
/**
* @author jsgroth
*/
public class TestSimpleFlamdexDocWriter {
private String tempDir;
@BeforeClass
public static void initLog4j() {
BasicConfigurator.resetConfiguration();
BasicConfigurator.configure();
final Layout LAYOUT = new PatternLayout("[ %d{ISO8601} %-5p ] [%c{1}] %m%n");
LevelRangeFilter ERROR_FILTER = new LevelRangeFilter();
ERROR_FILTER.setLevelMin(Level.ERROR);
ERROR_FILTER.setLevelMax(Level.FATAL);
// everything including ERROR
final Appender STDOUT = new ConsoleAppender(LAYOUT, ConsoleAppender.SYSTEM_OUT);
// just things <= ERROR
final Appender STDERR = new ConsoleAppender(LAYOUT, ConsoleAppender.SYSTEM_ERR);
STDERR.addFilter(ERROR_FILTER);
final Logger ROOT_LOGGER = Logger.getRootLogger();
ROOT_LOGGER.removeAllAppenders();
ROOT_LOGGER.setLevel(Level.WARN); // don't care about higher
ROOT_LOGGER.addAppender(STDOUT);
ROOT_LOGGER.addAppender(STDERR);
}
@Before
public void setUp() throws Exception {
tempDir = Files.getTempDirectory("flamdex-test", "dir");
}
@After
public void tearDown() throws Exception {
Files.delete(tempDir);
}
@Test
public void bigRandomTest() throws IOException {
runRandomTest(150);
runRandomTest(10);
runRandomTest(3);
}
private void runRandomTest(int mergeFactor) throws IOException {
long elapsed = -System.currentTimeMillis();
final FlamdexDocWriter w = new SimpleFlamdexDocWriter(tempDir, new SimpleFlamdexDocWriter.Config().setDocBufferSize(100).setMergeFactor(mergeFactor));
final Random rand = new Random();
final int numDocs = rand.nextInt(20000) + 20000;
final List<FlamdexDocument> expected = Lists.newArrayList();
for (int i = 0; i < numDocs; ++i) {
final FlamdexDocument doc = new FlamdexDocument();
final int nif = rand.nextInt(5) + 5;
for (int j = 0; j < nif; ++j) {
final int nt = rand.nextInt(5) + 5;
for (int k = 0; k < nt; ++k) {
doc.addIntTerm("if" + j, rand.nextInt() & Integer.MAX_VALUE);
}
}
final int nsf = rand.nextInt(5) + 5;
for (int j = 0; j < nsf; ++j) {
final int nt = rand.nextInt(3) + 1;
for (int k = 0; k < nt; ++k) {
final int nc = rand.nextInt(20) + 1;
final StringBuilder sb = new StringBuilder(nc);
for (int l = 0; l < nc; ++l) {
sb.append((char)(rand.nextInt('z' - 'a') + 'a'));
}
doc.addStringTerm("sf" + j, sb.toString());
}
}
w.addDocument(doc);
expected.add(doc);
}
w.close();
final long size = FileUtils.sizeOfDirectory(new File(tempDir));
elapsed += System.currentTimeMillis();
System.out.println("time for writing " + size + " byte index with " + numDocs + " documents: " + elapsed + " ms");
final SimpleFlamdexReader r = SimpleFlamdexReader.open(tempDir);
final List<FlamdexDocument> actual = FlamdexReinverter.reinvertInMemory(r);
r.close();
assertEquals(expected.size(), actual.size());
for (int i = 0; i < expected.size(); ++i) {
final FlamdexDocument ed = expected.get(i);
final FlamdexDocument ad = actual.get(i);
assertTrue(unorderedEquals(ed.getIntFields(), ad.getIntFields()));
assertTrue(unorderedEquals(ed.getStringFields(), ad.getStringFields()));
}
}
private static <T> boolean unorderedEquals(Map<String, ? extends List<T>> o1, Map<String, ? extends List<T>> o2) {
if (!o1.keySet().equals(o2.keySet())) return false;
for (String s : o1.keySet()) {
Set<T> s1 = Sets.newHashSet(o1.get(s));
Set<T> s2 = Sets.newHashSet(o2.get(s));
if (!s1.equals(s2)) return false;
}
return true;
}
@Test
public void testBufferedOnly() throws IOException {
final SimpleFlamdexDocWriter.Config config = new SimpleFlamdexDocWriter.Config().setDocBufferSize(999999999).setMergeFactor(999999999);
writeFlamdex(tempDir, config);
final FlamdexReader r = SimpleFlamdexReader.open(tempDir);
assertTrue(r.getIntFields().size() == 2);
assertTrue(r.getIntFields().contains("if1"));
assertTrue(r.getIntFields().contains("if2"));
assertTrue(r.getStringFields().size() == 2);
assertTrue(r.getStringFields().contains("sf1"));
assertTrue(r.getStringFields().contains("sf2"));
DocIdStream dis = r.getDocIdStream();
int[] docIdBuffer = new int[64];
IntTermIterator iter = r.getIntTermIterator("if1");
assertTrue(iter.next());
assertEquals(0, iter.term());
assertEquals(2, iter.docFreq());
dis.reset(iter);
assertEquals(2, dis.fillDocIdBuffer(docIdBuffer));
assertEquals(Ints.asList(0, 3), Ints.asList(docIdBuffer).subList(0, 2));
assertTrue(iter.next());
assertEquals(5, iter.term());
assertEquals(1, iter.docFreq());
dis.reset(iter);
assertEquals(1, dis.fillDocIdBuffer(docIdBuffer));
assertEquals(Ints.asList(0), Ints.asList(docIdBuffer).subList(0, 1));
iter.close();
}
@Test
public void testEmpty() throws IOException {
new SimpleFlamdexDocWriter(tempDir, new SimpleFlamdexDocWriter.Config()).close();
FlamdexReader r = SimpleFlamdexReader.open(tempDir);
assertEquals(0, r.getNumDocs());
}
private void writeFlamdex(String dir, SimpleFlamdexDocWriter.Config config) throws IOException {
final FlamdexDocWriter w = new SimpleFlamdexDocWriter(dir, config);
final FlamdexDocument doc0 = new FlamdexDocument();
doc0.setIntField("if1", Longs.asList(0, 5, 99));
doc0.setIntField("if2", Longs.asList(3, 7));
doc0.setStringField("sf1", Arrays.asList("a", "b", "c"));
doc0.setStringField("sf2", Arrays.asList("0", "-234", "bob"));
w.addDocument(doc0);
final FlamdexDocument doc1 = new FlamdexDocument();
doc1.setIntField("if2", Longs.asList(6, 7, 99));
doc1.setStringField("sf1", Arrays.asList("b", "d", "f"));
doc1.setStringField("sf2", Arrays.asList("a", "b", "bob"));
w.addDocument(doc1);
final FlamdexDocument doc2 = new FlamdexDocument();
doc2.setStringField("sf1", Arrays.asList("", "a", "aa"));
w.addDocument(doc2);
final FlamdexDocument doc3 = new FlamdexDocument();
doc3.setIntField("if1", Longs.asList(0, 10000));
doc3.setIntField("if2", Longs.asList(9));
w.addDocument(doc3);
w.close();
}
}