package com.yahoo.glimmer.indexing.generator;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import it.unimi.dsi.fastutil.chars.CharArraySet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.io.DelimitedWordReader;
import java.io.IOException;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.hamcrest.BaseMatcher;
import org.hamcrest.Description;
import org.jmock.Expectations;
import org.jmock.Mockery;
import org.jmock.lib.legacy.ClassImposteriser;
import org.junit.Before;
import org.junit.Test;
import com.yahoo.glimmer.indexing.RDFDocument;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.IndexType;
import com.yahoo.glimmer.indexing.generator.TermValue.Type;
/**
 * Unit tests for {@link DocumentMapper}.
 *
 * <p>Every test follows the same recipe: configure the mapper through a Hadoop
 * {@link Configuration} (the {@code IndexType} and {@code RdfFieldNames} keys), call
 * {@code setup}, swap the mapper's document for a jMock mock, run a single {@code map}
 * call and verify the key/value pairs written to the mocked mapper context.
 */
public class DocumentMapperTest {
    /** Word delimiter handed to every {@link DelimitedWordReader} that serves mocked field content. */
    private static final CharSet DELIMITER = new CharArraySet(Collections.singleton(' '));
    /** The record passed to {@code map}; the mocked document's {@code setContent} is expected to receive its bytes and length. */
    private static final Text DOC_TEXT = new Text("TheRecordAsText");

    private Mockery context;
    private Mapper<LongWritable, Text, TermKey, TermValue>.Context mapperContext;
    private Configuration mapperConf;
    private RDFDocument doc;
    private Counters counters;

    /**
     * Creates a fresh mockery, mocks and counters before each test.
     */
    // Context.class is a raw type, so assigning the mock to the parameterised field is an unchecked conversion.
    @SuppressWarnings("unchecked")
    @Before
    public void before() {
        context = new Mockery();
        // The class imposteriser lets jMock mock classes as well as interfaces.
        context.setImposteriser(ClassImposteriser.INSTANCE);

        mapperContext = context.mock(Context.class, "mapperContext");
        mapperConf = new Configuration();
        doc = context.mock(RDFDocument.class, "doc");
        // Real counters, so the tests can assert on the values the mapper leaves in them.
        counters = new Counters();
    }

    /**
     * Maps a document whose only field has no content. No {@code write} expectation is
     * declared on the mocked mapper context.
     */
    @Test
    public void emptyDocTest() throws IOException, InterruptedException {
        mapperConf.setEnum("IndexType", IndexType.HORIZONTAL);
        mapperConf.setStrings("RdfFieldNames", "fieldZero");

        context.checking(new Expectations(){{
            allowing(mapperContext).getConfiguration();
            will(returnValue(mapperConf));

            one(doc).getIndexType();
            will(returnValue(IndexType.HORIZONTAL));
            one(doc).setContent(with(DOC_TEXT.getBytes()), with(DOC_TEXT.getLength()));
            allowing(doc).getSubject();
            will(returnValue("http://subject/"));
            allowing(doc).getId();
            will(returnValue(5L));

            one(mapperContext).getCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS);
            will(returnValue(counters.findCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS)));

            allowing(doc).content(0);
            will(returnValue(new DelimitedWordReader("".toCharArray(), DELIMITER)));
        }});

        DocumentMapper mapper = new DocumentMapper();
        mapper.setup(mapperContext);

        assertArrayEquals(new String[]{"fieldZero"}, mapper.getFields());
        assertEquals(IndexType.HORIZONTAL, mapper.getDoc().getIndexType());

        // Replace the document created by setup() with the mock one.
        mapper.setDoc(doc);

        mapper.map(null, DOC_TEXT, mapperContext);

        context.assertIsSatisfied();
    }

    /**
     * Maps one document with five fields under {@link IndexType#HORIZONTAL}. For each
     * field the test expects one OCCURRENCE write per token, one DOC_SIZE write and one
     * TERM_STATS write per distinct term.
     *
     * <p>The method name keeps its original spelling so that existing references to the
     * test (reports, filters) keep working.
     */
    @Test
    public void horiztalTest() throws IOException, InterruptedException {
        mapperConf.setEnum("IndexType", IndexType.HORIZONTAL);
        mapperConf.setStrings("RdfFieldNames", "subject", "subjectText", "object", "predicate", "context");

        context.checking(new Expectations(){{
            allowing(mapperContext).getConfiguration();
            will(returnValue(mapperConf));

            one(doc).setContent(with(DOC_TEXT.getBytes()), with(DOC_TEXT.getLength()));

            allowing(mapperContext).setStatus(with(any(String.class)));

            allowing(mapperContext).getCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS);
            will(returnValue(counters.findCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS)));
            allowing(mapperContext).getCounter(DocumentMapper.Counters.INDEXED_OCCURRENCES);
            will(returnValue(counters.findCounter(DocumentMapper.Counters.INDEXED_OCCURRENCES)));

            allowing(doc).getSubject();
            will(returnValue("http://subject/"));
            allowing(doc).getId();
            will(returnValue(10L));
            allowing(doc).getIndexType();
            will(returnValue(IndexType.HORIZONTAL));

            // Field 0: three distinct terms.
            allowing(doc).content(0);
            will(returnValue(new DelimitedWordReader("subject field value".toCharArray(), DELIMITER)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(0, "subject", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "field", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "value", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));
            // The doc size.
            one(mapperContext).write(with(new TermKeyMatcher(0, "", Type.DOC_SIZE, 10, 3)), with(new TermValueMatcher(Type.DOC_SIZE, 10, 3)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(0, "subject", Type.TERM_STATS, 1, 0)), with(new TermValueMatcher(Type.TERM_STATS, 1, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "field", Type.TERM_STATS, 1, 1)), with(new TermValueMatcher(Type.TERM_STATS, 1, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "value", Type.TERM_STATS, 1, 2)), with(new TermValueMatcher(Type.TERM_STATS, 1, 2)));

            // Field 1: "value" occurs twice, so its stats are count 2, last position 3.
            allowing(doc).content(1);
            will(returnValue(new DelimitedWordReader("subjectText field value value".toCharArray(), DELIMITER)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(1, "subjectText", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "field", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "value", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "value", Type.OCCURRENCE, 10, 3)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 3)));
            // The doc size.
            one(mapperContext).write(with(new TermKeyMatcher(1, "", Type.DOC_SIZE, 10, 4)), with(new TermValueMatcher(Type.DOC_SIZE, 10, 4)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(1, "subjectText", Type.TERM_STATS, 1, 0)), with(new TermValueMatcher(Type.TERM_STATS, 1, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "field", Type.TERM_STATS, 1, 1)), with(new TermValueMatcher(Type.TERM_STATS, 1, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "value", Type.TERM_STATS, 2, 3)), with(new TermValueMatcher(Type.TERM_STATS, 2, 3)));

            // Field 2: three distinct terms.
            allowing(doc).content(2);
            will(returnValue(new DelimitedWordReader("o1 o2 o3".toCharArray(), DELIMITER)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(2, "o1", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "o2", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "o3", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));
            // The doc size.
            one(mapperContext).write(with(new TermKeyMatcher(2, "", Type.DOC_SIZE, 10, 3)), with(new TermValueMatcher(Type.DOC_SIZE, 10, 3)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(2, "o1", Type.TERM_STATS, 1, 0)), with(new TermValueMatcher(Type.TERM_STATS, 1, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "o2", Type.TERM_STATS, 1, 1)), with(new TermValueMatcher(Type.TERM_STATS, 1, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "o3", Type.TERM_STATS, 1, 2)), with(new TermValueMatcher(Type.TERM_STATS, 1, 2)));

            // Field 3: "p2" occurs twice.
            allowing(doc).content(3);
            will(returnValue(new DelimitedWordReader("p1 p2 p2".toCharArray(), DELIMITER)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(3, "p1", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(3, "p2", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(3, "p2", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));
            // The doc size.
            one(mapperContext).write(with(new TermKeyMatcher(3, "", Type.DOC_SIZE, 10, 3)), with(new TermValueMatcher(Type.DOC_SIZE, 10, 3)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(3, "p1", Type.TERM_STATS, 1, 0)), with(new TermValueMatcher(Type.TERM_STATS, 1, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(3, "p2", Type.TERM_STATS, 2, 2)), with(new TermValueMatcher(Type.TERM_STATS, 2, 2)));

            // Field 4: a single term repeated three times.
            allowing(doc).content(4);
            will(returnValue(new DelimitedWordReader("c1 c1 c1".toCharArray(), DELIMITER)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(4, "c1", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(4, "c1", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(4, "c1", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));
            // The doc size.
            one(mapperContext).write(with(new TermKeyMatcher(4, "", Type.DOC_SIZE, 10, 3)), with(new TermValueMatcher(Type.DOC_SIZE, 10, 3)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(4, "c1", Type.TERM_STATS, 3, 2)), with(new TermValueMatcher(Type.TERM_STATS, 3, 2)));
        }});

        DocumentMapper mapper = new DocumentMapper();
        mapper.setup(mapperContext);

        assertArrayEquals(new String[]{"subject", "subjectText", "object", "predicate", "context"}, mapper.getFields());
        assertEquals(IndexType.HORIZONTAL, mapper.getDoc().getIndexType());

        // Replace the document created by setup() with the mock one.
        mapper.setDoc(doc);

        mapper.map(null, DOC_TEXT, mapperContext);

        context.assertIsSatisfied();

        assertEquals(1L, counters.findCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS).getValue());
        // 16 equals the total number of tokens served above: 3 + 4 + 3 + 3 + 3.
        assertEquals(16L, counters.findCounter(DocumentMapper.Counters.INDEXED_OCCURRENCES).getValue());
    }

    /**
     * Maps one document with three fields under {@link IndexType#VERTICAL}. For each
     * field the test expects TERM_STATS and OCCURRENCE writes (no DOC_SIZE write is
     * declared), plus one INDEX_ID write under {@code DocumentMapper.ALIGNMENT_INDEX}
     * per distinct term of each field.
     *
     * <p>The method name keeps its original spelling so that existing references to the
     * test (reports, filters) keep working.
     */
    @Test
    public void vertialTest() throws IOException, InterruptedException {
        mapperConf.setEnum("IndexType", IndexType.VERTICAL);
        mapperConf.setStrings("RdfFieldNames", "fieldZero", "fieldOne", "fieldTwo");

        context.checking(new Expectations(){{
            allowing(mapperContext).getConfiguration();
            will(returnValue(mapperConf));

            one(doc).setContent(with(DOC_TEXT.getBytes()), with(DOC_TEXT.getLength()));

            allowing(mapperContext).setStatus(with(any(String.class)));

            allowing(mapperContext).getCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS);
            will(returnValue(counters.findCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS)));
            allowing(mapperContext).getCounter(DocumentMapper.Counters.INDEXED_OCCURRENCES);
            will(returnValue(counters.findCounter(DocumentMapper.Counters.INDEXED_OCCURRENCES)));

            allowing(doc).getSubject();
            will(returnValue("http://subject/"));
            allowing(doc).getId();
            will(returnValue(10L));
            allowing(doc).getIndexType();
            will(returnValue(IndexType.VERTICAL));

            // Field 0: three distinct terms.
            allowing(doc).content(0);
            will(returnValue(new DelimitedWordReader("a literal b".toCharArray(), DELIMITER)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(0, "a", Type.TERM_STATS, 1, 0)), with(new TermValueMatcher(Type.TERM_STATS, 1, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "literal", Type.TERM_STATS, 1, 1)), with(new TermValueMatcher(Type.TERM_STATS, 1, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "b", Type.TERM_STATS, 1, 2)), with(new TermValueMatcher(Type.TERM_STATS, 1, 2)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(0, "a", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "literal", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(0, "b", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));

            // Field 1: "X" occurs at positions 0 and 2.
            allowing(doc).content(1);
            will(returnValue(new DelimitedWordReader("X Y X".toCharArray(), DELIMITER)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(1, "X", Type.TERM_STATS, 2, 2)), with(new TermValueMatcher(Type.TERM_STATS, 2, 2)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "Y", Type.TERM_STATS, 1, 1)), with(new TermValueMatcher(Type.TERM_STATS, 1, 1)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(1, "X", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "Y", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(1, "X", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));

            // Field 2: "Y" twice, then "Z" three times.
            allowing(doc).content(2);
            will(returnValue(new DelimitedWordReader("Y Y Z Z Z".toCharArray(), DELIMITER)));
            // Occurrence counts and last positions.
            one(mapperContext).write(with(new TermKeyMatcher(2, "Y", Type.TERM_STATS, 2, 1)), with(new TermValueMatcher(Type.TERM_STATS, 2, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "Z", Type.TERM_STATS, 3, 4)), with(new TermValueMatcher(Type.TERM_STATS, 3, 4)));
            // The occurrences
            one(mapperContext).write(with(new TermKeyMatcher(2, "Y", Type.OCCURRENCE, 10, 0)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "Y", Type.OCCURRENCE, 10, 1)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "Z", Type.OCCURRENCE, 10, 2)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 2)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "Z", Type.OCCURRENCE, 10, 3)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 3)));
            one(mapperContext).write(with(new TermKeyMatcher(2, "Z", Type.OCCURRENCE, 10, 4)), with(new TermValueMatcher(Type.OCCURRENCE, 10, 4)));

            // The ALIGNMENT_INDEX is created for vertical indexes only. It is just a map between terms and the index they occur in.
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "a", Type.INDEX_ID, 0)), with(new TermValueMatcher(Type.INDEX_ID, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "literal", Type.INDEX_ID, 0)), with(new TermValueMatcher(Type.INDEX_ID, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "b", Type.INDEX_ID, 0)), with(new TermValueMatcher(Type.INDEX_ID, 0)));
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "X", Type.INDEX_ID, 1)), with(new TermValueMatcher(Type.INDEX_ID, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "Y", Type.INDEX_ID, 1)), with(new TermValueMatcher(Type.INDEX_ID, 1)));
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "Y", Type.INDEX_ID, 2)), with(new TermValueMatcher(Type.INDEX_ID, 2)));
            one(mapperContext).write(with(new TermKeyMatcher(DocumentMapper.ALIGNMENT_INDEX, "Z", Type.INDEX_ID, 2)), with(new TermValueMatcher(Type.INDEX_ID, 2)));
        }});

        DocumentMapper mapper = new DocumentMapper();
        mapper.setup(mapperContext);

        assertArrayEquals(new String[]{"fieldZero", "fieldOne", "fieldTwo"}, mapper.getFields());
        assertEquals(IndexType.VERTICAL, mapper.getDoc().getIndexType());

        // Replace the document created by setup() with the mock one.
        mapper.setDoc(doc);

        mapper.map(null, DOC_TEXT, mapperContext);

        context.assertIsSatisfied();

        assertEquals(1L, counters.findCounter(DocumentMapper.Counters.NUMBER_OF_RECORDS).getValue());
        // 11 equals the total number of tokens served above: 3 + 3 + 5.
        assertEquals(11L, counters.findCounter(DocumentMapper.Counters.INDEXED_OCCURRENCES).getValue());
    }

    /**
     * Hamcrest matcher that accepts an argument when the expected {@link TermValue}
     * reports it as equal via {@code equals}.
     */
    private static class TermValueMatcher extends BaseMatcher<TermValue> {
        /** The value the matched argument is compared against; set once at construction. */
        private final TermValue expected;

        /**
         * @param type the expected term value type
         * @param v1 the single value passed to the {@link TermValue} constructor
         */
        public TermValueMatcher(Type type, int v1) {
            expected = new TermValue(type, v1);
        }

        /**
         * @param type the expected term value type
         * @param v1 the first value passed to the {@link TermValue} constructor
         * @param v2 the second value passed to the {@link TermValue} constructor
         */
        public TermValueMatcher(Type type, int v1, int v2) {
            expected = new TermValue(type, v1, v2);
        }

        @Override
        public boolean matches(Object object) {
            return expected.equals(object);
        }

        @Override
        public void describeTo(Description description) {
            description.appendText(expected.toString());
        }
    }
}