package com.yahoo.glimmer.indexing.preprocessor;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.jmock.Expectations;
import org.jmock.Mockery;
import org.jmock.lib.legacy.ClassImposteriser;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import com.yahoo.glimmer.indexing.preprocessor.TuplesToResourcesMapper;
import com.yahoo.glimmer.indexing.preprocessor.TuplesToResourcesMapper.Counters;
/**
 * Tests for {@link TuplesToResourcesMapper}.
 * <p>
 * Every test feeds one or more tuple lines to {@code map()} and verifies, through a jMock-mocked
 * Hadoop mapper context, the exact key/value pairs that are written. In the expectations below the
 * predicate is written with the value {@code PREDICATE}, a resource or blank node object with
 * {@code OBJECT}, a context with {@code CONTEXT}, and the subject with the rest of the tuple.
 */
public class TuplesToResourcesMapperTest {
    private Mockery context;
    private Mapper<LongWritable, Text, Text, Object>.Context mrContext;
    private Counter nxParserExceptionCounter;
    private InputSplit inputSplit;

    /**
     * Creates a fresh mockery, the mocked mapper context and counter, and the input split that the
     * mocked context hands out.
     */
    @SuppressWarnings("unchecked")
    @Before
    public void before() {
        context = new Mockery();
        // Mapper.Context and Counter are mocked as classes, hence the class imposteriser.
        context.setImposteriser(ClassImposteriser.INSTANCE);
        mrContext = context.mock(Mapper.Context.class, "mrContext");
        nxParserExceptionCounter = context.mock(Counter.class, "nxParserExceptionCounter");
        inputSplit = new FileSplit(new Path("split1"), 5, 1000, new String[] { "host1" });
    }

    /**
     * Registers the expectations common to the write-only tests: any number of
     * {@code getInputSplit()} calls returning the test split, followed by exactly one
     * {@code write(key, value)} per given pair, in the given order.
     *
     * @param keyValuePairs the expected writes, each element being {@code { key, value }}
     */
    private void expectWrites(final String[][] keyValuePairs) throws IOException, InterruptedException {
        context.checking(new Expectations() {
            {
                allowing(mrContext).getInputSplit();
                will(returnValue(inputSplit));
                for (String[] pair : keyValuePairs) {
                    one(mrContext).write(with(new TextMatcher(pair[0])), with(new TextMatcher(pair[1])));
                }
            }
        });
    }

    /**
     * Passes each tuple line to the mapper with consecutive keys starting at {@code firstKey}.
     *
     * @param mapper the mapper under test
     * @param firstKey the key used for the first tuple; following tuples use firstKey + 1, + 2, ...
     * @param tuples the input lines, in the order they are mapped
     */
    private void mapTuples(TuplesToResourcesMapper mapper, long firstKey, String... tuples) throws IOException, InterruptedException {
        long key = firstKey;
        for (String tuple : tuples) {
            mapper.map(new LongWritable(key), new Text(tuple), mrContext);
            key++;
        }
    }

    /** Literal objects (plain, language tagged, and with an empty language tag). */
    @Test
    public void literalObjectText() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                { "http://www.example.org/terms/name", "PREDICATE" },
                { "http://www.example.org/staffid/85740", "<http://www.example.org/terms/name> \"Smith\" ." },
                { "http://www.example.org/terms/name", "PREDICATE" },
                { "http://www.example.org/staffid/85741", "<http://www.example.org/terms/name> \"\n Johnson \t\"@en ." },
                { "http://www.example.org/terms/name", "PREDICATE" },
                { "http://www.example.org/staffid/85742", "<http://www.example.org/terms/name> \"Ray\"@ ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        mapTuples(mapper, 5L,
                "<http://www.example.org/staffid/85740> <http://www.example.org/terms/name> \"Smith\" .",
                "<http://www.example.org/staffid/85741> <http://www.example.org/terms/name> \"\n Johnson \t\"@en .",
                "<http://www.example.org/staffid/85742> <http://www.example.org/terms/name> \"Ray\"@ .");

        context.assertIsSatisfied();
    }

    /** A tuple with a resource object and a context. */
    @Test
    public void resourceObjectTest() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                { "http://purl.org/dc/elements/1.1/creator", "PREDICATE" },
                { "http://www.example.org/staffid/85740", "OBJECT" },
                { "http://context/", "CONTEXT" },
                { "http://www.example.org/index.html", "<http://purl.org/dc/elements/1.1/creator> <http://www.example.org/staffid/85740> <http://context/> ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        mapTuples(mapper, 5L,
                "<http://www.example.org/index.html> <http://purl.org/dc/elements/1.1/creator> <http://www.example.org/staffid/85740> <http://context/> .");

        context.assertIsSatisfied();
    }

    /** With contexts switched off, the context is neither written on its own nor kept in the subject's value. */
    @Test
    public void noContextsObjectTest() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                { "http://purl.org/dc/elements/1.1/creator", "PREDICATE" },
                { "http://www.example.org/staffid/85740", "OBJECT" },
                { "http://www.example.org/index.html", "<http://purl.org/dc/elements/1.1/creator> <http://www.example.org/staffid/85740> ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        mapper.setIncludeContexts(false);
        mapTuples(mapper, 5L,
                "<http://www.example.org/index.html> <http://purl.org/dc/elements/1.1/creator> <http://www.example.org/staffid/85740> <http://context/> .");

        context.assertIsSatisfied();
    }

    /*
     * NxParser 1.2.2 fails on typed literals. The map method is expected to count the parser
     * exception, strip the type and try again. Ignored; see qualifiedIntNxp123Test for the
     * expectations that are currently exercised.
     */
    @Ignore
    @Test
    public void qualifiedIntNxp122Test() throws IOException, InterruptedException {
        context.checking(new Expectations() {
            {
                allowing(mrContext).getInputSplit();
                will(returnValue(inputSplit));

                // The parse failure is recorded once on the NX_PARSER_EXCEPTION counter.
                one(mrContext).getCounter(Counters.NX_PARSER_EXCEPTION);
                will(returnValue(nxParserExceptionCounter));
                one(nxParserExceptionCounter).increment(1L);

                one(mrContext).write(with(new TextMatcher("http://www.example.org/terms/age")), with(new TextMatcher("PREDICATE")));
                one(mrContext).write(with(new TextMatcher("http://www.example.org/staffid/85740")), with(new TextMatcher("<http://www.example.org/terms/age> \"27\" .")));
            }
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        mapTuples(mapper, 5L,
                "<http://www.example.org/staffid/85740> <http://www.example.org/terms/age> \"27\"^^<http://www.w3.org/2001/XMLSchema#integer> .");

        context.assertIsSatisfied();
    }

    /** A typed literal is expected to be written with its datatype intact. */
    @Test
    public void qualifiedIntNxp123Test() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                { "http://www.example.org/terms/age", "PREDICATE" },
                { "http://www.example.org/staffid/85740", "<http://www.example.org/terms/age> \"27\"^^<http://www.w3.org/2001/XMLSchema#integer> ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        mapTuples(mapper, 5L,
                "<http://www.example.org/staffid/85740> <http://www.example.org/terms/age> \"27\"^^<http://www.w3.org/2001/XMLSchema#integer> .");

        context.assertIsSatisfied();
    }

    /** Blank nodes as subject and object; the expected keys carry no "_:" prefix. */
    @Test
    public void bNodeTest() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                { "http://www.example.org/terms/place", "PREDICATE" },
                { "NodeABC", "OBJECT" },
                { "nodeXYZ", "<http://www.example.org/terms/place> _:NodeABC ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        mapTuples(mapper, 5L,
                "_:nodeXYZ <http://www.example.org/terms/place> _:NodeABC .");

        context.assertIsSatisfied();
    }

    /** Subject and object regexes combined with OR: the 2nd, 3rd and 5th tuples should pass. */
    @Test
    public void filterSubjectOrObjectTest() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                // 2nd tuple
                { "http://p2", "PREDICATE" },
                { "http://s3", "OBJECT" },
                { "http://context/", "CONTEXT" },
                { "http://s1", "<http://p2> <http://s3> <http://context/> ." },
                // 3rd tuple
                { "http://p3", "PREDICATE" },
                { "http://s3", "OBJECT" },
                { "http://context/", "CONTEXT" },
                { "http://s2", "<http://p3> <http://s3> <http://context/> ." },
                // 5th tuple
                { "http://p5", "PREDICATE" },
                { "http://context/", "CONTEXT" },
                { "http://s3", "<http://p5> \"o5\" <http://context/> ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        RegexTupleFilter orFilter = new RegexTupleFilter();
        orFilter.setSubjectRegex("^<http://s3");
        orFilter.setObjectRegex("s3>$");
        // false selects OR rather than AND.
        orFilter.setAndNotOrConjunction(false);
        mapper.setFilter(orFilter);

        mapTuples(mapper, 15L,
                "<http://s1> <http://p1> <http://s2> <http://context/> .",
                "<http://s1> <http://p2> <http://s3> <http://context/> .",
                "<http://s2> <http://p3> <http://s3> <http://context/> .",
                "<http://s2> <http://p4> \"o4\" <http://context/> .",
                "<http://s3> <http://p5> \"o5\" <http://context/> .");

        context.assertIsSatisfied();
    }

    /** Subject and object regexes combined with AND: only the 2nd and 5th tuples should pass. */
    @Test
    public void filterSubjectAndObjectTest() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                // 2nd tuple
                { "http://p2", "PREDICATE" },
                { "http://s3", "OBJECT" },
                { "http://context/", "CONTEXT" },
                { "http://s1", "<http://p2> <http://s3> <http://context/> ." },
                // 5th tuple
                { "http://p5", "PREDICATE" },
                { "http://context/", "CONTEXT" },
                { "http://s3", "<http://p5> \"o5\" <http://context/> ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        RegexTupleFilter andFilter = new RegexTupleFilter();
        andFilter.setSubjectRegex("s1|s3");
        andFilter.setObjectRegex("(s3|o5)");
        // true selects AND rather than OR.
        andFilter.setAndNotOrConjunction(true);
        mapper.setFilter(andFilter);

        mapTuples(mapper, 25L,
                "<http://s1> <http://p1> <http://s2> <http://context/> .",
                "<http://s1> <http://p2> <http://s3> <http://context/> .",
                "<http://s2> <http://p3> <http://s3> <http://context/> .",
                "<http://s2> <http://p4> \"o4\" <http://context/> .",
                "<http://s3> <http://p5> \"o5\" <http://context/> .");

        context.assertIsSatisfied();
    }

    /** Predicate regex only: the 1st, 2nd and 4th tuples (the schema.org predicates) should pass. */
    @Test
    public void filterPredicateTest() throws IOException, InterruptedException {
        expectWrites(new String[][] {
                // 1st tuple
                { "http://schema.org/p1", "PREDICATE" },
                { "http://context/1", "CONTEXT" },
                { "http://s1", "<http://schema.org/p1> \"o1\" <http://context/1> ." },
                // 2nd tuple
                { "http://schema.org/p2", "PREDICATE" },
                { "http://context/1", "CONTEXT" },
                { "http://s2", "<http://schema.org/p2> \"o2\" <http://context/1> ." },
                // 4th tuple
                { "http://schema.org/p4", "PREDICATE" },
                { "http://o4", "OBJECT" },
                { "http://context/2", "CONTEXT" },
                { "http://s3", "<http://schema.org/p4> <http://o4> <http://context/2> ." },
        });

        TuplesToResourcesMapper mapper = new TuplesToResourcesMapper();
        RegexTupleFilter predicateFilter = new RegexTupleFilter();
        predicateFilter.setPredicateRegex("schema\\.org");
        mapper.setFilter(predicateFilter);

        mapTuples(mapper, 5L,
                "<http://s1> <http://schema.org/p1> \"o1\" <http://context/1> .",
                "<http://s2> <http://schema.org/p2> \"o2\" <http://context/1> .",
                "<http://s2> <http://nothing.org/p3> \"o3\" <http://context/1> .",
                "<http://s3> <http://schema.org/p4> <http://o4> <http://context/2> .",
                "<http://s3> <http://nothing.org/p5> <http://o5> <http://context/2> .");

        context.assertIsSatisfied();
    }
}