package com.yahoo.glimmer.indexing.preprocessor;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 * See accompanying LICENSE file.
 */

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Random;

import org.apache.commons.codec.binary.Hex;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.hamcrest.BaseMatcher;
import org.hamcrest.Description;
import org.jmock.Expectations;
import org.jmock.Mockery;
import org.jmock.lib.legacy.ClassImposteriser;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.OUTPUT;
import com.yahoo.glimmer.indexing.preprocessor.ResourceRecordWriter.OutputCount;
import com.yahoo.glimmer.util.BlockCompressedDocumentCollection;
import com.yahoo.glimmer.util.BySubjectRecord;
import com.yahoo.glimmer.util.BySubjectRecord.BySubjectRecordException;

/**
 * Tests for {@link ResourceRecordWriter} covering the per-type output files and
 * the block compressed bySubject collection it writes.
 */
public class ResourceRecordWriterTest {
    private Mockery context;
    private Expectations e;
    private FileSystem fs;
    private FSDataOutputStream allOs;
    private FSDataOutputStream subjectOs;
    private FSDataOutputStream predicateOs;
    private FSDataOutputStream objectOs;
    private FSDataOutputStream contextOs;

    @Rule
    public TemporaryFolder tempFolder = new TemporaryFolder();
    private Path tempDirPath;

    @Before
    public void before() throws IOException {
        tempDirPath = new Path(tempFolder.getRoot().getCanonicalPath());

        context = new Mockery();
        context.setImposteriser(ClassImposteriser.INSTANCE);
        fs = context.mock(FileSystem.class);
        allOs = context.mock(FSDataOutputStream.class, "allOs");
        subjectOs = context.mock(FSDataOutputStream.class, "subjectOs");
        predicateOs = context.mock(FSDataOutputStream.class, "predicateOs");
        objectOs = context.mock(FSDataOutputStream.class, "objectOs");
        contextOs = context.mock(FSDataOutputStream.class, "contextOs");

        // Expectations common to both tests: the writer creates the output directory
        // and the five per-type output files, and closes each stream on close().
        e = new Expectations() {
            {
                one(fs).exists(with(tempDirPath));
                will(returnValue(false));
                one(fs).mkdirs(with(tempDirPath));
                one(fs).create(with(new Path(tempDirPath, "all")), with(false));
                will(returnValue(allOs));
                one(fs).create(with(new Path(tempDirPath, "subjects")), with(false));
                will(returnValue(subjectOs));
                one(fs).create(with(new Path(tempDirPath, "predicates")), with(false));
                will(returnValue(predicateOs));
                one(fs).create(with(new Path(tempDirPath, "objects")), with(false));
                will(returnValue(objectOs));
                one(fs).create(with(new Path(tempDirPath, "contexts")), with(false));
                will(returnValue(contextOs));
                one(allOs).close();
                one(subjectOs).close();
                one(predicateOs).close();
                one(objectOs).close();
                one(contextOs).close();
            }
        };
    }

    @Test
    public void writeSubjectAndObjectTest() throws IOException, InterruptedException, ClassNotFoundException {
        ByteArrayOutputStream bySubjectBos = new ByteArrayOutputStream(1024);
        FSDataOutputStream bySubjectOs = new FSDataOutputStream(bySubjectBos, null);
        ByteArrayOutputStream bySubjectOffsetsBos = new ByteArrayOutputStream(1024);
        FSDataOutputStream bySubjectOffsetsOs = new FSDataOutputStream(bySubjectOffsetsBos, null);

        e.one(fs).create(e.with(new Path(tempDirPath, "bySubject.bz2")), e.with(false));
        e.will(Expectations.returnValue(bySubjectOs));
        e.one(fs).create(e.with(new Path(tempDirPath, "bySubject.blockOffsets")), e.with(false));
        e.will(Expectations.returnValue(bySubjectOffsetsOs));

        // Exact bytes expected on each per-type output. The ByteMatcher compares only
        // the expected prefix, ignoring any trailing buffer content.
        e.one(allOs).write(e.with(new ByteMatcher("http://a/key1\nhttp://a/key2\nhttp://a/key3\n", true)), e.with(0), e.with(42));
        e.one(contextOs).write(e.with(new ByteMatcher("http://a/key\n", true)), e.with(0), e.with(13));
        e.one(objectOs).write(e.with(new ByteMatcher("http://a/key\nbNode123\n", true)), e.with(0), e.with(22));
        e.one(predicateOs).write(e.with(new ByteMatcher("3\thttp://a/key\n", true)), e.with(0), e.with(15));
        e.one(subjectOs).write(e.with(new ByteMatcher("http://a/key\n", true)), e.with(0), e.with(13));

        context.checking(e);

        ResourceRecordWriter writer = new ResourceRecordWriter(fs, tempDirPath, null);

        OutputCount outputCount = new OutputCount();
        outputCount.output = OUTPUT.PREDICATE;
        outputCount.count = 3;
        writer.write(new Text("http://a/key"), outputCount);
        outputCount.output = OUTPUT.OBJECT;
        outputCount.count = 0;
        writer.write(new Text("http://a/key"), outputCount);
        outputCount.output = OUTPUT.CONTEXT;
        outputCount.count = 0;
        writer.write(new Text("http://a/key"), outputCount);
        outputCount.output = OUTPUT.ALL;
        outputCount.count = 0;
        writer.write(new Text("http://a/key1"), outputCount);
        writer.write(new Text("http://a/key2"), outputCount);
        writer.write(new Text("http://a/key3"), outputCount);

        // A full BySubjectRecord goes to the bySubject collection as document 66.
        BySubjectRecord record = new BySubjectRecord();
        record.setId(66);
        record.setPreviousId(55);
        record.setSubject("http://a/key");
        record.addRelation("<http://predicate/> <http://Object> .");
        writer.write(new Text("http://a/key"), record);

        outputCount.output = OUTPUT.OBJECT;
        outputCount.count = 0;
        writer.write(new Text("bNode123"), outputCount);

        writer.close(null);

        context.assertIsSatisfied();

        BlockCompressedDocumentCollection collection = new BlockCompressedDocumentCollection("foo", null, 10);
        InputStream blockOffsetsInputStream = new ByteArrayInputStream(bySubjectOffsetsBos.toByteArray());

        // Copy the in-memory bySubject bytes to a temp file so the collection can be
        // opened from a FileChannel.
        File bySubjectTempFile = File.createTempFile(ResourceRecordWriterTest.class.getSimpleName(), "tmp");
        FileOutputStream tempFileOutputStream = new FileOutputStream(bySubjectTempFile);
        bySubjectBos.writeTo(tempFileOutputStream);
        tempFileOutputStream.flush();
        tempFileOutputStream.close();

        FileInputStream bySubjectFileInputStream = new FileInputStream(bySubjectTempFile);
        collection.init(bySubjectFileInputStream.getChannel(), blockOffsetsInputStream, 100000);
        blockOffsetsInputStream.close();

        // Size of collection. This is the same as the number of lines written to ALL.
        assertEquals(3L, collection.size());

        // Ids 65 and 67 were never written, so their streams are empty; id 66 is the
        // record written above.
        InputStream documentInputStream = collection.stream(65L);
        assertEquals(-1, documentInputStream.read());
        documentInputStream = collection.stream(67L);
        assertEquals(-1, documentInputStream.read());
        documentInputStream = collection.stream(66L);
        assertNotNull(documentInputStream);

        collection.close();
        bySubjectFileInputStream.close();
    }

    @Test
    public void bySubjectsTest() throws IOException, InterruptedException, NoSuchAlgorithmException, BySubjectRecordException {
        FSDataOutputStream bySubjectOs = new FSDataOutputStream(new FileOutputStream(new File(tempDirPath.toUri().getPath(), "bySubject.bz2")), null);
        FSDataOutputStream bySubjectOffsetsOs = new FSDataOutputStream(new FileOutputStream(new File(tempDirPath.toUri().getPath(), "bySubject.blockOffsets")), null);

        e.one(fs).create(e.with(new Path(tempDirPath, "bySubject.bz2")), e.with(false));
        e.will(Expectations.returnValue(bySubjectOs));
        e.one(fs).create(e.with(new Path(tempDirPath, "bySubject.blockOffsets")), e.with(false));
        e.will(Expectations.returnValue(bySubjectOffsetsOs));
        e.allowing(subjectOs).write(e.with(new ByteMatcher()), e.with(0), e.with(Expectations.any(Integer.class)));
        e.allowing(allOs).write(e.with(new ByteMatcher("all\nall\n", true)), e.with(0), e.with(Expectations.any(Integer.class)));

        context.checking(e);

        System.out.println("tempDirPath:" + tempDirPath);
        ResourceRecordWriter writer = new ResourceRecordWriter(fs, tempDirPath, null);

        // Write records with ids from 100000 upwards, in random increments of 2 to 20.
        BySubjectRecord record = new BySubjectRecord();
        Random random = new Random();
        for (long l = 100000; l < 200000; l += (random.nextInt(19) + 2)) {
            record.setId(l);
            record.setSubject("Subject:" + Integer.toString(random.nextInt()));
            // Give each record up to 99 random relations.
            int relationCount = random.nextInt(100);
            for (int i = 0; i < relationCount; i++) {
                record.addRelation("a relation " + Long.toString(random.nextLong()));
            }
            writer.write(null, record);
            record.setPreviousId(l);
            record.clearRelations();
        }

        BySubjectRecord beforeBigRecord = new BySubjectRecord();
        beforeBigRecord.setId(200200L);
        beforeBigRecord.setPreviousId(record.getId());
        beforeBigRecord.setSubject("Before Big Test Record");
        writer.write(null, beforeBigRecord);

        // Write a big record that will span multiple blocks of 100000 bytes.
        BySubjectRecord bigRecord = new BySubjectRecord();
        bigRecord.setId(200201L);
        bigRecord.setPreviousId(beforeBigRecord.getId());
        bigRecord.setSubject("Big Test Record");

        MessageDigest md5Digest = MessageDigest.getInstance("MD5");
        StringBuilder sb = new StringBuilder();
        // 8k x 128 byte relations. The relation here is just a 128 byte hex string without delimiters.
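        // Each relation chains four 16 byte MD5 digests and hex encodes them, giving
        // 4 x 32 = 128 characters per relation. 8192 such relations come to roughly 1MB,
        // so the record is certain to span several of the 100000 byte blocks noted above.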
        for (int i = 0; i < 8192; i++) {
            md5Digest.update((byte) ((i * 1299299) & 0xFF));
            byte[] digest = md5Digest.digest();
            sb.append(Hex.encodeHex(digest));
            md5Digest.update(digest);
            digest = md5Digest.digest();
            sb.append(Hex.encodeHex(digest));
            md5Digest.update(digest);
            digest = md5Digest.digest();
            sb.append(Hex.encodeHex(digest));
            md5Digest.update(digest);
            digest = md5Digest.digest();
            sb.append(Hex.encodeHex(digest));
            bigRecord.addRelation(sb.toString());
            sb.setLength(0);
        }
        writer.write(null, bigRecord);

        BySubjectRecord afterBigRecord = new BySubjectRecord();
        afterBigRecord.setId(200202L);
        afterBigRecord.setPreviousId(bigRecord.getId());
        afterBigRecord.setSubject("After Big Test Record");
        writer.write(null, afterBigRecord);

        OutputCount outputCount = new OutputCount();
        outputCount.output = OUTPUT.ALL;
        outputCount.count = 1;
        Text key = new Text("all");
        for (int i = 0; i < 200205; i++) {
            writer.write(key, outputCount);
        }
        writer.write(new Text("http://a/key1"), outputCount);

        writer.close(null);

        BlockCompressedDocumentCollection collection = new BlockCompressedDocumentCollection("bySubject", null, 10);
        String indexBaseName = new File(tempDirPath.toUri().getPath(), "bySubject").getCanonicalPath();
        collection.filename(indexBaseName);

        // Id 99999 is below the first id written, so its stream is empty.
        assertEquals(-1, collection.stream(99999).read());

        InputStream documentInputStream = collection.stream(100000);
        record.readFrom(new InputStreamReader(documentInputStream));
        assertEquals(100000, record.getId());

        // Re-stream the document we just read by its own id and check the id round-trips.
        long lastReadId = record.getId();
        documentInputStream = collection.stream(lastReadId);
        record.readFrom(new InputStreamReader(documentInputStream));
        assertEquals(lastReadId, record.getId());

        record.setPreviousId(3);
        record.setSubject(null);
        documentInputStream = collection.stream(record.getId() + 1);
        assertEquals(-1, documentInputStream.read());

        documentInputStream = collection.stream(beforeBigRecord.getId());
        record.readFrom(new InputStreamReader(documentInputStream));
        assertEquals(beforeBigRecord, record);

        documentInputStream = collection.stream(afterBigRecord.getId());
        record.readFrom(new InputStreamReader(documentInputStream));
        assertEquals(afterBigRecord, record);

        documentInputStream = collection.stream(bigRecord.getId());
        record.readFrom(new InputStreamReader(documentInputStream));

        System.out.println("BigRecord Relation count:" + bigRecord.getRelationsCount());
        System.out.println("First:" + bigRecord.getRelation(0));
        System.out.println("Last:" + bigRecord.getRelation(bigRecord.getRelationsCount() - 1));
        System.out.println("Record Relation count:" + record.getRelationsCount());
        System.out.println("First:" + record.getRelation(0));
        System.out.println("Last:" + record.getRelation(record.getRelationsCount() - 1));

        int limit = bigRecord.getRelationsCount() > record.getRelationsCount() ?
                record.getRelationsCount() : bigRecord.getRelationsCount();
        for (int i = 0; i < limit; i++) {
            assertEquals("At index " + i, bigRecord.getRelation(i), record.getRelation(i));
        }
        assertEquals(bigRecord.getRelationsCount(), record.getRelationsCount());
        assertEquals(bigRecord, record);

        assertEquals(-1, collection.stream(afterBigRecord.getId() + 1).read());

        collection.close();
    }

    private static class ByteMatcher extends BaseMatcher<byte[]> {
        private byte[] bytes;
        private boolean ignoreTrailingBytes;

        public ByteMatcher() {
        }

        public ByteMatcher(String string, boolean ignoreTrailingBytes) {
            bytes = string.getBytes();
            this.ignoreTrailingBytes = ignoreTrailingBytes;
        }

        @Override
        public boolean matches(Object object) {
            if (bytes == null) {
                return true;
            }
            assert object instanceof byte[];
            byte[] other = (byte[]) object;
            if (ignoreTrailingBytes) {
                other = Arrays.copyOf(other, bytes.length);
            }
            return Arrays.equals(bytes, other);
        }

        @Override
        public void describeTo(Description description) {
            if (bytes == null) {
                description.appendText("any byte array");
            } else if (ignoreTrailingBytes) {
                description.appendText(new String(bytes) + "...");
            } else {
                description.appendText(new String(bytes));
            }
        }
    }
}