package com.yahoo.glimmer.indexing.generator; /* * Copyright (c) 2012 Yahoo! Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software distributed under the License is * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and limitations under the License. * See accompanying LICENSE file. */ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; /** * * @author pmika * */ public class TermKey implements WritableComparable<TermKey> { // The term is set to this when the values Type is DOC_SIZE. public static final String DOC_SIZE_TERM = ""; private int index; private String term; // We want hadoop to sort our values too. So the initial sort includes the // value in the key. The FirstGroupingComparator is then used to group all // values for each index/term pair. private TermValue value = new TermValue(); /* * Required for Hadoop */ public TermKey() { } public TermKey(String term, int index, TermValue value) { this.index = index; this.term = term; this.value = value; } public String getTerm() { return term; } public int getIndex() { return index; } public TermValue getValue() { return value; } public void readFields(DataInput in) throws IOException { value.readFields(in); index = in.readInt(); term = Text.readString(in); } public void write(DataOutput out) throws IOException { value.write(out); out.writeInt(index); Text.writeString(out, term); } public int compareTo(TermKey top) { if (!term.equals(top.term)) { return term.compareTo(top.term); } else if (index != top.index) { return ((Integer) index).compareTo(top.index); } else { return value.compareTo(top.value); } } @Override public int hashCode() { int hash = 31 * value.hashCode() + index; return 31 * hash + term.hashCode(); } @Override public boolean equals(Object right) { if (right instanceof TermKey) { TermKey r = (TermKey) right; return term.equals(r.term) && index == r.index && (value == null ? r.value == null : value.equals(r.value)); } else { return false; } } public String toString() { return Integer.toString(index) + ":" + term + ":" + (value == null ? "null" : value.toString()); } private static final int TYPE_BYTE_OFFSET = 0; private static final int V1_BYTE_OFFSET = Integer.SIZE / 8; private static final int V2_BYTE_OFFSET = V1_BYTE_OFFSET + Long.SIZE / 8; private static final int INDEX_BYTE_OFFSET = V2_BYTE_OFFSET + Integer.SIZE / 8; private static final int TERM_BYTE_OFFSET = INDEX_BYTE_OFFSET + Integer.SIZE / 8; /** A Comparator that compares serialized TermKey objects. */ public static class Comparator extends WritableComparator { public Comparator() { super(TermKey.class, true); } public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { // Compare the index int index1 = WritableComparator.readInt(b1, s1 + INDEX_BYTE_OFFSET); int index2 = WritableComparator.readInt(b2, s2 + INDEX_BYTE_OFFSET); int d = index1 - index2; if (d == 0) { // Compare the term // first byte of string encodes the length of the size // length1 & 2 are the byte length of the serialized strings // length int length1 = WritableUtils.decodeVIntSize(b1[s1 + TERM_BYTE_OFFSET]); int length2 = WritableUtils.decodeVIntSize(b2[s2 + TERM_BYTE_OFFSET]); d = compareBytes(b1, s1 + TERM_BYTE_OFFSET + length1, l1 - TERM_BYTE_OFFSET - length1, b2, s2 + TERM_BYTE_OFFSET + length2, l2 - TERM_BYTE_OFFSET - length2); if (d == 0) { // Compare the values types int type1 = WritableComparator.readInt(b1, s1 + TYPE_BYTE_OFFSET); int type2 = WritableComparator.readInt(b2, s2 + TYPE_BYTE_OFFSET); d = type1 - type2; if (d == 0) { // Compare the values v1s long v11 = WritableComparator.readLong(b1, s1 + V1_BYTE_OFFSET); long v12 = WritableComparator.readLong(b2, s2 + V1_BYTE_OFFSET); long dl = v11 - v12; if (dl != 0) { d = dl > 0 ? 1 : -1; } else { // Compare the values v2s int v21 = WritableComparator.readInt(b1, s1 + V2_BYTE_OFFSET); int v22 = WritableComparator.readInt(b2, s2 + V2_BYTE_OFFSET); d = v21 - v22; } } } } return d; } } /** * Compare only the term and index of the pair, so that reduce is called * once for each value of the first part. * * NOTE: first part (i.e. index and term) are serialized first */ public static class FirstGroupingComparator implements RawComparator<TermKey> { public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { // Compare the index int index1 = WritableComparator.readInt(b1, s1 + INDEX_BYTE_OFFSET); int index2 = WritableComparator.readInt(b2, s2 + INDEX_BYTE_OFFSET); int d = index1 - index2; if (d == 0) { // Compare the term // first byte of string encodes the length of the size // length1 & 2 are the byte length of the serialized strings // length int length1 = WritableUtils.decodeVIntSize(b1[s1 + TERM_BYTE_OFFSET]); int length2 = WritableUtils.decodeVIntSize(b2[s2 + TERM_BYTE_OFFSET]); d = WritableComparator.compareBytes(b1, s1 + TERM_BYTE_OFFSET + length1, l1 - TERM_BYTE_OFFSET - length1, b2, s2 + TERM_BYTE_OFFSET + length2, l2 - TERM_BYTE_OFFSET - length2); } return d; } public int compare(TermKey o1, TermKey o2) { if (!o1.getTerm().equals(o2.getTerm())) { return o1.getTerm().compareTo(o2.getTerm()); } else if (o1.getIndex() != o2.getIndex()) { return ((Integer) o1.getIndex()).compareTo(o2.getIndex()); } return 0; } } /** * Partition based only on the term. All occurrences of a term are processed * by the same reducer instance. */ public static class FirstPartitioner extends HashPartitioner<TermKey, TermValue> { @Override public int getPartition(TermKey key, TermValue value, int numPartitions) { return (int)(Math.abs(key.getTerm().hashCode() * 127l) % numPartitions); } } }