/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.dkpro.bigdata.collocations; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; import org.apache.hadoop.io.BinaryComparable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.mahout.math.Varint; import com.google.common.base.Preconditions; /** * Writable for holding data generated from the collocation discovery jobs. Depending on the job configuration * gram may be one or more words. In some contexts this is used to hold a complete ngram, while in others it * holds a part of an existing ngram (subgram). Tracks the frequency of the gram and its position in the ngram * in which is was found. */ public class Gram extends BinaryComparable implements WritableComparable<BinaryComparable> { public enum Type { HEAD('h'), TAIL('t'), UNIGRAM('u'), NGRAM('n'); private final char x; Type(char c) { this.x = c; } @Override public String toString() { return String.valueOf(x); } } private byte[] bytes; private int length; private int frequency; public Gram() { } /** * Copy constructor */ public Gram(Gram other) { frequency = other.frequency; length = other.length; bytes = other.bytes.clone(); } /** * Create an gram with a frequency of 1 * * @param ngram * the gram string * @param type * whether the gram is at the head or tail of its text unit or it is a unigram */ public Gram(String ngram, Type type) { this(ngram, 1, type); } /** * * Create a gram with the specified frequency. * * @param ngram * the gram string * @param frequency * the gram frequency * @param type * whether the gram is at the head of its text unit or tail or unigram */ public Gram(String ngram, int frequency, Type type) { Preconditions.checkNotNull(ngram); try { // extra character is used for storing type which is part // of the sort key. ByteBuffer bb = Text.encode('\0' + ngram, true); bytes = bb.array(); length = bb.limit(); } catch (CharacterCodingException e) { throw new IllegalStateException("Should not have happened ",e); } encodeType(type, bytes, 0); this.frequency = frequency; } @Override public byte[] getBytes() { return bytes; } @Override public int getLength() { return length; } /** * @return the gram is at the head of its text unit or tail or unigram. */ public Type getType() { return decodeType(bytes, 0); } /** * @return gram term string */ public String getString() { try { return Text.decode(bytes, 1, length - 1); } catch (CharacterCodingException e) { throw new IllegalStateException("Should not have happened " + e); } } /** * @return gram frequency */ public int getFrequency() { return frequency; } /** * @param frequency * gram's frequency */ public void setFrequency(int frequency) { this.frequency = frequency; } public void incrementFrequency(int i) { this.frequency += i; } @Override public void readFields(DataInput in) throws IOException { int newLength = Varint.readUnsignedVarInt(in); setCapacity(newLength, false); in.readFully(bytes, 0, newLength); int newFrequency = Varint.readUnsignedVarInt(in); length = newLength; frequency = newFrequency; } @Override public void write(DataOutput out) throws IOException { Varint.writeUnsignedVarInt(length, out); out.write(bytes, 0, length); Varint.writeUnsignedVarInt(frequency, out); } /* Cribbed from o.a.hadoop.io.Text: * Sets the capacity of this object to <em>at least</em> * {@code len} bytes. If the current buffer is longer, * then the capacity and existing content of the buffer are * unchanged. If {@code len} is larger * than the current capacity, this object's capacity is * increased to match. * @param len the number of bytes we need * @param keepData should the old data be kept */ private void setCapacity(int len, boolean keepData) { len++; // extra byte to hold type if (bytes == null || bytes.length < len) { byte[] newBytes = new byte[len]; if (bytes != null && keepData) { System.arraycopy(bytes, 0, newBytes, 0, length); } bytes = newBytes; } } @Override public String toString() { return '\'' + getString() + "'[" + getType() + "]:" + frequency; } public static void encodeType(Type type, byte[] buf, int offset) { switch (type) { case HEAD: buf[offset] = 0x1; break; case TAIL: buf[offset] = 0x2; break; case UNIGRAM: buf[offset] = 0x3; break; case NGRAM: buf[offset] = 0x4; break; default: throw new IllegalStateException("switch/case problem in encodeType"); } } public static Type decodeType(byte[] buf, int offset) { switch (buf[offset]) { case 0x1: return Type.HEAD; case 0x2: return Type.TAIL; case 0x3: return Type.UNIGRAM; case 0x4: return Type.NGRAM; default: throw new IllegalStateException("switch/case problem in decodeType"); } } }