// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.lucene.util.AttributeSource;
import com.twitter.common.text.token.TokenStream;
/**
 * Helper class to serialize a TokenStream into a byte array and to read it back.
 *
 * A list of {@link AttributeSerializer}s must be defined using the {@link Builder}; each of them
 * serializes and deserializes one individual attribute.
 *
 * The same TokenStreamSerializer should be used for serialization/de-serialization, as the order
 * of the {@link AttributeSerializer}s must be consistent. Since {@link Version#VERSION_2} this is
 * enforced through a fingerprint that is stored in the serialized data.
 *
 * Serialized layout: VInt version ordinal, 4-byte fingerprint (VERSION_2 and newer), VInt token
 * count, followed by the attributes of every token in serializer order.
 *
 * NOTE(review): every (de)serialization call re-initializes the shared AttributeSerializer
 * instances, so concurrent use of one TokenStreamSerializer is presumably unsafe - confirm
 * against the AttributeSerializer implementations.
 */
public class TokenStreamSerializer {
  /**
   * Versions of the serialization format. The ordinal of a constant is written into the
   * serialized data, so constants must only ever be appended, never reordered or removed.
   */
  public static enum Version {
    VERSION_1,
    VERSION_2
  }

  protected static final Version CURRENT_VERSION = Version.VERSION_2;

  // Upper bound of the header size: 1 byte version VInt + 4 byte fingerprint + 5 byte count VInt.
  private static final int MAX_HEADER_SIZE = 10;

  private final List<AttributeSerializer> attributeSerializers;
  private final int attributeSerializersFingerprint;

  /**
   * Creates a serializer that applies the given AttributeSerializers in list order.
   *
   * @param attributeSerializers the serializers to use; the list is copied, so later changes to
   *     it do not affect this instance
   * @throws NullPointerException if the list is null
   */
  public TokenStreamSerializer(List<AttributeSerializer> attributeSerializers) {
    Preconditions.checkNotNull(attributeSerializers);
    // Defensive copy: the fingerprint below is computed once, so the list it was computed from
    // must not change afterwards (e.g. through further Builder.add() calls).
    this.attributeSerializers = Lists.newArrayList(attributeSerializers);
    this.attributeSerializersFingerprint = computeFingerprint(this.attributeSerializers);
  }

  /**
   * Computes an order-sensitive fingerprint from the class names of the given serializers.
   *
   * The mixing step is not a true bit rotation (it uses an arithmetic right shift by the same
   * distance as the left shift), but it must stay exactly as it is: the value is stored in
   * serialized data and compared on deserialization, so changing it would make existing data
   * unreadable.
   *
   * @param attributeSerializers the serializers to fingerprint, in application order
   * @return the fingerprint; 0 for an empty list
   */
  public static int computeFingerprint(List<AttributeSerializer> attributeSerializers) {
    int result = 0;
    int i = 0;
    for (AttributeSerializer attributeSerializer : attributeSerializers) {
      int hashCode = attributeSerializer.getClass().getName().hashCode();
      result = result ^ ((hashCode << i) | (hashCode >> i));
      i++;
    }
    return result;
  }

  /**
   * The fingerprint of the attribute serializers that are attached to this TokenStreamSerializer.
   */
  public int attributeSerializersFingerprint() {
    return attributeSerializersFingerprint;
  }

  /**
   * Serialize the given TokenStream into a byte array using the provided AttributeSerializer(s).
   * Note that this method doesn't serialize the CharSequence of the TokenStream - the caller
   * has to take care of serializing this if necessary.
   *
   * @param tokenStream the stream to consume; it is read until incrementToken() returns false
   * @return header (version, fingerprint, token count) followed by the serialized tokens
   * @throws IOException if an AttributeSerializer fails to initialize or to write
   */
  public final byte[] serialize(final TokenStream tokenStream) throws IOException {
    for (AttributeSerializer serializer : attributeSerializers) {
      serializer.initialize(tokenStream, CURRENT_VERSION);
    }

    // The token count precedes the tokens in the output but is only known after the stream has
    // been consumed, so the tokens are buffered separately first.
    ByteArrayOutputStream tokenBytes = new ByteArrayOutputStream();
    AttributeOutputStream tokenOutput = new AttributeOutputStream(tokenBytes);
    int numTokens = 0;
    while (tokenStream.incrementToken()) {
      serializeAttributes(tokenOutput);
      numTokens++;
    }
    tokenOutput.flush();

    ByteArrayOutputStream result =
        new ByteArrayOutputStream(MAX_HEADER_SIZE + tokenBytes.size());
    AttributeOutputStream output = new AttributeOutputStream(result);
    output.writeVInt(CURRENT_VERSION.ordinal());
    output.writeInt(attributeSerializersFingerprint);
    output.writeVInt(numTokens);
    tokenBytes.writeTo(output);
    output.flush();
    return result.toByteArray();
  }

  /**
   * Deserializes the previously serialized TokenStream using the provided AttributeSerializer(s).
   *
   * This method only deserializes all Attributes; the CharSequence instance containing the text
   * must be provided separately.
   *
   * @param data the complete serialized data
   * @param charSequence the text the tokens refer to
   * @throws IOException if the header cannot be read or does not match this serializer
   */
  public final TokenStream deserialize(final byte[] data,
                                       final CharSequence charSequence) throws IOException {
    return deserialize(data, 0, data.length, charSequence);
  }

  /**
   * Deserializes a TokenStream from the slice {@code [offset, offset + length)} of {@code data}.
   *
   * @param data the array containing the serialized data
   * @param offset index of the first serialized byte
   * @param length number of serialized bytes; must be positive
   * @param charSequence the text the tokens refer to
   * @throws NullPointerException if data is null
   * @throws IllegalStateException if the slice is empty or not fully inside data
   * @throws IOException if the header cannot be read or does not match this serializer
   */
  public final TokenStream deserialize(final byte[] data, int offset, int length,
                                       final CharSequence charSequence) throws IOException {
    Preconditions.checkNotNull(data);
    Preconditions.checkState(length > 0);
    Preconditions.checkState(offset >= 0);
    // Written as a subtraction so that offset + length cannot overflow.
    Preconditions.checkState(data.length - offset >= length);

    ByteArrayInputStream bais = new ByteArrayInputStream(data, offset, length);
    return deserialize(bais, charSequence);
  }

  /**
   * Reads the version from the input and, for VERSION_2 and newer, also reads the stored
   * fingerprint and compares it to the expected one.
   *
   * @param input the stream positioned at the start of the serialized data
   * @param attributeSerializersFingerprint the fingerprint the data is expected to carry
   * @return the version the data was written with
   * @throws IOException if the version is invalid or newer than {@link #CURRENT_VERSION}, or if
   *     the fingerprints differ
   */
  public static Version readVersionAndCheckFingerprint(
      AttributeInputStream input, int attributeSerializersFingerprint) throws IOException {
    int ordinal = input.readVInt();
    if (ordinal < 0) {
      // A corrupt VInt can decode to a negative number, which would otherwise surface as an
      // ArrayIndexOutOfBoundsException below.
      throw new IOException("Invalid version in serialized data: " + ordinal);
    }
    if (ordinal > CURRENT_VERSION.ordinal()) {
      throw new IOException("Version of serialized data is newer than the version this serializer"
          + " supports: " + ordinal + " > " + CURRENT_VERSION.ordinal());
    }

    if (ordinal >= Version.VERSION_2.ordinal()) {
      int fp = input.readInt();
      if (fp != attributeSerializersFingerprint) {
        throw new IOException("Attributes of serialized data are different than attributes of "
            + "this serializer: " + fp + " != " + attributeSerializersFingerprint);
      }
    }

    return Version.values()[ordinal];
  }

  /**
   * Deserializes a TokenStream from the given input stream. The header is read immediately; the
   * tokens are read lazily, one per incrementToken() call of the returned stream.
   *
   * The position after the header is marked on the input stream, and the returned stream's
   * reset(CharSequence) rewinds to that mark so the tokens can be iterated again. Any mark the
   * caller had set on {@code bais} is therefore replaced.
   *
   * An IOException raised while reading a token is rethrown from incrementToken() wrapped in a
   * RuntimeException.
   *
   * @param bais the stream positioned at the start of the serialized data
   * @param charSequence the text the tokens refer to
   * @throws IOException if the header cannot be read or does not match this serializer
   */
  public final TokenStream deserialize(ByteArrayInputStream bais, final CharSequence charSequence)
      throws IOException {
    final AttributeInputStream input = new AttributeInputStream(bais);
    final Version version = readVersionAndCheckFingerprint(input, attributeSerializersFingerprint);
    final int numTokens = input.readVInt();
    if (numTokens < 0) {
      throw new IOException("Invalid number of tokens in serialized data: " + numTokens);
    }
    // Remember where the token data starts; the read limit has no meaning for
    // ByteArrayInputStream.
    input.mark(0);

    TokenStream tokenStream = new TokenStream() {
      private CharSequence chars;
      private int token = 0;

      @Override
      public boolean incrementToken() {
        if (token >= numTokens) {
          return false;
        }
        token++;
        try {
          deserializeAttributes(input, chars);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        return true;
      }

      @Override
      public void reset(CharSequence newChars) {
        chars = newChars;
        token = 0;
        try {
          // Rewind to the first token; without this a reset after consumption would continue
          // reading behind the already consumed tokens.
          input.reset();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };

    for (AttributeSerializer deserializer : attributeSerializers) {
      deserializer.initialize(tokenStream, version);
    }

    tokenStream.reset(charSequence);
    return tokenStream;
  }

  /** Reads the attributes of one token by calling every serializer in order. */
  private void deserializeAttributes(AttributeInputStream input,
                                     CharSequence charSequence) throws IOException {
    for (AttributeSerializer serializer : attributeSerializers) {
      serializer.deserialize(input, charSequence);
    }
  }

  /** Writes the attributes of the current token by calling every serializer in order. */
  private void serializeAttributes(AttributeOutputStream output) throws IOException {
    for (AttributeSerializer serializer : attributeSerializers) {
      serializer.serialize(output);
    }
  }

  /**
   * Returns a new Builder to build a TokenStreamSerializer.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Defines how individual attributes are (de)serialized.
   */
  public interface AttributeSerializer {
    /**
     * Initialises this AttributeSerializer. This method should be used to get the attribute
     * instance from the TokenStream that this serializer handles. E.g.:
     *
     * CharSequenceTermAttribute termAtt =
     *     attributeSource.addAttribute(CharSequenceTermAttribute.class);
     *
     * @param attributeSource the TokenStream that is about to be serialized or deserialized
     * @param version the format version of the data that will be written or read
     */
    void initialize(AttributeSource attributeSource, Version version) throws IOException;

    /**
     * Serializes a single attribute.
     */
    void serialize(AttributeOutputStream output) throws IOException;

    /**
     * Deserializes a single attribute.
     */
    void deserialize(AttributeInputStream input, CharSequence charSequence) throws IOException;
  }

  /**
   * Builds a TokenStreamSerializer.
   */
  public static final class Builder {
    private final List<AttributeSerializer> attributeSerializers = Lists.newArrayList();

    /**
     * Adds an AttributeSerializer. The order in which the AttributeSerializers are added here
     * is the same order in which they will be called for serializing a Token.
     */
    public Builder add(AttributeSerializer serializer) {
      attributeSerializers.add(serializer);
      return this;
    }

    /**
     * Builds the TokenStreamSerializer from the AttributeSerializers added so far.
     */
    public TokenStreamSerializer build() {
      return new TokenStreamSerializer(attributeSerializers);
    }
  }

  /**
   * A DataOutputStream that supports VInt-encoding.
   */
  public static class AttributeOutputStream extends DataOutputStream {
    public AttributeOutputStream(OutputStream output) {
      super(output);
    }

    /**
     * Writes a value using VInt encoding: 7 bits per byte, least significant group first, with
     * the high bit of a byte set when more bytes follow. Negative values take 5 bytes.
     */
    public final void writeVInt(int value) throws IOException {
      while ((value & ~0x7F) != 0) {
        writeByte((byte) ((value & 0x7f) | 0x80));
        value >>>= 7;
      }
      writeByte((byte) value);
    }
  }

  /**
   * A DataInputStream that supports VInt-encoding.
   */
  public static class AttributeInputStream extends DataInputStream {
    // Shift of the 5th and last possible byte of a VInt (4 * 7 bits precede it).
    private static final int MAX_VINT_SHIFT = 28;

    public AttributeInputStream(InputStream input) {
      super(input);
    }

    /**
     * Reads a value using VInt encoding.
     *
     * @throws IOException if the encoding is longer than the 5 bytes an int can occupy, or if
     *     the underlying stream fails or ends prematurely
     */
    public final int readVInt() throws IOException {
      byte b = readByte();
      int value = b & 0x7F;
      for (int shift = 7; (b & 0x80) != 0; shift += 7) {
        if (shift > MAX_VINT_SHIFT) {
          // Java masks int shift distances to 5 bits, so further bytes would silently be
          // folded into the low bits of the result.
          throw new IOException("Invalid VInt: encoding is longer than 5 bytes");
        }
        b = readByte();
        value |= (b & 0x7F) << shift;
      }
      return value;
    }
  }
}