/* * Copyright (c) 2012 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.flaptor.indextank.index.storage; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStream; import java.io.UTFDataFormatException; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import java.util.concurrent.ConcurrentMap; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.apache.log4j.Logger; import com.flaptor.indextank.index.Document; import com.flaptor.indextank.storage.alternatives.DocumentStorage; import com.flaptor.util.Execute; import com.flaptor.util.FileUtil; import com.google.common.base.Preconditions; import com.google.common.collect.MapMaker; import com.google.common.collect.Maps; /** * @author santip * @author dbuthay * */ public abstract class DocumentBinaryStorage implements DocumentStorage { private static final Logger logger = Logger.getLogger(Execute.whoAmI()); private static final int COMPRESSION_THRESHOLD = 100; private static final int HEADER_COMPRESSED = 0x1; private static final int HEADER_HAS_TEXT = 0x2; protected abstract byte[] getBinaryDoc(String docId); protected abstract void saveBinaryDoc(String docId, byte[] bytes); protected abstract void deleteBinaryDoc(String docId); @Override public Document getDocument(String docId) { return decompress(getBinaryDoc(docId)); } @Override public void saveDocument(String docId, Document document) { saveBinaryDoc(docId, compress(document)); } @Override public void deleteDocument(String docId) { deleteBinaryDoc(docId); } private static byte[] compress(Document document) { try { int estimatedSize = estimateSize(document); boolean compress = estimatedSize >= COMPRESSION_THRESHOLD; ByteArrayOutputStream baos = new ByteArrayOutputStream(estimatedSize); OutputStream os = baos; writeTo(document, os, compress); return baos.toByteArray(); } catch (IOException e) { throw new RuntimeException(e); } } private static Document decompress(byte[] bytes) { try { ByteArrayInputStream bais = new ByteArrayInputStream(bytes); InputStream is = bais; return readFrom(is); } catch (IOException e) { throw new RuntimeException(e); } } private static void writeTo(Document document, OutputStream os, boolean compress) throws IOException { int header = 0; if (compress) { header |= HEADER_COMPRESSED; } String text = document.getField("text"); if (text != null) { header |= HEADER_HAS_TEXT; } os.write(header); if (compress) { os = new GZIPOutputStream(os); } if (text != null) { writeUTF(text, os); } int fs = document.asMap().size(); if (text != null) fs -= 1; writeSize(fs , os); for (Entry<String,String> e : document.asMap().entrySet()) { if (!e.getKey().equals("text")) { writeUTF(e.getKey(), os); writeUTF(e.getValue(), os); } } os.close(); } private static Document readFrom(InputStream is) throws IOException { int header = is.read(); String text = null; if ((header & HEADER_COMPRESSED) != 0) { is = new GZIPInputStream(is); } if ((header & HEADER_HAS_TEXT) != 0) { text = readUTF(is); } int fs = readSize(is); Map<String, String> fields = Maps.newHashMapWithExpectedSize(fs + (text == null ? 0 : 1)); if (text != null) { fields.put("text", text); } while (fs-- > 0) { fields.put(readUTF(is), readUTF(is)); } return new Document(fields); } private static int estimateSize(Document document) { int size = 0; for (Entry<String, String> e : document.asMap().entrySet()) { if (!e.getKey().equals("text")) { size += e.getKey().length(); } size += e.getValue().length(); } return size; } private static void writeUTF(String text, OutputStream os) throws IOException { int strlen = text.length(); int c = 0; writeSize(strlen, os); int i=0; for (i=0; i<strlen; i++) { c = text.charAt(i); if (!((c >= 0x0001) && (c <= 0x007F))) break; os.write(c); } for (;i < strlen; i++){ c = text.charAt(i); if ((c >= 0x0001) && (c <= 0x007F)) { os.write(c); } else if (c > 0x07FF) { os.write(0xE0 | ((c >> 12) & 0x0F)); os.write(0x80 | ((c >> 6) & 0x3F)); os.write(0x80 | ((c >> 0) & 0x3F)); } else { os.write(0xC0 | ((c >> 6) & 0x1F)); os.write(0x80 | ((c >> 0) & 0x3F)); } } } private static String readUTF(InputStream is) throws IOException { int size = readSize(is); char[] chars = new char[size]; int c, c2, c3; for (int i = 0; i < chars.length; i++) { c = readNonEOF(is); switch (c >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: /* 0xxxxxxx*/ chars[i]=(char)c; break; case 12: case 13: /* 110x xxxx 10xx xxxx*/ c2 = readNonEOF(is); if ((c2 & 0xC0) != 0x80) throw new UTFDataFormatException("malformed input around char " + i); chars[i] = (char)(((c & 0x1F) << 6) | (c2 & 0x3F)); break; case 14: /* 1110 xxxx 10xx xxxx 10xx xxxx */ c2 = readNonEOF(is); c3 = readNonEOF(is); if (((c2 & 0xC0) != 0x80) || ((c3 & 0xC0) != 0x80)) throw new UTFDataFormatException("malformed input around char " + i); chars[i]=(char)(((c & 0x0F) << 12) | ((c2 & 0x3F) << 6) | ((c3 & 0x3F) << 0)); break; default: /* 10xx xxxx, 1111 xxxx */ throw new UTFDataFormatException("malformed input around char " + i); } } return String.valueOf(chars); } private static void writeSize(int size, OutputStream os) throws IOException { while (size >= 128) { os.write((size & 0x7F) | 0x80); size >>= 7; } os.write(size & 0x7F); } private static int readSize(InputStream is) throws IOException { int c = 0; int size = 0; boolean left = true; while (left) { int b = readNonEOF(is); left = (b & 0x80) != 0; b &= 0x7F; b <<= 7 * c; size |= b; c++; } return size; } private static int readNonEOF(InputStream is) throws IOException { int c = is.read(); if (c == -1) throw new EOFException(); return c; } /** * Allows testing changes to the compression method, it first * validates the correctness of the implementation and then * lists the compression value and ratio for several document * sizes. * * First argument should be the text to use for texting, it will * be clipped to different sizes for ratio testing. */ /* public static void main(String[] args) throws IOException { //testCorrectness(args); //testCompressionRatio(args); InMemoryStorage ims = new InMemoryStorage(new File(args[0]), true); Scanner in = new Scanner(System.in); while (in.hasNextLine()) { Document document = ims.getDocument(in.nextLine()); System.out.println(document); } } private static void testCompressionRatio(String[] args) { String text = args[0]; int len = text.length(); while (len > 10) { test(text, len); len -= 10; } } private static void testCorrectness(String[] args) throws IOException { InMemoryStorage storage = new InMemoryStorage(FileUtil.createTempDir("testInMemoryStorage", ".tmp"), false); Document doc1 = new Document(); doc1.setField("text", args[0]); storage.saveDocument("a", doc1); Document dd1 = storage.getDocument("a"); Preconditions.checkState(dd1.equals(doc1), dd1 + " - " + doc1); Document doc2 = new Document(); doc2.setField("nottext", args[0]); storage.saveDocument("b", doc2); Document dd2 = storage.getDocument("b"); Preconditions.checkState(dd2.equals(doc2), dd2); Document doc3 = new Document(); doc3.setField("text", args[0]); doc3.setField("f1", "v1"); doc3.setField("f2", "v2"); storage.saveDocument("c", doc3); Document dd3 = storage.getDocument("c"); Preconditions.checkState(dd3.equals(doc3), dd3); } private static void test(String text, int len) { Document d = new Document(); d.setField("text", text.substring(0, len)); int clen = compress(d).length; len *= 2; System.out.println(String.format("%2.2f = original: %5d - compressed: %5d", 1.0 * clen / len, len, clen)); } @Override public Map<String, String> getStats() { HashMap<String, String> stats = Maps.newHashMap(); stats.put("in_memory_storage_count", String.valueOf(compressedMap.size())); return stats; } */ }