/*
* Copyright 2013 Future Systems
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.krakenapps.logstorage.index;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Buffers postings (term -&gt; items) in memory and appends them, one segment
 * per {@link #flush()}, to a data file. After each segment the end offset of
 * that segment is appended to a separate index file.
 * <p>
 * The class performs no synchronization of its own.
 *
 * @since 0.9
 * @author xeraph
 */
public class InvertedIndexWriter {
    private static final Logger logger = LoggerFactory.getLogger(InvertedIndexWriter.class.getName());

    // number of write() calls after which the buffered postings are flushed
    private static final int FLUSH_THRESHOLD = 10000;

    private static final Charset UTF8 = Charset.forName("utf-8");

    // number of items written since the last flush
    private int queueCount;

    // term -> log id postings (no key)
    private Map<String, List<InvertedIndexItem>> postings;

    private boolean closed;

    private InvertedIndexFileSet files;

    private OutputStream indexStream;

    private OutputStream dataStream;

    // total index data length (will be used for index position marking)
    private long dataLength;

    // scratch buffer for the 8 byte offset appended to the index file
    private final byte[] longbuf = new byte[8];

    private Date lastFlush = new Date();

    /**
     * Creates a writer for the given index and data file pair.
     *
     * @param indexFile
     *            file receiving one segment end offset per flush
     * @param dataFile
     *            file receiving the segments
     * @throws IOException
     *             if a header cannot be written or read, or a file cannot be
     *             opened for appending
     */
    public InvertedIndexWriter(File indexFile, File dataFile) throws IOException {
        this(new InvertedIndexFileSet(indexFile, dataFile));
    }

    /**
     * Creates a writer for the given file set. When both files are missing or
     * empty, fresh headers are written; otherwise the existing headers are
     * read back as a sanity check. Both files are then opened in append mode.
     *
     * @param files
     *            index and data file pair
     * @throws IOException
     *             if a header cannot be written or read, or a file cannot be
     *             opened for appending
     */
    public InvertedIndexWriter(InvertedIndexFileSet files) throws IOException {
        // keep the file set: write() needs it to build its "closed" error message
        this.files = files;
        this.postings = new HashMap<String, List<InvertedIndexItem>>();

        File indexFile = files.getIndexFile();
        File dataFile = files.getDataFile();

        if (isEmptyFile(indexFile) && isEmptyFile(dataFile)) {
            // write file header if empty
            Map<String, Object> indexHeaders = new HashMap<String, Object>();
            indexHeaders.put("version", 1);
            indexHeaders.put("type", "pos");
            indexHeaders.put("created", new Date());

            Map<String, Object> dataHeaders = new HashMap<String, Object>();
            dataHeaders.put("version", 1);
            dataHeaders.put("type", "seg");
            dataHeaders.put("created", new Date());

            InvertedIndexHeader indexHeader = new InvertedIndexHeader(indexHeaders);
            InvertedIndexHeader dataHeader = new InvertedIndexHeader(dataHeaders);
            InvertedIndexUtil.writeHeader(indexHeader, indexFile);
            InvertedIndexUtil.writeHeader(dataHeader, dataFile);
        } else {
            // check file header
            InvertedIndexUtil.readHeader(indexFile);
            InvertedIndexUtil.readHeader(dataFile);
        }

        // open file streams; do not leak the index stream if the data stream
        // cannot be opened
        OutputStream index = new FileOutputStream(indexFile, true);
        boolean opened = false;
        try {
            this.dataStream = new BufferedOutputStream(new FileOutputStream(dataFile, true));
            this.indexStream = index;
            this.dataLength = dataFile.length();
            opened = true;
        } finally {
            if (!opened) {
                try {
                    index.close();
                } catch (IOException ignored) {
                    // the original failure is already propagating to the caller
                }
            }
        }
    }

    /**
     * Tells whether the file is absent or is a regular file of length zero.
     */
    private boolean isEmptyFile(File f) {
        if (!f.exists()) {
            return true;
        }

        return f.isFile() && f.length() == 0;
    }

    /**
     * Buffers the item under each of its non-null tokens and flushes the
     * buffer once {@code FLUSH_THRESHOLD} items have been queued since the
     * last flush.
     *
     * @param item
     *            item to index; an item without tokens is counted but adds no
     *            postings
     * @throws IOException
     *             if the triggered flush fails
     * @throws IllegalStateException
     *             if the writer is already closed
     */
    public void write(InvertedIndexItem item) throws IOException {
        if (closed) {
            String msg = "inverted index writer is closed: index=" + files.getIndexFile().getAbsolutePath() + ", data="
                    + files.getDataFile().getAbsolutePath();
            throw new IllegalStateException(msg);
        }

        if (item.tokens != null) {
            for (String t : item.tokens) {
                if (t == null) {
                    continue;
                }

                List<InvertedIndexItem> items = postings.get(t);
                if (items == null) {
                    items = new ArrayList<InvertedIndexItem>();
                    postings.put(t, items);
                }

                items.add(item);
            }
        }

        // count first, then compare, so the counter always matches the number
        // of items buffered since the last flush
        queueCount++;
        if (queueCount >= FLUSH_THRESHOLD) {
            flush();
        }
    }

    /**
     * Writes all buffered postings as one segment to the data file and
     * appends the segment end offset to the index file. Does nothing except
     * reset the queue counter when no postings are buffered.
     * <p>
     * Segment layout, in write order: posting block, term block (terms in
     * key order), posting block length, term block length, one version byte
     * ({@code 1}).
     *
     * @throws IOException
     *             if writing to either file fails
     */
    public void flush() throws IOException {
        if (postings.isEmpty()) {
            queueCount = 0;
            return;
        }

        // mark last flush time
        lastFlush = new Date();

        Map<String, Term> terms = new TreeMap<String, Term>();

        // posting block length
        long pblen = 0;

        // postings block for specific term
        for (Entry<String, List<InvertedIndexItem>> e : postings.entrySet()) {
            long postingLength = 0;
            List<InvertedIndexItem> v = e.getValue();

            // NOTE(review): the delta written below is (previous id - current
            // id), which is non-negative only if compareTo orders ids in
            // descending order; reverse() presumably just pre-orders the
            // append-ordered list for the sort. Confirm against
            // InvertedIndexItem.compareTo.
            Collections.reverse(v);
            Collections.sort(v);

            // mark postings block begin position
            terms.put(e.getKey(), new Term(v.size(), pblen));

            // write postings: first id as is, then the difference to the
            // previous id
            Long last = null;
            for (InvertedIndexItem item : v) {
                if (last == null) {
                    postingLength += writeBeNumber(long.class, item.id);
                } else {
                    postingLength += writeBeNumber(long.class, last - item.id);
                }

                last = item.id;
            }

            pblen += postingLength;
        }

        // term block length
        long tblen = 0;

        // write term block: offset, count, token bytes, token byte length
        for (Entry<String, Term> e : terms.entrySet()) {
            tblen += writeLeNumber(long.class, e.getValue().offset);
            tblen += writeLeNumber(long.class, e.getValue().count);

            byte[] token = e.getKey().getBytes(UTF8);
            dataStream.write(token);
            tblen += token.length;
            tblen += writeLeNumber(int.class, token.length);
        }

        // write posting block length
        int plen = writeLeNumber(long.class, pblen);

        // write term block length
        logger.debug("kraken logstorage: writing term block length {}", tblen);
        int tlen = writeLeNumber(long.class, tblen);

        // last version mark
        dataStream.write(1);
        dataStream.flush();

        dataLength += plen + tlen + tblen + pblen + 1;

        // write end offset of block to index
        InvertedIndexUtil.prepareLong(dataLength - 1, longbuf);
        logger.debug("kraken logstorage: writing index data offset [{}]", (dataLength - 1));
        indexStream.write(longbuf);

        queueCount = 0;
        postings.clear();
    }

    /**
     * Writes the value to the data stream in 7 bit groups, most significant
     * group first. Every byte except the last one has its high bit (0x80)
     * set.
     *
     * @return number of bytes written
     */
    private int writeBeNumber(Class<?> clazz, long value) throws IOException {
        int len = lengthOfRawNumber(clazz, value);
        for (int i = 0; i < len; i++) {
            byte signalBit = (byte) (i != len - 1 ? 0x80 : 0);
            byte data = (byte) (signalBit | (byte) (value >> (7 * (len - i - 1)) & 0x7F));
            dataStream.write(data);
        }

        return len;
    }

    /**
     * Writes the value to the data stream in 7 bit groups, least significant
     * group first. Every byte except the first one written has its high bit
     * (0x80) set.
     *
     * @return number of bytes written
     */
    // little endian writing
    private int writeLeNumber(Class<?> clazz, long value) throws IOException {
        int len = lengthOfRawNumber(clazz, value);
        for (int i = 0; i < len; i++) {
            byte signalBit = (byte) (i == 0 ? 0 : 0x80);
            byte data = (byte) (signalBit | (byte) (value & 0x7F));
            value >>= 7;
            dataStream.write(data);
        }

        return len;
    }

    /**
     * Returns how many 7 bit groups are needed to encode the value.
     *
     * @param clazz
     *            primitive type of the value; only consulted for negative
     *            values, where the maximum length of the type is returned
     *            (10 for {@code long}, 5 for {@code int}, 3 otherwise)
     * @param value
     *            value to encode
     * @return encoded length in bytes, at least 1
     */
    public static <T> int lengthOfRawNumber(Class<T> clazz, long value) {
        if (value < 0) {
            if (long.class == clazz) {
                return 10; // max length for long
            } else if (int.class == clazz) {
                return 5; // max length for int
            } else {
                return 3; // max length for short
            }
        } else {
            if (value <= 127) {
                return 1;
            }

            if (value <= 16383) {
                return 2;
            }
        }

        return (63 - Long.numberOfLeadingZeros(value)) / 7 + 1;
    }

    /**
     * Returns the time of the last non-empty flush, or the creation time of
     * this writer if nothing has been flushed yet.
     */
    public Date getLastFlush() {
        return lastFlush;
    }

    /**
     * Flushes buffered postings and closes both streams. Failures are logged
     * and do not prevent the remaining steps. Calling it again is a no-op.
     */
    public void close() {
        if (closed) {
            return;
        }

        closed = true;

        try {
            flush();
        } catch (IOException e) {
            // buffered postings could not be persisted; report it instead of
            // dropping the failure silently
            logger.error("kraken logstorage: cannot flush inverted index on close", e);
        }

        closeStream(indexStream, "index");
        indexStream = null;

        closeStream(dataStream, "data");
        dataStream = null;
    }

    /**
     * Closes the stream if present and logs, rather than propagates, a
     * failure.
     */
    private void closeStream(OutputStream stream, String label) {
        if (stream == null) {
            return;
        }

        try {
            stream.close();
        } catch (IOException e) {
            logger.error("kraken logstorage: cannot close inverted index " + label + " stream", e);
        }
    }

    /**
     * Posting count and posting block offset recorded for one term.
     */
    private static class Term {
        // term count
        public final long count;

        // posting offset
        public final long offset;

        public Term(long count, long offset) {
            this.count = count;
            this.offset = offset;
        }
    }
}