package org.solbase.lucenehbase;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.ResourceBundle;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.Logger;
import org.apache.lucene.index.Term;
import org.solbase.SolbaseShardUtil;
import org.solbase.SolbaseUtil;
import org.solbase.cache.CachedObjectLoader;
import org.solbase.cache.CachedObjectWrapper;
import org.solbase.cache.LayeredCache;
import org.solbase.cache.LayeredCache.ModificationType;

public class TermDocMetadataLoader implements CachedObjectLoader<Term, CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier, TermDocMetadata> {

    public static final int CHUNK_SIZE = 1000 * 100;

    private final static Logger logger = Logger.getLogger(TermDocMetadataLoader.class);

    public enum STORAGE_TYPE {
        KEY_ONLY, WIDE_ROW, NARROW_ROW;
    }

    public static STORAGE_TYPE storageType;

    static {
        // Resolve the storage layout from the "solbase.storage.type" system property
        // first, then from the "storage.type" entry in the solbase resource bundle;
        // fall back to NARROW_ROW when neither is configured.
        String type = System.getProperty("solbase.storage.type");
        if (type == null && ResourceBundle.getBundle("solbase") != null) {
            try {
                type = ResourceBundle.getBundle("solbase").getString("storage.type");
            } catch (java.util.MissingResourceException ex) {
                // not configured in the bundle; use the default below
            }
        }
        if (type == null || type.isEmpty()) {
            storageType = STORAGE_TYPE.NARROW_ROW;
        } else {
            storageType = STORAGE_TYPE.valueOf(type);
        }
    }
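
    // Configuration sketch: the storage layout can be selected at JVM startup, e.g.
    //   java -Dsolbase.storage.type=WIDE_ROW ...
    // or with a "storage.type=WIDE_ROW" entry in a solbase.properties bundle on the
    // classpath (the property-file name is an assumption based on the bundle name
    // "solbase" above). Valid values are KEY_ONLY, WIDE_ROW, and NARROW_ROW.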

    public CachedObjectWrapper<CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier> loadObject(Term term, int start, int end,
            LayeredCache<Term, CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier, TermDocMetadata> cache) throws IOException {
        HTableInterface termVectorTable = SolbaseUtil.getTermVectorTable();
        try {
            byte[] termBeginKey = SolbaseUtil.generateTermKey(term, start);
            byte[] termEndKey;
            if (end > SolbaseShardUtil.getMaxDocId()) {
                // We are in the last shard, so the end key always starts from
                // \xff\xff\xff\xff, meaning the scan fetches all remaining docs.
                termEndKey = SolbaseUtil.generateTermEndKey(term);
            } else {
                termEndKey = SolbaseUtil.generateTermKey(term, end);
            }

            Scan fieldScan = new Scan(termBeginKey, termEndKey);
            fieldScan.addFamily(SolbaseUtil.termVectorDocColumnFamilyName);
            fieldScan.setBatch(2000);
            fieldScan.setCaching(2000);

            ResultScanner fieldScanner = termVectorTable.getScanner(fieldScan);

            Result termDoc;
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            int docAmount = 0;
            while ((termDoc = fieldScanner.next()) != null) {
                if (storageType == STORAGE_TYPE.WIDE_ROW) {
                    convertResultChunkToTermDoc(termDoc, baos);
                } else {
                    convertResultToTermDoc(termDoc, baos);
                    docAmount++;
                }
            }
            fieldScanner.close();

            logger.info("Read from HBase for term: " + term.toString() + " has this many docs: " + docAmount);

            // TODO LOAD VERSION
            TermDocMetadataVersionIdentifier versionIdentifier = getVersionIdentifier(term, start, end);

            return new CachedObjectWrapper<CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier>(new CompactedTermDocMetadataArray(baos, docAmount), versionIdentifier, System.currentTimeMillis());
        } finally {
            SolbaseUtil.releaseTable(termVectorTable);
        }
    }

    private void convertResultChunkToTermDoc(Result termDocQueryResult, ByteArrayOutputStream baos) throws IOException {
        NavigableMap<byte[], byte[]> columns = termDocQueryResult.getFamilyMap(SolbaseUtil.termVectorDocColumnFamilyName);
        for (Entry<byte[], byte[]> column : columns.entrySet()) {
            // each column qualifier is a doc id; its value is the serialized metadata
            byte[] serializedTermDocMetadata = column.getValue();
            byte[] docId = column.getKey();
            baos.write(docId);
            baos.write(serializedTermDocMetadata);
        }
    }

    private void convertResultToTermDoc(Result termDocQueryResult, ByteArrayOutputStream baos) throws IOException {
        switch (storageType) {
        case KEY_ONLY: {
            // the doc id and metadata are encoded in the row key itself
            byte[] row = termDocQueryResult.getRow();
            int start = SolbaseUtil.findDocIdIndex(row);
            baos.write(row, start, row.length - start);
        }
            break;
        case WIDE_ROW:
            break;
        case NARROW_ROW:
        default: {
            throw new UnsupportedOperationException();
        }
        }
    }

    public static TermDocMetadataVersionIdentifier getStaticVersionIdentifier(Term key, int startDocId, int endDocId) throws IOException {
        HTableInterface termVectorVersionIDTable = SolbaseUtil.getTermVectorVersionIDTable();
        try {
            byte[] fieldTermKey = SolbaseUtil.generateTermKey(key);
            Get get = new Get(Bytes.add(fieldTermKey, Bytes.toBytes(startDocId), Bytes.toBytes(endDocId)));
            Result result = termVectorVersionIDTable.get(get);
            if (result.isEmpty()) {
                // no version row yet: seed one with the current time
                Put updatePut = new Put(Bytes.add(fieldTermKey, Bytes.toBytes(startDocId), Bytes.toBytes(endDocId)));
                long currentTime = System.currentTimeMillis();
                updatePut.add(SolbaseUtil.timestampColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(currentTime));
                termVectorVersionIDTable.put(updatePut);
                return new TermDocMetadataVersionIdentifier(currentTime, startDocId, endDocId);
            } else {
                // read the same family/qualifier the Put above writes
                return new TermDocMetadataVersionIdentifier(Bytes.toLong(result.getValue(SolbaseUtil.timestampColumnFamilyName, Bytes.toBytes(""))), startDocId, endDocId);
            }
        } finally {
            SolbaseUtil.releaseTable(termVectorVersionIDTable);
        }
    }

    public TermDocMetadataVersionIdentifier getVersionIdentifier(Term key, int startDocId, int endDocId) throws IOException {
        return TermDocMetadataLoader.getStaticVersionIdentifier(key, startDocId, endDocId);
    }

    public static Integer getChunkId(int docId) {
        // round the doc id down to the first doc id of its chunk
        return (docId / TermDocMetadataLoader.CHUNK_SIZE) * TermDocMetadataLoader.CHUNK_SIZE;
    }

    public static void main(String[] argv) {
        System.out.println((99999 / TermDocMetadataLoader.CHUNK_SIZE) * TermDocMetadataLoader.CHUNK_SIZE);
        System.exit(0);
    }
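
    // Worked example for getChunkId with CHUNK_SIZE = 100,000 (integer division
    // truncates toward zero):
    //   getChunkId(99999)  -> (99999  / 100000) * 100000 = 0
    //   getChunkId(100000) -> (100000 / 100000) * 100000 = 100000
    //   getChunkId(250123) -> (250123 / 100000) * 100000 = 200000
    // Every doc id maps to the first doc id of its 100,000-doc chunk, which is
    // exactly what main() above prints for 99999.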

    @Override
    public void updateObject(CachedObjectWrapper<CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier> object, TermDocMetadata modificationData,
            LayeredCache<Term, CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier, TermDocMetadata> cache, LayeredCache.ModificationType modType, int startDocId, int endDocId) throws IOException {
        int docId = modificationData.getDocId();
        CompactedTermDocMetadataArray ctdma = object.getValue();

        if (modType == ModificationType.DELETE) {
            // deleting this doc's term vector from the compacted term vector byte array
            if (ctdma != null) {
                ctdma.readWriteLock.writeLock().lock();
                try {
                    int docAmount = ctdma.getDocAmount();
                    SolbaseTermDocs std = new SolbaseTermDocs(ctdma);

                    // offset into the byte array
                    int prevPosition = std.termDocs.currentPostion();
                    // walk the term vector array until the current doc id reaches the one to delete
                    while (std.next() && std.doc() < docId) {
                        prevPosition = std.termDocs.currentPostion();
                    }
                    // if the current doc id matches the one to delete, splice its term vector out of the array
                    if (std.doc() == docId) {
                        int currentPosition = std.termDocs.currentPostion();
                        docAmount--;
                        ctdma.setDocAmount(docAmount);
                        ctdma.deleteTermVector(prevPosition, currentPosition);
                    }

                    Term term = modificationData.getTerm();
                    logger.debug("term delete: " + term.toString() + " has this many docs: " + docAmount + " docId: " + docId);

                    TermDocMetadataVersionIdentifier newVersionId = new TermDocMetadataVersionIdentifier(System.currentTimeMillis(), startDocId, endDocId);
                    object.setVersionIdentifier(newVersionId);
                    modificationData.versionIdentifier = newVersionId;
                } finally {
                    ctdma.readWriteLock.writeLock().unlock();
                }
            }
        } else {
            Term term = modificationData.getTerm();
            logger.debug("entering tv updateObject for " + term.toString() + " docId: " + docId);
            if (ctdma != null) {
                ctdma.readWriteLock.writeLock().lock();
                try {
                    int docAmount = ctdma.getDocAmount();
                    byte[] byteArray = ctdma.getTermDocMetadataArray();
                    SolbaseTermDocs std = new SolbaseTermDocs(ctdma);

                    int prevPosition = std.termDocs.currentPostion();
                    while (std.next() && std.doc() < docId) {
                        prevPosition = std.termDocs.currentPostion();
                    }

                    byte[] newTdm = Bytes.add(Bytes.toBytes(docId), Bytes.toBytes(modificationData.serialize()));

                    if (std.doc() == docId) {
                        // UPDATING EXISTING DOC
                        int currentPosition = std.termDocs.currentPostion();
                        int prevTermDocSize = currentPosition - prevPosition;

                        // if the new term doc fits into the existing space, update in place
                        if (newTdm.length < prevTermDocSize + ctdma.bufferedSize()) {
                            ctdma.updateTermVector(prevPosition, currentPosition, newTdm);
                        } else {
                            // the new term doc metadata won't fit in the buffered space,
                            // so copy everything over into a newly sized array
                            ByteArrayOutputStream baos = new ByteArrayOutputStream(byteArray.length + newTdm.length);
                            baos.write(byteArray, 0, prevPosition);
                            baos.write(newTdm);
                            baos.write(byteArray, currentPosition, byteArray.length - currentPosition);

                            // calculate the new buffer size and pad with zeros
                            int origSize = baos.size();
                            int bufferedSize = CompactedTermDocMetadataArray.bufferTermVectorArray(baos);
                            baos.write(new byte[bufferedSize - origSize], 0, bufferedSize - origSize);

                            byteArray = baos.toByteArray();
                            ctdma.setTermDocMetadataArray(byteArray);
                            ctdma.setTermVectorSize(origSize);
                        }
                    } else {
                        // ADDING NEW DOC
                        if (newTdm.length < ctdma.bufferedSize()) {
                            docAmount++;
                            int currentPosition = std.termDocs.currentPostion();
                            ctdma.setDocAmount(docAmount);
                            ctdma.addTermVector(prevPosition, currentPosition, newTdm);
                        } else {
                            // the new term doc metadata won't fit in the buffered space,
                            // so copy everything over into a newly sized array
                            int termVectorSize = ctdma.getTermVectorSize();
                            ByteArrayOutputStream baos = new ByteArrayOutputStream(termVectorSize + newTdm.length);
                            baos.write(byteArray, 0, prevPosition);
                            baos.write(newTdm);
                            baos.write(byteArray, prevPosition, termVectorSize - prevPosition);
                            docAmount++;

                            // calculate the new buffer size and pad with zeros
                            int origSize = baos.size();
                            int bufferedSize = CompactedTermDocMetadataArray.bufferTermVectorArray(baos);
                            baos.write(new byte[bufferedSize - origSize], 0, bufferedSize - origSize);

                            byteArray = baos.toByteArray();
                            ctdma.setDocAmount(docAmount);
                            ctdma.setTermDocMetadataArray(byteArray);
                            ctdma.setTermVectorSize(origSize);
                        }
                    }

                    logger.debug("term update: " + term.toString() + " has this many docs: " + docAmount + " docId: " + docId);

                    TermDocMetadataVersionIdentifier newVersionId = new TermDocMetadataVersionIdentifier(System.currentTimeMillis(), startDocId, endDocId);
                    object.setVersionIdentifier(newVersionId);
                    modificationData.versionIdentifier = newVersionId;
                } finally {
                    ctdma.readWriteLock.writeLock().unlock();
                }
            }
        }
    }
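
    // Sketch of the splice updateObject performs (the layout is an illustration;
    // the exact encoding lives in CompactedTermDocMetadataArray/SolbaseTermDocs):
    //
    //   [ entries < docId ][ entry for docId ][ entries > docId ][ 0-padding ]
    //   ^ prevPosition     ^                  ^ currentPosition
    //
    // An UPDATE overwrites the middle span in place when the replacement fits in
    // the old entry plus the zero-padded slack (bufferedSize()); otherwise the
    // whole array is rebuilt and re-padded via bufferTermVectorArray(). An ADD
    // inserts the new entry at the sorted position found by the doc-id walk.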

    @Override
    public void updateObjectStore(Term key, TermDocMetadata modificationData, IndexWriter writer,
            LayeredCache<Term, CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier, TermDocMetadata> cache, LayeredCache.ModificationType modType, int startDocId, int endDocId) throws IOException {
        logger.debug("update store docId: " + modificationData.docId + " term: " + key.toString());
        if (modType == LayeredCache.ModificationType.DELETE) {
            // delete the term vector, then bump its version id
            writer.deleteTermVector(modificationData, startDocId, endDocId);
            writer.updateTermVectorVersionId(modificationData, startDocId, endDocId);
        } else if (modType == LayeredCache.ModificationType.UPDATE) {
            writer.updateTermVector(modificationData, startDocId, endDocId);
            writer.updateTermVectorVersionId(modificationData, startDocId, endDocId);
        } else if (modType == LayeredCache.ModificationType.ADD) {
            writer.addTermVector(modificationData);
            writer.updateTermVectorVersionId(modificationData, startDocId, endDocId);
        }
    }
}
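
// Hedged usage sketch (the cache wiring is an assumption; only loadObject's
// signature comes from this class). A LayeredCache that owns this loader would
// pull a term's postings for one shard range roughly like:
//
//   TermDocMetadataLoader loader = new TermDocMetadataLoader();
//   CachedObjectWrapper<CompactedTermDocMetadataArray, TermDocMetadataVersionIdentifier> wrapper =
//       loader.loadObject(new Term("contents", "hbase"), 0, TermDocMetadataLoader.CHUNK_SIZE, cache);
//   CompactedTermDocMetadataArray postings = wrapper.getValue();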