/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.solbase.lucenehbase;

import java.io.IOException;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentSkipListMap;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.EmbeddedSortField;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.solbase.SolbaseUtil;
import org.solbase.common.SolbaseException;
import org.solbase.indexer.ParsedDoc;
import org.solbase.indexer.SolbaseIndexUtil;

/**
 * Writes Lucene documents and term vectors into the HBase-backed Solbase index
 * tables (doc table, docKeyIdMap table, term vector table and term vector
 * version id table).
 */
public class IndexWriter {

    private static final Logger logger = Logger.getLogger(IndexWriter.class);

    private static final InheritableThreadLocal<String> indexName = new InheritableThreadLocal<String>();

    private static SolbaseIndexUtil indexUtil;

    // how to normalize
    private Similarity similarity = Similarity.getDefault();

    // going to hold onto puts until later
    public List<Put> puts = new ArrayList<Put>();

    public IndexWriter() {
    }

    public IndexWriter(String indexName) {
        setIndexName(indexName);
    }

    public void setIndexUtil(SolbaseIndexUtil indexUtil) {
        IndexWriter.indexUtil = indexUtil;
    }

    public void addDocument(Put documentPut, Document doc) {
        byte[] docId = documentPut.getRow();
        String uniqId = doc.get("global_uniq_id");

        if (uniqId != null && docId != null) {
            // for remote server update via solr update, we want to use
            // getDocTable(), but for now map/red can use local htable
            HTableInterface docTable = SolbaseUtil.getDocTable();
            // insert document to doctable
            try {
                documentPut.add(SolbaseUtil.timestampColumnFamilyName, SolbaseUtil.tombstonedColumnFamilyQualifierBytes, Bytes.toBytes(0));
                docTable.put(documentPut);
            } catch (IOException e) {
                throw new SolbaseException(SolbaseException.ErrorCode.SERVER_ERROR,
                        e.getMessage());
            } finally {
                SolbaseUtil.releaseTable(docTable);
            }

            // need to insert to docKeyIdMap
            Put mapping = new Put(Bytes.toBytes(uniqId));
            mapping.add(Bytes.toBytes("docId"), Bytes.toBytes(""), SolbaseUtil.randomize(docId));
            mapping.add(SolbaseUtil.docIdColumnFamilyName, SolbaseUtil.tombstonedColumnFamilyQualifierBytes, Bytes.toBytes(0));
            updateDocKeyIdMap(mapping);

            logger.info("adding document: " + Bytes.toInt(SolbaseUtil.randomize(docId)) + " uniqId: " + uniqId);
        } else {
            if (uniqId == null && docId == null) {
                logger.info("both uniqId and docId are null: " + doc.toString());
            } else if (uniqId == null) {
                logger.info("uniqId is null: " + doc.toString());
            } else {
                logger.info("docId is null: " + doc.toString());
            }
        }
    }

    public void updateDocument(Put documentPut, Document doc) {
        String uniqId = doc.get("global_uniq_id");
        Put mappingPut = new Put(Bytes.toBytes(uniqId));
        mappingPut.add(SolbaseUtil.docIdColumnFamilyName, SolbaseUtil.tombstonedColumnFamilyQualifierBytes, Bytes.toBytes(0));
        updateDocKeyIdMap(mappingPut);

        // for remote server update via solr update, we want to use
        // getDocTable(), but for now map/red can use local htable
        HTableInterface docTable = SolbaseUtil.getDocTable();
        // insert document to doctable
        try {
            documentPut.add(SolbaseUtil.timestampColumnFamilyName, SolbaseUtil.tombstonedColumnFamilyQualifierBytes, Bytes.toBytes(0));
            docTable.put(documentPut);
        } catch (IOException e) {
            throw new SolbaseException(SolbaseException.ErrorCode.SERVER_ERROR, e.getMessage());
        } finally {
            SolbaseUtil.releaseTable(docTable);
        }
    }

    public void deleteDocument(Put documentPut) {
        HTableInterface docTable = SolbaseUtil.getDocTable();
        try {
            documentPut.add(SolbaseUtil.timestampColumnFamilyName, SolbaseUtil.tombstonedColumnFamilyQualifierBytes, Bytes.toBytes(1));
            docTable.put(documentPut);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            SolbaseUtil.releaseTable(docTable);
        }
    }

    public void updateDocKeyIdMap(Put docKeyIdPut) {
        // for remote server update via solr update, we want to use
        // getDocTable(), but for now map/red can use local htable
        HTableInterface docKeyIdMap = SolbaseUtil.getDocKeyIdMapTable();
        // insert mapping into the docKeyIdMap table
        try {
            docKeyIdMap.put(docKeyIdPut);
        } catch (IOException e) {
            throw new SolbaseException(SolbaseException.ErrorCode.SERVER_ERROR, e.getMessage());
        } finally {
            SolbaseUtil.releaseTable(docKeyIdMap);
        }
    }

    public void deleteDocKeyIdMap(Put mappingPut) {
        // for remote server update via solr update, we want to use
        // getDocTable(), but for now map/red can use local htable
        HTableInterface mappingTable = SolbaseUtil.getDocKeyIdMapTable();
        // delete mapping from the docKeyIdMap table
        try {
            Delete delete = new Delete(mappingPut.getRow());
            mappingTable.delete(delete);
        } catch (IOException e) {
            throw new SolbaseException(SolbaseException.ErrorCode.SERVER_ERROR, e.getMessage());
        } finally {
            SolbaseUtil.releaseTable(mappingTable);
        }
    }

    public void updateTermVector(TermDocMetadata termDocMeta) throws CorruptIndexException, IOException {
        this.addTermVector(termDocMeta, 0, 0);
        int docNumber = termDocMeta.getDocId();
        logger.info("updating term vector: " + termDocMeta.getTerm().toString() + " docId: " + docNumber);
    }

    public void addTermVector(TermDocMetadata termDocMeta) throws CorruptIndexException, IOException {
        this.addTermVector(termDocMeta, 0, 0);
        int docNumber = termDocMeta.getDocId();
        logger.info("adding term vector: " + termDocMeta.getTerm().toString() + " docId: " + docNumber);
    }
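
    /**
     * Writes a single term vector entry into the term vector table. The row
     * layout depends on {@link TermDocMetadataLoader#storageType}:
     * <ul>
     * <li>KEY_ONLY - the serialized metadata is appended to the row key itself
     * (fieldTermKey + delimiter + docId + metadata) and the column value is empty</li>
     * <li>WIDE_ROW - one row per chunk of doc ids (fieldTermKey + delimiter + chunkId),
     * with the doc id as the column qualifier and the metadata as the value</li>
     * <li>NARROW_ROW (default) - one row per doc id (fieldTermKey + delimiter + docId),
     * with the metadata stored as the column value</li>
     * </ul>
     * Note that startDocId and endDocId are currently unused in this method.
     */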
    public void addTermVector(TermDocMetadata termDocMeta, int startDocId, int endDocId) throws CorruptIndexException, IOException {
        // getting termVector table
        HTableInterface termVectorTable = SolbaseUtil.getTermVectorTable();

        try {
            byte[] key = termDocMeta.getFieldTermKey();
            ByteBuffer buf = termDocMeta.serialize();
            int docNumber = termDocMeta.getDocId();

            Put put = null;

            switch (TermDocMetadataLoader.storageType) {
            case KEY_ONLY: {
                put = new Put(Bytes.add(Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(docNumber)), Bytes.toBytes(buf)));
                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(""));
            }
                break;
            case WIDE_ROW:
                int chunkId = TermDocMetadataLoader.getChunkId(docNumber);
                put = new Put(Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(chunkId)));
                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(docNumber), Bytes.toBytes(buf));
                break;
            case NARROW_ROW:
            default: {
                put = new Put(Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(docNumber)));
                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(buf));
            }
            }

            termVectorTable.put(put);
        } catch (Exception e) {
            logger.error("failed to add term vector: " + termDocMeta.getTerm().toString() + " and docId: " + termDocMeta.docId);
        } finally {
            SolbaseUtil.releaseTable(termVectorTable);
        }
    }

    public void updateTermVector(TermDocMetadata termDocMeta, int startDocId, int endDocId) {
        // to update, we should first delete existing term doc meta data.
        try {
            // TODO: what do we do with doc update? just update anyway?
            boolean deleted = deleteTermVector(termDocMeta, startDocId, endDocId, true);
            if (deleted) {
                updateTermVector(termDocMeta);
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void updateTermVectorVersionId(TermDocMetadata termDocMeta, int startDocId, int endDocId) {
        HTableInterface versionIdTable = SolbaseUtil.getTermVectorVersionIDTable();

        Term term = termDocMeta.getTerm();
        byte[] fieldTermKey = SolbaseUtil.generateTermKey(term);

        Put updatePut = new Put(Bytes.add(fieldTermKey, Bytes.toBytes(startDocId), Bytes.toBytes(endDocId)));

        if (termDocMeta.versionIdentifier == null) {
            // we haven't loaded this term into cache yet, but need to do the
            // update with the latest version identifier
            try {
                TermDocMetadataVersionIdentifier versionIdentifier = TermDocMetadataLoader.getStaticVersionIdentifier(term, startDocId, endDocId);
                updatePut.add(SolbaseUtil.timestampColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(versionIdentifier.getVersionIdentifier()));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        } else {
            updatePut.add(SolbaseUtil.timestampColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(termDocMeta.versionIdentifier.getVersionIdentifier()));
        }

        try {
            versionIdTable.put(updatePut);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            SolbaseUtil.releaseTable(versionIdTable);
        }
    }

    public void deleteTermVectorVersionId(TermDocMetadata termDocMeta) {
        HTableInterface versionIdTable = SolbaseUtil.getTermVectorVersionIDTable();
        Term term = termDocMeta.getTerm();
        byte[] fieldTermKey = SolbaseUtil.generateTermKey(term);

        Delete delete = new Delete(fieldTermKey);
        try {
            versionIdTable.delete(delete);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            SolbaseUtil.releaseTable(versionIdTable);
        }
    }

    /**
     * By default it's not going to compare the current term vector with what's in the tv table.
     *
     * @return boolean - indicating whether the term vector has been deleted
     */
    public boolean deleteTermVector(TermDocMetadata termDocMeta, int startDocId, int endDocId) {
        return this.deleteTermVector(termDocMeta, startDocId, endDocId, false);
    }

    /**
     *
     * @param termDocMeta - term vector to be deleted
     * @param startDocId
     * @param endDocId
     * @param compare - if true, compares the new and old term vectors and, if they are the same, skips the delete
     * @return boolean - indicating whether the term vector has been deleted
     */
    public boolean deleteTermVector(TermDocMetadata termDocMeta, int startDocId, int endDocId, boolean compare) {
        // to update, we should first delete existing term doc meta data.
        // getting termVector table
        HTableInterface termVectorTable = SolbaseUtil.getTermVectorTable();
        ResultScanner fieldScanner = null;

        try {
            byte[] key = termDocMeta.getFieldTermKey();
            int docNumber = termDocMeta.getDocId();

            Delete delete = null;

            switch (TermDocMetadataLoader.storageType) {
            case KEY_ONLY: {
                byte[] termBeginKey = Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(docNumber));
                byte[] termEndKey = Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(docNumber + 1));

                Scan fieldScan = new Scan(termBeginKey, termEndKey);
                fieldScan.addFamily(SolbaseUtil.termVectorDocColumnFamilyName);
                fieldScanner = termVectorTable.getScanner(fieldScan);

                Result termDoc;
                termDoc = fieldScanner.next();
                fieldScanner.close();

                if (termDoc != null && !termDoc.isEmpty()) {
                    if (compare) {
                        byte[] oldRow = termDoc.getRow();
                        ByteBuffer buf = termDocMeta.serialize();
                        byte[] newRow = Bytes.add(Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(docNumber)), Bytes.toBytes(buf));

                        // if term vector hasn't changed, don't bother deleting
                        if (!ArrayUtils.isEquals(oldRow, newRow)) {
                            delete = new Delete(termDoc.getRow());
                        }
                    } else {
                        delete = new Delete(termDoc.getRow());
                    }
                }
            }
                break;
            case WIDE_ROW:
                int chunkId = TermDocMetadataLoader.getChunkId(docNumber);
                delete = new Delete(Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(chunkId)));
                break;
            case NARROW_ROW:
            default: {
                delete = new Delete(Bytes.add(key, SolbaseUtil.delimiter, Bytes.toBytes(docNumber)));
            }
            }

            if (delete != null) {
                termVectorTable.delete(delete);
                logger.info("deleting term vector: " + termDocMeta.getTerm().toString() + " docId: " + docNumber);
                return true;
            }
        } catch (IOException e) {
            throw new SolbaseException(SolbaseException.ErrorCode.SERVER_ERROR, e.getMessage());
        } finally {
            if (fieldScanner != null) {
                fieldScanner.close();
            }
            SolbaseUtil.releaseTable(termVectorTable);
        }
        return false;
    }

    @SuppressWarnings("unchecked")
    public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
        // given doc, what are all of the terms we indexed
        List<Term> allIndexedTerms = new ArrayList<Term>();
        Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);

        // need to hold onto TermDocMetadata, so it can return this array
        List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();

        byte[] docId = Bytes.toBytes(docNumber);
        int position = 0;

        for (Fieldable field : (List<Fieldable>) doc.getFields()) {
            // Indexed field
            if (field.isIndexed() && field.isTokenized()) {

                TokenStream tokens = field.tokenStreamValue();

                if (tokens == null) {
                    tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
                }

                // collect term information per field
                Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();

                int lastOffset = 0;
                if (position > 0) {
                    position += analyzer.getPositionIncrementGap(field.name());
                }

                tokens.reset(); // reset the TokenStream to the first token
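
                // For each token below we accumulate, per term: term frequency,
                // optional position and offset vectors, and the document's
                // embedded sort field values. Field norms are folded in after
                // the loop so they are stored alongside each term rather than
                // per field.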

                // offsets
                OffsetAttribute offsetAttribute = null;
                if (field.isStoreOffsetWithTermVector())
                    offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

                // positions
                PositionIncrementAttribute posIncrAttribute = null;
                if (field.isStorePositionWithTermVector())
                    posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);

                TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

                // store normalizations of field per term per document rather
                // than per field.
                // this adds more to write but less to read on other side
                Integer tokensInField = new Integer(0);

                while (tokens.incrementToken()) {
                    tokensInField++;
                    Term term = new Term(field.name(), termAttribute.term());

                    allIndexedTerms.add(term);

                    // fetch all collected information for this term
                    Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                    if (termInfo == null) {
                        termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                        allTermInformation.put(term, termInfo);
                    }

                    // term frequency
                    List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                    }
                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);

                    // position vector
                    if (field.isStorePositionWithTermVector()) {
                        position += (posIncrAttribute.getPositionIncrement() - 1);

                        List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);

                        if (positionVector == null) {
                            positionVector = new ArrayList<Number>();
                            termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                        }

                        positionVector.add(++position);
                    }

                    // term offsets
                    if (field.isStoreOffsetWithTermVector()) {

                        List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                        if (offsetVector == null) {
                            offsetVector = new ArrayList<Number>();
                            termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                        }

                        offsetVector.add(lastOffset + offsetAttribute.startOffset());
                        offsetVector.add(lastOffset + offsetAttribute.endOffset());
                    }

                    List<Number> sortValues = new ArrayList<Number>();
                    // init sortValues
                    for (int i = 0; i < Scorer.numSort; i++) {
                        sortValues.add(new Integer(-1));
                    }

                    int order = 0;

                    // extract sort field value and store it in term doc metadata obj
                    for (String fieldName : sortFieldNames) {
                        Fieldable fieldable = doc.getFieldable(fieldName);

                        if (fieldable instanceof EmbeddedSortField) {
                            EmbeddedSortField sortField = (EmbeddedSortField) fieldable;

                            int value = -1;
                            if (sortField.stringValue() != null) {
                                value = Integer.parseInt(sortField.stringValue());
                            }
                            int sortSlot = sortField.getSortSlot();

                            sortValues.set(sortSlot - 1, new Integer(value));
                        } else {
                            // TODO: this logic is used for real time indexing.
                            // hacky.
                            // depending on order of sort field names in array
                            int value = -1;
                            if (fieldable.stringValue() != null) {
                                value = Integer.parseInt(fieldable.stringValue());
                            }
                            sortValues.set(order++, new Integer(value));
                        }
                    }
                    termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
                }

                List<Number> bnorm = null;
                if (!field.getOmitNorms()) {
                    bnorm = new ArrayList<Number>();
                    float norm = doc.getBoost();
                    norm *= field.getBoost();
                    norm *= similarity.lengthNorm(field.name(), tokensInField);
                    bnorm.add(Similarity.encodeNorm(norm));
                }

                for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                    Term tempTerm = term.getKey();

                    byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);

                    // Mix in the norm for this field alongside each term
                    // more writes but faster on read side.
                    if (!field.getOmitNorms()) {
                        term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                    }

                    TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
                    metadatas.add(data);
                }
            }

            // Untokenized fields go in without a termPosition
            if (field.isIndexed() && !field.isTokenized()) {
                Term term = new Term(field.name(), field.stringValue());
                allIndexedTerms.add(term);

                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);

                Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
                termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));

                TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
                metadatas.add(data);
            }
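
            // Stored values carry a trailing flag byte (Byte.MAX_VALUE for
            // binary, Byte.MIN_VALUE for string). Repeated fields with the same
            // name are joined with SolbaseUtil.delimiter, dropping the previous
            // value's flag byte so only the final flag byte remains.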
            // Stores each field as a column under this doc key
            if (field.isStored()) {

                byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

                // last byte flags whether the value is binary or not
                byte[] value = new byte[_value.length + 1];

                System.arraycopy(_value, 0, value, 0, _value.length);

                value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

                // logic to handle multiple fields w/ same name
                byte[] currentValue = fieldCache.get(field.name());

                if (currentValue == null) {
                    fieldCache.put(field.name(), value);
                } else {
                    // append new data
                    byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
                    System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                    System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
                    System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);

                    fieldCache.put(field.name(), newValue);
                }
            }
        }

        Put documentPut = new Put(SolbaseUtil.randomize(docNumber));

        // Store each field as a column under this docId
        for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
            documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
        }

        // in case of real time update, we need to add back docId field
        if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
            byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());

            // last byte flags whether the value is binary or not
            byte[] value = new byte[docIdStr.length + 1];
            System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
            value[value.length - 1] = (byte) (Byte.MIN_VALUE);

            documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
        }

        // Finally, store meta-data so we can delete this document
        documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());

        ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);

        return parsedDoc;
    }

    public void updateDocument(Term updateTerm, Document doc, Analyzer analyzer, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
        // we treat add/update the same
        throw new UnsupportedOperationException();
    }

    public int docCount() {
        throw new RuntimeException("not supported");
    }

    public String getIndexName() {
        return indexName.get();
    }

    public void setIndexName(String indexName) {
        IndexWriter.indexName.set(indexName);
    }
}
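
/*
 * Usage sketch (illustrative only): a typical indexing pass parses a Lucene
 * Document into a ParsedDoc, writes the document Put, and then writes each
 * TermDocMetadata. The ParsedDoc accessor names below (getDocumentPut(),
 * getTermDocMetadatas()) are assumed for illustration; check ParsedDoc for the
 * actual API.
 *
 *   IndexWriter writer = new IndexWriter("myIndex");
 *   ParsedDoc parsed = writer.parseDoc(doc, analyzer, "myIndex", docNumber, sortFieldNames);
 *   writer.addDocument(parsed.getDocumentPut(), doc);
 *   for (TermDocMetadata meta : parsed.getTermDocMetadatas()) {
 *       writer.addTermVector(meta);
 *   }
 */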