/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.dictionary;

import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.hbase.HDML;
import com.bizosys.hsearch.hbase.HReader;
import com.bizosys.hsearch.hbase.HWriter;
import com.bizosys.hsearch.hbase.IScanCallBack;
import com.bizosys.hsearch.hbase.NV;
import com.bizosys.hsearch.schema.IOConstants;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.hsearch.util.RecordScalar;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.util.StringUtils;

/**
 * A per-tenant word dictionary kept in the dictionary table
 * ({@code IOConstants.TABLE_DICTIONARY}). Every row key is built as
 * <code>tenant + "/" + word</code>.
 * <p>
 * Besides exact lookups against the table, the dictionary keeps an in-memory
 * copy of all words (see {@link #buildTerms()}) that is scanned by
 * {@link #fuzzy(String, int)} and {@link #regex(String)}.
 * <p>
 * Author's note (wording clarified, meaning presumed): a dictionary operation
 * takes around 1 second. This should be taken care of by batching; the
 * BatchProcessor should do this one by one. Mass updates should also be
 * supported.
 *
 * @author karan
 */
public class Dictionary {

	/**
	 * Character separating multiple keywords inside one merged line.
	 */
	private static final char KEYWORD_SEPARATOR = '\t';

	/**
	 * Many words forming a single line. Stacking many words
	 * in a line helps saving the storage space for fuzzy and regex queries.
	 */
	int termMergeFactor = 1000;

	/**
	 * On retrieving dictionary, number of words per page.
	 * Stored by the constructor; not read anywhere inside this class.
	 */
	int pageSize = 1000;

	/**
	 * The whole dictionary is stored as multiple lines and
	 * in each line multiple words. This enables faster pattern
	 * and fuzzy matching.
	 * <p>
	 * The list is never modified once it has been assigned here; a rebuild or
	 * a clean-up replaces the reference instead. The field is volatile so a
	 * reader that is not holding the instance lock sees the latest list.
	 */
	volatile List<String> mergedWordLines = new ArrayList<String>(100);

	/** Passed to {@code HWriter.getInstance(boolean)} for every write. */
	boolean threadSafe = false;

	/** Debug flag, sampled once when the instance is created. */
	private boolean isDebugEnabled = DictLog.l.isDebugEnabled();

	/** Row key prefix, <code>tenant + "/"</code>. */
	private String wordPrefix = StringUtils.Empty;

	/** Time (epoch milliseconds) this dictionary was last used. */
	protected long touchTime = System.currentTimeMillis();

	/** Tenant owning this dictionary. */
	public String tenant = StringUtils.Empty;

	/**
	 * Constructor
	 * @param tenant Tenant owning the dictionary. Must not be empty.
	 * @param termMergeFactor Many words forming a single line.
	 * @param pageSize On retrieving dictionary, number of words per page
	 * @param threadSafe Enable thread Safety.
	 * @throws ApplicationFault When the tenant is empty
	 */
	public Dictionary(String tenant, int termMergeFactor,
		int pageSize, boolean threadSafe) throws ApplicationFault {

		if ( StringUtils.isEmpty(tenant) ) {
			throw new ApplicationFault("No tenant");
		}
		this.tenant = tenant;
		this.wordPrefix = tenant + "/" ;
		this.termMergeFactor = termMergeFactor;
		this.pageSize = pageSize;
		this.threadSafe = threadSafe;
		this.touchTime = System.currentTimeMillis();
	}

	/**
	 * Add entries to the dictionary. Null entries and entries
	 * without a word are skipped.
	 * @param keywords Dictionary words. A null map is ignored.
	 * @throws SystemFault Error
	 */
	public void add(Map<String, DictEntry> keywords) throws SystemFault {

		if ( null == keywords ) {
			return;
		}
		if (isDebugEnabled) {
			DictLog.l.debug("Dictionary> Adding Keywords :" + keywords.size());
		}

		List<RecordScalar> records = null;
		try {
			records = ObjectFactory.getInstance().getScalarRecordList();
			for (DictEntry entry : keywords.values()) {
				// Validate before touching entry.word: logging first used to
				// throw NullPointerException on a null entry when debug was on.
				if ( null == entry || null == entry.word ) {
					continue;
				}
				if (isDebugEnabled) {
					DictLog.l.debug("Dictionary> Word = " + entry.word);
				}

				Storable pk = new Storable(wordPrefix + entry.word);
				DictEntryMerge scalar = new DictEntryMerge(
					pk, IOConstants.DICTIONARY_BYTES,
					IOConstants.DICTIONARY_TERM_BYTES, entry);
				records.add(scalar);
			}
			HWriter.getInstance(threadSafe).mergeScalar(
				IOConstants.TABLE_DICTIONARY, records);

		} catch (Exception ex) {
			DictLog.l.error(ex);
			throw new SystemFault(ex);
		} finally {
			// Hand the borrowed list back to the factory, success or not.
			if ( null != records ) {
				ObjectFactory.getInstance().putScalarRecordList(records);
			}
			this.touchTime = System.currentTimeMillis();
		}
	}

	/**
	 * Find exact entry detail from the dictionary.
	 * @param keyword Word to be searched
	 * @return The Dictionary Entry for the word, or null when the keyword
	 * is empty or no data is stored for it
	 * @throws SystemFault Error condition
	 */
	public DictEntry get(String keyword) throws SystemFault {

		if ( StringUtils.isEmpty(keyword) ) {
			return null;
		}
		if (isDebugEnabled) {
			DictLog.l.debug("Dictionary> Getting Keyword :" + keyword);
		}

		try {
			NV kv = new NV(IOConstants.DICTIONARY_BYTES,
				IOConstants.DICTIONARY_TERM_BYTES);
			Storable pk = new Storable(wordPrefix + keyword);
			RecordScalar scalar = new RecordScalar(pk, kv);
			HReader.getScalar(IOConstants.TABLE_DICTIONARY, scalar);
			if ( null == scalar.kv.data ) {
				return null;
			}
			return new DictEntry(scalar.kv.data.toBytes());
		} catch (Exception ex) {
			throw new SystemFault(
				"Error in dictionary resolution for :" + keyword, ex);
		} finally {
			this.touchTime = System.currentTimeMillis();
		}
	}

	/**
	 * Stream all values.
	 * @param indexLetters Handed unchanged to
	 * {@code HReader.getAllValues} as its third argument
	 * @param writer Receives the entries through a
	 * {@code StreamDictionaryEntries} callback
	 * @throws SystemFault Error condition
	 */
	public void getAll(String indexLetters, Writer writer) throws SystemFault {

		IScanCallBack callBack = new StreamDictionaryEntries(writer);
		NV nv = new NV(IOConstants.DICTIONARY_BYTES,
			IOConstants.DICTIONARY_TERM_BYTES);
		// NOTE(review): indexLetters is passed without the tenant wordPrefix,
		// unlike every other method of this class - confirm that callers
		// already include the tenant prefix.
		HReader.getAllValues(IOConstants.TABLE_DICTIONARY,
			nv, indexLetters, callBack);
		this.touchTime = System.currentTimeMillis();
	}

	/**
	 * Builds the dictionary terms for regex and fuzzy searches
	 * @throws SystemFault Storage Failure
	 */
	public synchronized void buildTerms() throws SystemFault {

		DictLog.l.info("Dictionary> Term building START");
		DictionaryBook book = new DictionaryBook(
			termMergeFactor, KEYWORD_SEPARATOR, this.wordPrefix);
		NV kv = new NV(IOConstants.DICTIONARY_BYTES,
			IOConstants.DICTIONARY_TERM_BYTES);
		HReader.getAllKeys(IOConstants.TABLE_DICTIONARY,
			kv, this.wordPrefix, book);

		/**
		 * Swap the temp with merged one
		 * TODO:// This is not memory efficient with growing number of words
		 * Think of finding and deleting from the stack
		 */
		List<String> rebuilt = book.getLines();
		if ( null == rebuilt ) {
			rebuilt = new ArrayList<String>(0);
		}
		// The previous list is deliberately NOT cleared: fuzzy() takes no
		// lock and may still be iterating it. It becomes garbage once the
		// last reader is done with it.
		this.mergedWordLines = rebuilt;

		DictLog.l.info("Dictionary> Term building END");
		this.touchTime = System.currentTimeMillis();
	}

	/**
	 * Uses fuzzy mechanism for searching.
	 * @param searchWord Fuzzy word to be scanned
	 * @param fuzzyFactor Low fuzzy means accurate matching.
	 * A value of 3 is a good fuzzy matching for named.
	 * @return Matching words
	 */
	public List<String> fuzzy(String searchWord, int fuzzyFactor) {

		DistanceImpl dis = new DistanceImpl();
		List<String> foundWords = new ArrayList<String>();

		// Read the field once; the whole scan runs against this snapshot.
		List<String> lines = this.mergedWordLines;
		if ( null != lines ) {
			for (String text : lines) {
				int index1 = 0;
				int index2 = text.indexOf(KEYWORD_SEPARATOR);

				// A non-empty line without any separator is one single word,
				// the same way regex() treats it.
				if ( index2 == -1 && text.length() > 0 ) {
					if ( dis.getDistance(searchWord, text) <= fuzzyFactor ) {
						foundWords.add(text);
					}
					continue;
				}

				while (index2 >= 0) {
					String token = text.substring(index1, index2);
					index1 = index2 + 1;
					index2 = text.indexOf(KEYWORD_SEPARATOR, index1);
					if ( StringUtils.isEmpty(token) ) {
						continue;
					}
					if ( dis.getDistance(searchWord, token) <= fuzzyFactor ) {
						foundWords.add(token);
					}
				}
			}
		}
		this.touchTime = System.currentTimeMillis();
		return foundWords;
	}

	/**
	 * Uses regular expression to find it. A word is reported when the
	 * pattern is found anywhere inside it ({@link Matcher#find()}).
	 * @param pattern The regex pattern for the word
	 * @return List of matching words
	 */
	public synchronized List<String> regex(String pattern) {

		Pattern p = Pattern.compile(pattern);
		List<String> matchedWords = new ArrayList<String>();

		// Read the field once; the whole scan runs against this snapshot.
		List<String> lines = this.mergedWordLines;
		if ( null != lines ) {
			for (String text : lines) {
				int readIndex = 0;
				int foundIndex = text.indexOf(KEYWORD_SEPARATOR);

				// A non-empty line without any separator is one single word.
				if ( foundIndex == -1 && text.length() > 0 ) {
					Matcher m = p.matcher(text);
					if ( m.find() ) {
						matchedWords.add(text);
					}
				}

				while (foundIndex >= 0) {
					String token = text.substring(readIndex, foundIndex);
					Matcher m = p.matcher(token);
					if ( m.find() ) {
						matchedWords.add(token);
					}
					readIndex = foundIndex + 1;
					foundIndex = text.indexOf(KEYWORD_SEPARATOR, readIndex);
				}
			}
		}
		this.touchTime = System.currentTimeMillis();
		return matchedWords;
	}

	/**
	 * Delete the occurance of supplied words from dictionary
	 * @param keywords The words to be deleted. A null collection is
	 * ignored and empty words are skipped.
	 * @throws SystemFault Error condition
	 */
	public void delete(Collection<String> keywords) throws SystemFault {

		if ( null == keywords ) {
			return;
		}

		List<byte[]> deletes = null;
		try {
			deletes = ObjectFactory.getInstance().getByteArrList();
			for (String keyword : keywords) {
				if ( StringUtils.isEmpty(keyword) ) {
					continue;
				}
				deletes.add(Storable.putString(wordPrefix + keyword));
			}
			HDML.truncateBatch(IOConstants.TABLE_DICTIONARY, deletes);

		} catch (Exception ex) {
			DictLog.l.error(ex);
			throw new SystemFault(ex);
		} finally {
			// Hand the borrowed list back to the factory, success or not.
			if ( null != deletes ) {
				ObjectFactory.getInstance().putByteArrList(deletes);
			}
			this.touchTime = System.currentTimeMillis();
		}
	}

	/**
	 * Delete a single word from the dictionary.
	 * @param keyword The word to be deleted. An empty word is ignored.
	 * @throws SystemFault Error condition
	 */
	public void delete(String keyword) throws SystemFault {

		if ( StringUtils.isEmpty(keyword) ) {
			return;
		}

		try {
			Storable pk = new Storable(wordPrefix + keyword);
			HWriter.getInstance(threadSafe).delete(
				IOConstants.TABLE_DICTIONARY, pk);
		} catch (Exception ex) {
			DictLog.l.error(ex);
			throw new SystemFault(ex);
		} finally {
			this.touchTime = System.currentTimeMillis();
		}
	}

	/**
	 * Lower the sighting frequencies of the dictionary entries.
	 * Null entries and entries without a word are skipped.
	 * @param keywords "Keyword-Dictionary Entry" map. A null map is ignored.
	 * @throws SystemFault Error condition
	 */
	public void substract(Map<String, DictEntry> keywords) throws SystemFault {

		if ( null == keywords ) {
			return;
		}

		List<RecordScalar> records =
			new ArrayList<RecordScalar>(keywords.size());
		for (DictEntry entry : keywords.values()) {
			if ( null == entry || null == entry.word ) {
				continue;
			}
			DictEntrySubstract scalar = new DictEntrySubstract(
				new Storable(wordPrefix + entry.word),
				IOConstants.DICTIONARY_BYTES,
				IOConstants.DICTIONARY_TERM_BYTES, entry);
			records.add(scalar);
		}

		try {
			HWriter.getInstance(threadSafe).mergeScalar(
				IOConstants.TABLE_DICTIONARY, records);
		} catch (Exception ex) {
			DictLog.l.error(ex);
			throw new SystemFault(ex);
		} finally {
			this.touchTime = System.currentTimeMillis();
		}
	}

	/**
	 * Drops the in-memory words used by fuzzy and regex searches.
	 * The list is replaced rather than cleared in place, so a search
	 * that is still iterating the old list is not disturbed.
	 */
	public void clean() {
		this.mergedWordLines = new ArrayList<String>(0);
	}
}