/*
 * Copyright (C) 2013 Omry Yadan <omry@yadan.net>
 * All rights reserved.
 *
 * See https://github.com/omry/banana/blob/master/BSD-LICENSE for licensing information
 */
package net.yadan.banana.utils;

import net.yadan.banana.DebugLevel;
import net.yadan.banana.map.IVarKeyHashMap;
import net.yadan.banana.map.VarKeyHashMap;
import net.yadan.banana.memory.Buffer;
import net.yadan.banana.memory.IBuffer;
import net.yadan.banana.memory.IMemAllocator;
import net.yadan.banana.memory.malloc.MultiSizeAllocator;
import net.yadan.banana.memory.malloc.TreeAllocator;

/**
 * A simple in-memory inverted index: each distinct (lower-cased) word is
 * mapped to a growable list of the ids of the documents it appears in.
 */
public class TextIndex {

  private static final int DOC_LIST_BLOCK_SIZE = 10;
  private static final int INITIAL_DOC_LIST_SIZE = DOC_LIST_BLOCK_SIZE - VarKeyHashMap.RESERVED_SIZE;

  public static final int DOC_LIST_ALLOCATION_SIZE_OFFSET = 0;
  public static final int DOC_LIST_SIZE_OFFSET = 1;
  private static final int DOC_LIST_DATA_OFFSET = 2;

  private int MAX_WORD_LENGTH = 30;
  private int m_maxDocListSize = -1;

  private char[] m_wordBuf;
  private int m_wordLength;
  int m_textOffset;
  IBuffer m_keyBuffer;

  private IVarKeyHashMap m_word2DocList;
  private IVarKeyHashMap m_currentDocumentWords; // TODO: should be a Set
  private IVarKeyHashMap m_stopWords; // TODO: should be a Set

  // stats
  private long m_numDocumentsIndexed = 0;
  private long m_numWordsTokenized = 0;
  private long m_totalIndexedTextSize = 0;

  /**
   * @param initialWordsCapacity initial capacity (in distinct words) of the
   *          underlying maps
   * @param maxWordLength words are truncated to this many characters during
   *          tokenization
   */
  public TextIndex(int initialWordsCapacity, int maxWordLength) {
    MAX_WORD_LENGTH = maxWordLength;
    m_wordBuf = new char[MAX_WORD_LENGTH];
    m_wordLength = 0;
    m_textOffset = 0;
    IMemAllocator docListsMemory = new TreeAllocator(initialWordsCapacity, DOC_LIST_BLOCK_SIZE, 2.0);
    IMemAllocator keys = new MultiSizeAllocator(1024, new int[] { 1, 2, 4, 8, 16, 32 }, 1.5);
    m_word2DocList = new VarKeyHashMap(docListsMemory, keys, initialWordsCapacity, 0.75);
    m_keyBuffer = new Buffer(50);
    m_currentDocumentWords = new VarKeyHashMap(docListsMemory, keys, initialWordsCapacity, 0.75);
    m_stopWords = new VarKeyHashMap(docListsMemory, keys, 100, 0.75);
  }

  /**
   * Tokenizes the text and appends the document id to the doc list of every
   * distinct, non-stop word in it.
   *
   * @return the number of words tokenized (including stop words and repeats)
   */
  public int index(int documentId, String text, char seps[]) {
    m_currentDocumentWords.clear();
    m_numDocumentsIndexed++;
    m_totalIndexedTextSize += text.length();
    int length = text.length();
    char line[] = text.toCharArray();
    m_textOffset = 0;
    int numWords = 0;
    while (m_textOffset < length) {
      nextWord(line, length, seps);
      if (m_wordLength > 0) {
        try {
          numWords++;
          m_numWordsTokenized++;
          m_keyBuffer.reset();
          m_keyBuffer.appendChars(m_wordBuf, 0, m_wordLength);
          if (m_stopWords.containsKey(m_keyBuffer)) {
            continue;
          }
          if (m_currentDocumentWords.containsKey(m_keyBuffer)) {
            // word already counted for this document
            continue;
          }
          m_currentDocumentWords.createRecord(m_keyBuffer, 0);
          int docListRecord = m_word2DocList.findRecord(m_keyBuffer);
          if (docListRecord == -1) {
            // new word, create a list of documents this word is in
            docListRecord = m_word2DocList.createRecord(m_keyBuffer, INITIAL_DOC_LIST_SIZE);
            m_word2DocList.setInt(docListRecord, DOC_LIST_ALLOCATION_SIZE_OFFSET, INITIAL_DOC_LIST_SIZE);
            m_word2DocList.setInt(docListRecord, DOC_LIST_SIZE_OFFSET, 0); // initially empty
          }

          int size = m_word2DocList.getInt(docListRecord, DOC_LIST_SIZE_OFFSET);
          int maxCap = m_word2DocList.getInt(docListRecord, DOC_LIST_ALLOCATION_SIZE_OFFSET);

          // this word appears in too many documents, stop keeping track of it
          if (m_maxDocListSize != -1 && size >= m_maxDocListSize) {
            continue;
          }

          if (size > maxCap - (VarKeyHashMap.RESERVED_SIZE + DOC_LIST_DATA_OFFSET)) {
            // doc list is full, grow it by one block
            int newSize = maxCap + DOC_LIST_BLOCK_SIZE;
            docListRecord = m_word2DocList.reallocRecord(m_keyBuffer, newSize);
            m_word2DocList.setInt(docListRecord, DOC_LIST_ALLOCATION_SIZE_OFFSET, newSize);
          }

          m_word2DocList.setInt(docListRecord, DOC_LIST_SIZE_OFFSET, size + 1);
          m_word2DocList.setInt(docListRecord, DOC_LIST_DATA_OFFSET + size, documentId);
        } catch (RuntimeException e) {
          System.out.println("Error indexing document " + documentId + ", word: "
              + new String(m_wordBuf, 0, m_wordLength));
          throw e;
        }
      }
    }
    return numWords;
  }
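  /*
   * Doc list layout: each record in m_word2DocList is an int array of the form
   *
   *   [DOC_LIST_ALLOCATION_SIZE_OFFSET] allocated capacity, in ints
   *   [DOC_LIST_SIZE_OFFSET]            number of document ids stored
   *   [DOC_LIST_DATA_OFFSET + i]        the i-th document id, in indexing order
   *
   * For example, after index() has seen documents 7 and 9 and both contain the
   * word "fox", the record keyed by "fox" holds { capacity, 2, 7, 9, ... }.
   */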
  /**
   * @return the ids of the documents containing the given word, or an empty
   *         array if the word is not in the index
   */
  public int[] find(String word) {
    m_keyBuffer.reset();
    m_keyBuffer.appendChars(word.toLowerCase().toCharArray());
    int docListRecord = m_word2DocList.findRecord(m_keyBuffer);
    if (docListRecord == -1) {
      return new int[0];
    } else {
      int size = m_word2DocList.getInt(docListRecord, DOC_LIST_SIZE_OFFSET);
      int res[] = new int[size];
      m_word2DocList.getInts(docListRecord, DOC_LIST_DATA_OFFSET, res, 0, size);
      return res;
    }
  }

  public int getNumWords() {
    return m_word2DocList.size();
  }

  /**
   * Scans forward from m_textOffset for the next separator-delimited word and
   * lower-cases it into m_wordBuf. Characters beyond MAX_WORD_LENGTH are
   * silently dropped.
   */
  private void nextWord(char line[], int length, char[] seps) {
    m_wordLength = 0;
    while (m_textOffset < length) {
      char c = line[m_textOffset++];
      boolean found_sep = false;
      for (int j = 0; j < seps.length; j++) {
        if (c == seps[j]) {
          found_sep = true;
          break;
        }
      }

      if (found_sep) {
        if (m_wordLength > 0) {
          break;
        } else {
          // no word started yet, keep looking
          continue;
        }
      }

      if (m_wordLength < MAX_WORD_LENGTH) {
        m_wordBuf[m_wordLength++] = Character.toLowerCase(c);
      }
    }
  }

  public IVarKeyHashMap getWord2DocList() {
    return m_word2DocList;
  }

  public long computeMemoryUsage() {
    return m_word2DocList.computeMemoryUsage();
  }

  public long getNumDocumentsIndexed() {
    return m_numDocumentsIndexed;
  }

  public long getNumWordsTokenized() {
    return m_numWordsTokenized;
  }

  public long getTotalIndexedTextSize() {
    return m_totalIndexedTextSize;
  }

  public void setDebug(boolean b) {
    if (b) {
      m_word2DocList.setDebug(DebugLevel.DEBUG_CONTENT);
      m_currentDocumentWords.setDebug(DebugLevel.DEBUG_STRUCTURE);
      m_stopWords.setDebug(DebugLevel.DEBUG_CONTENT);
    }
  }

  /**
   * Adds a word that index() will skip. Stop words are matched after
   * lower-casing, like regular words.
   */
  public void addStopWord(String word) {
    m_keyBuffer.reset();
    m_keyBuffer.appendChars(word.toLowerCase().toCharArray());
    m_stopWords.createRecord(m_keyBuffer, 0);
  }

  /**
   * Caps the number of document ids kept per word; -1 (the default) means
   * unlimited.
   */
  public void setMaxDocListSize(int maxDocListSize) {
    m_maxDocListSize = maxDocListSize;
  }
}
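
/*
 * Minimal usage sketch. TextIndexDemo is illustrative only: it assumes the
 * caller supplies the separator set, and that document ids come back from
 * find() in indexing order.
 */
class TextIndexDemo {
  public static void main(String[] args) {
    char[] seps = { ' ', '\t', '\n', ',', '.' };
    TextIndex index = new TextIndex(1024, 30);
    index.addStopWord("the");

    index.index(1, "The quick brown fox jumps over the lazy dog.", seps);
    index.index(2, "A quick brown dog.", seps);

    // "quick" appears in both documents; expect [1, 2]
    System.out.println(java.util.Arrays.toString(index.find("quick")));
    // "the" was registered as a stop word, so it was never indexed; expect []
    System.out.println(java.util.Arrays.toString(index.find("the")));
  }
}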