/*
 * Copyright (C) 2013 Omry Yadan <omry@yadan.net>
 * All rights reserved.
 *
 * See https://github.com/omry/banana/blob/master/BSD-LICENSE for licensing information
 */
package net.yadan.banana.utils;

import net.yadan.banana.DebugLevel;
import net.yadan.banana.map.IVarKeyHashMap;
import net.yadan.banana.map.VarKeyHashMap;
import net.yadan.banana.memory.Buffer;
import net.yadan.banana.memory.IBuffer;
import net.yadan.banana.memory.IMemAllocator;
import net.yadan.banana.memory.malloc.MultiSizeAllocator;
import net.yadan.banana.memory.malloc.TreeAllocator;

/**
 * A simple in-memory inverted index: each distinct (lower-cased) word is
 * mapped to a growable list of the ids of the documents it appears in.
 */
public class TextIndex {

  private static final int DOC_LIST_BLOCK_SIZE = 10;
  private static final int INITIAL_DOC_LIST_SIZE = DOC_LIST_BLOCK_SIZE - VarKeyHashMap.RESERVED_SIZE;

  public static final int DOC_LIST_ALLOCATION_SIZE_OFFSET = 0;
  public static final int DOC_LIST_SIZE_OFFSET = 1;
  private static final int DOC_LIST_DATA_OFFSET = 2;

  private int MAX_WORD_LENGTH = 30;
  private int m_maxDocListSize = -1;

  private char[] m_wordBuf;
  private int m_wordLength;
  int m_textOffset;
  IBuffer m_keyBuffer;

  private IVarKeyHashMap m_word2DocList;
  private IVarKeyHashMap m_currentDocumentWords; // TODO: should be a Set
  private IVarKeyHashMap m_stopWords; // TODO: should be a Set

  // stats
  private long m_numDocumentsIndexed = 0;
  private long m_numWordsTokenized = 0;
  private long m_totalIndexedTextSize = 0;

  /**
   * @param initialWordsCapacity initial capacity (in distinct words) of the
   *          underlying maps
   * @param maxWordLength words are truncated to this many characters during
   *          tokenization
   */
  public TextIndex(int initialWordsCapacity, int maxWordLength) {
    MAX_WORD_LENGTH = maxWordLength;
    m_wordBuf = new char[MAX_WORD_LENGTH];
    m_wordLength = 0;
    m_textOffset = 0;
    IMemAllocator docListsMemory = new TreeAllocator(initialWordsCapacity, DOC_LIST_BLOCK_SIZE, 2.0);
    IMemAllocator keys = new MultiSizeAllocator(1024, new int[] { 1, 2, 4, 8, 16, 32 }, 1.5);
    m_word2DocList = new VarKeyHashMap(docListsMemory, keys, initialWordsCapacity, 0.75);
    m_keyBuffer = new Buffer(50);
    m_currentDocumentWords = new VarKeyHashMap(docListsMemory, keys, initialWordsCapacity, 0.75);
    m_stopWords = new VarKeyHashMap(docListsMemory, keys, 100, 0.75);
  }

  /**
   * Tokenizes the text and appends the document id to the doc list of every
   * distinct, non-stop word in it.
   *
   * @return the number of words tokenized (including stop words and repeats)
   */
  public int index(int documentId, String text, char seps[]) {
    m_currentDocumentWords.clear();
    m_numDocumentsIndexed++;
    m_totalIndexedTextSize += text.length();
    int length = text.length();
    char line[] = text.toCharArray();
    m_textOffset = 0;
    int numWords = 0;
    while (m_textOffset < length) {
      nextWord(line, length, seps);
      if (m_wordLength > 0) {
        try {
          numWords++;
          m_numWordsTokenized++;
          m_keyBuffer.reset();
          m_keyBuffer.appendChars(m_wordBuf, 0, m_wordLength);
          if (m_stopWords.containsKey(m_keyBuffer)) {
            continue;
          }
          if (m_currentDocumentWords.containsKey(m_keyBuffer)) {
            // word already counted for this document
            continue;
          }
          m_currentDocumentWords.createRecord(m_keyBuffer, 0);
          int docListRecord = m_word2DocList.findRecord(m_keyBuffer);
          if (docListRecord == -1) {
            // new word, create a list of documents this word is in
            docListRecord = m_word2DocList.createRecord(m_keyBuffer, INITIAL_DOC_LIST_SIZE);
            m_word2DocList.setInt(docListRecord, DOC_LIST_ALLOCATION_SIZE_OFFSET, INITIAL_DOC_LIST_SIZE);
            m_word2DocList.setInt(docListRecord, DOC_LIST_SIZE_OFFSET, 0); // initially empty
          }

          int size = m_word2DocList.getInt(docListRecord, DOC_LIST_SIZE_OFFSET);
          int maxCap = m_word2DocList.getInt(docListRecord, DOC_LIST_ALLOCATION_SIZE_OFFSET);

          // this word appears in too many documents, stop keeping track of it
          if (m_maxDocListSize != -1 && size >= m_maxDocListSize) {
            continue;
          }

          if (size > maxCap - (VarKeyHashMap.RESERVED_SIZE + DOC_LIST_DATA_OFFSET)) {
            // doc list is full, grow it by one block
            int newSize = maxCap + DOC_LIST_BLOCK_SIZE;
            docListRecord = m_word2DocList.reallocRecord(m_keyBuffer, newSize);
            m_word2DocList.setInt(docListRecord, DOC_LIST_ALLOCATION_SIZE_OFFSET, newSize);
          }

          m_word2DocList.setInt(docListRecord, DOC_LIST_SIZE_OFFSET, size + 1);
          m_word2DocList.setInt(docListRecord, DOC_LIST_DATA_OFFSET + size, documentId);
        } catch (RuntimeException e) {
          System.out.println("Error indexing document " + documentId + ", word: "
              + new String(m_wordBuf, 0, m_wordLength));
          throw e;
        }
      }
    }
    return numWords;
  }
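  /*
   * Doc list layout: each record in m_word2DocList is an int array of the form
   *
   *   [DOC_LIST_ALLOCATION_SIZE_OFFSET] allocated capacity, in ints
   *   [DOC_LIST_SIZE_OFFSET]            number of document ids stored
   *   [DOC_LIST_DATA_OFFSET + i]        the i-th document id, in indexing order
   *
   * For example, after index() has seen documents 7 and 9 and both contain the
   * word "fox", the record keyed by "fox" holds { capacity, 2, 7, 9, ... }.
   */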
  /**
   * @return the ids of the documents containing the given word, or an empty
   *         array if the word is not in the index
   */
  public int[] find(String word) {
    m_keyBuffer.reset();
    m_keyBuffer.appendChars(word.toLowerCase().toCharArray());
    int docListRecord = m_word2DocList.findRecord(m_keyBuffer);
    if (docListRecord == -1) {
      return new int[0];
    } else {
      int size = m_word2DocList.getInt(docListRecord, DOC_LIST_SIZE_OFFSET);
      int res[] = new int[size];
      m_word2DocList.getInts(docListRecord, DOC_LIST_DATA_OFFSET, res, 0, size);
      return res;
    }
  }

  public int getNumWords() {
    return m_word2DocList.size();
  }

  /**
   * Scans forward from m_textOffset for the next separator-delimited word and
   * lower-cases it into m_wordBuf. Characters beyond MAX_WORD_LENGTH are
   * silently dropped.
   */
  private void nextWord(char line[], int length, char[] seps) {
    m_wordLength = 0;
    while (m_textOffset < length) {
      char c = line[m_textOffset++];
      boolean found_sep = false;
      for (int j = 0; j < seps.length; j++) {
        if (c == seps[j]) {
          found_sep = true;
          break;
        }
      }

      if (found_sep) {
        if (m_wordLength > 0) {
          break;
        } else {
          // no word started yet, keep looking
          continue;
        }
      }

      if (m_wordLength < MAX_WORD_LENGTH) {
        m_wordBuf[m_wordLength++] = Character.toLowerCase(c);
      }
    }
  }

  public IVarKeyHashMap getWord2DocList() {
    return m_word2DocList;
  }

  public long computeMemoryUsage() {
    return m_word2DocList.computeMemoryUsage();
  }

  public long getNumDocumentsIndexed() {
    return m_numDocumentsIndexed;
  }

  public long getNumWordsTokenized() {
    return m_numWordsTokenized;
  }

  public long getTotalIndexedTextSize() {
    return m_totalIndexedTextSize;
  }

  public void setDebug(boolean b) {
    if (b) {
      m_word2DocList.setDebug(DebugLevel.DEBUG_CONTENT);
      m_currentDocumentWords.setDebug(DebugLevel.DEBUG_STRUCTURE);
      m_stopWords.setDebug(DebugLevel.DEBUG_CONTENT);
    }
  }

  /**
   * Adds a word that index() will skip. Stop words are matched after
   * lower-casing, like regular words.
   */
  public void addStopWord(String word) {
    m_keyBuffer.reset();
    m_keyBuffer.appendChars(word.toLowerCase().toCharArray());
    m_stopWords.createRecord(m_keyBuffer, 0);
  }

  /**
   * Caps the number of document ids kept per word; -1 (the default) means
   * unlimited.
   */
  public void setMaxDocListSize(int maxDocListSize) {
    m_maxDocListSize = maxDocListSize;
  }
}
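
/*
 * Minimal usage sketch. TextIndexDemo is illustrative only: it assumes the
 * caller supplies the separator set, and that document ids come back from
 * find() in indexing order.
 */
class TextIndexDemo {
  public static void main(String[] args) {
    char[] seps = { ' ', '\t', '\n', ',', '.' };
    TextIndex index = new TextIndex(1024, 30);
    index.addStopWord("the");

    index.index(1, "The quick brown fox jumps over the lazy dog.", seps);
    index.index(2, "A quick brown dog.", seps);

    // "quick" appears in both documents; expect [1, 2]
    System.out.println(java.util.Arrays.toString(index.find("quick")));
    // "the" was registered as a stop word, so it was never indexed; expect []
    System.out.println(java.util.Arrays.toString(index.find("the")));
  }
}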