TernarySearchTree.java example

Explorer

openpipe-master
- lemmatizer
  - src
    - main
      - java
        no
        trank
        openpipe
        lemmatizer
        LemmatizerStep.java
        model
        LemmaDeSerializer.java
        LemmaSuffix.java
        LemmatizeModel.java
        LemmatizeModelFactory.java
        parser
        Parser.java
        TextParser.java
        util
        TernarySearchTree.java
        TreeEntry.java
        TreeValue.java
        TreeValueFactory.java
    - test
      - java
        no
        trank
        openpipe
        lemmatizer
        model
        LemmaDeSerializerTest.java
        parser
        TextParserTest.java
        util
        TernarySearchTreeTest.java
- openpipe-core
  - src
    - main
      - java
        no
        trank
        openpipe
        MainSpring.java
        api
        BasePipelineExceptionHandler.java
        BasePipelineStep.java
        BaseSubPipeline.java
        DefaultPipelineExceptionHandler.java
        Finishable.java
        LoggingPipelineExceptionListener.java
        MultiInputFieldPipelineStep.java
        MultiInputOutputFieldPipelineStep.java
        MultiPipelineException.java
        Pipeline.java
        PipelineException.java
        PipelineExceptionHandler.java
        PipelineExceptionListener.java
        PipelineFlow.java
        PipelineFlowEnum.java
        PipelineRunner.java
        PipelineStatusCode.java
        PipelineStep.java
        PipelineStepStatus.java
        PipelineStepStatusCode.java
        SubPipeline.java
        document
        AnnotatedField.java
        Annotation.java
        BaseAnnotatedField.java
        BaseAnnotation.java
        BaseResolvedAnnotation.java
        ByteArrayRawData.java
        Document.java
        DocumentOperation.java
        DocumentProducer.java
        DomRawData.java
        FileRawData.java
        PreResolvedAnnotation.java
        RawData.java
        ResolvedAnnotation.java
        package-info.java
        package-info.java
        config
        BeanValidator.java
        annotation
        NotEmpty.java
        NotNull.java
        NullNotEmpty.java
        reader
        FileDocumentReader.java
        MultiXmlDocumentReader.java
        TextFileDocumentReader.java
        step
        AnnotateSentence.java
        AnnotateSpace.java
        AnnotationToField.java
        ChecksumFields.java
        ChopField.java
        ConvertDate.java
        CopyField.java
        Debug.java
        FieldPipelineSelector.java
        HierarchicalSplitter.java
        OperationPipelineSelector.java
        ParseXML.java
        ParseXMLXPath.java
        PipelineSelector.java
        RegexField.java
        RemoveFields.java
        SetField.java
        StripHtml.java
        Uppercase.java
        WriteXML.java
        xml
        DocumentWriter.java
        XMLStreamDocWriter.java
        util
        AcceptAllFileFilter.java
        FilesFirstComparator.java
        HexUtil.java
        IdentityHashSet.java
        Iterators.java
        QNameEditor.java
        RegexFileFilter.java
        log
        DefaultTimedLogger.java
        NoopTimedLogger.java
        TimedLogger.java
        TotalTimedLogger.java
    - test
      - java
        no
        trank
        openpipe
        api
        BaseSubPipelineTest.java
        DefaultPipelineExceptionHandlerTest.java
        PipelineTest.java
        document
        BaseAnnotatedFieldTest.java
        DocumentTest.java
        config
        BeanValidatorTest.java
        reader
        FileDocumentReaderTest.java
        MultiXmlDocumentReaderTest.java
        step
        AnnotationToFieldTest.java
        ChecksumFieldsTest.java
        ChopFieldTest.java
        ConvertDateTest.java
        HierarchicalSplitterTest.java
        OperationPipelineSelectorTest.java
        ParseXMLTest.java
        ParseXMLXPathTest.java
        RegexFieldTest.java
        SetFieldTest.java
        StripHtmlTest.java
        util
        HexUtilTest.java
        RegexFileFilterTest.java
- openpipe-jdbc
  - src
    - main
      - java
        no
        trank
        openpipe
        jdbc
        DocumentMapper.java
        HtmlJdbcStats.java
        JdbcAdmin.java
        JdbcDocumentProducer.java
        JdbcPoller.java
        JdbcStats.java
        MetaDataDocumentMapper.java
        NoopJdbcStats.java
        SimpleJdbcDocumentProducer.java
        store
        IdStateHolder.java
        StateDocumentProducer.java
        StateDocumentStep.java
        StringRowMapper.java
        TableDescription.java
    - test
      - java
        no
        trank
        openpipe
        jdbc
        SimpleJdbcDocumentProducerTest.java
        store
        StateDocumentProducerTest.java
- openpipe-lang
  - src
    - main
      - java
        no
        trank
        openpipe
        lang
        step
        LanguageIdentifier.java
    - test
      - java
        no
        trank
        openpipe
        lang
        step
        LanguageIdentifierTest.java
- openpipe-opennlp
  - src
    - main
      - java
        no
        trank
        openpipe
        opennlp
        io
        InputStreamGISModelReader.java
        step
        ONLPNEDetector.java
        ONLPSentenceDetector.java
        ONLPTokenizer.java
- openpipe-solr
  - src
    - main
      - java
        no
        trank
        openpipe
        solr
        SolrDocumentPostException.java
        SolrHttpDocumentPoster.java
        SolrXmlDocumentWriter.java
        UpdateOptions.java
        analysis
        AnnotationTokenStream.java
        Base64TokenSerializer.java
        TokenAnnotation.java
        TokenSerializer.java
        TokenStreamAnnotation.java
        step
        SolrAnalyzerStep.java
        SolrDocumentProcessor.java
        util
        TokenFilterFactoryFactory.java
        xml
        XmlInputStream.java
    - test
      - java
        no
        trank
        openpipe
        solr
        step
        SolrDocumentProcessorTest.java
        xml
        XmlInputStreamTest.java
- parse
  - src
    - main
      - java
        no
        trank
        openpipe
        parse
        api
        ParseData.java
        Parser.java
        ParserException.java
        ParserResult.java
        ParserResultImpl.java
        PipelineParseData.java
        package-info.java
        step
        DocumentParser.java
        package-info.java
- parse-misc
  - src
    - main
      - java
        no
        trank
        openpipe
        parse
        oo
        OOParser.java
        pdf
        PDFParser.java
        text
        TextDecoder.java
        TextParser.java
        xml
        XMLParser.java
    - test
      - java
        no
        trank
        openpipe
        parse
        oo
        OOParserTest.java
        xml
        XMLParserTest.java
- parse-ms
  - src
    - main
      - java
        no
        trank
        openpipe
        parse
        ms
        ExcelParser.java
        POIUtils.java
        PowerPointParser.java
        WordParser.java
    - test
      - java
        no
        trank
        openpipe
        parse
        ms
        AbstractMsParserTest.java
        ExcelParserTest.java
        PowerPointParserTest.java
        WordParserTest.java
- solr-producer
  - src
    - main
      - java
        no
        trank
        openpipe
        solr
        producer
        SolrDocumentProducer.java
        SolrUpdateServlet.java
        xml
        XmlStreamDocumentReader.java
    - test
      - java
        no
        trank
        openpipe
        solr
        producer
        SolrUpdateServletTest.java
        xml
        XmlStreamDocumentReaderTest.java
- solr-tokenizer
  - src
    - main
      - java
        no
        trank
        openpipe
        solr
        analysis
        BinaryIO.java
        BinaryTokenDeserializer.java
        BinaryTokenDeserializerFactory.java
        io
        Base64InputStream.java
        Base64Output.java
        Base64OutputBuffer.java
        Base64OutputStream.java
        schema
        Base64Type.java
        util
        IOUtil.java
    - test
      - java
        no
        trank
        openpipe
        solr
        analysis
        BinaryIOTest.java
        BinaryTokenDeserializerTest.java
        io
        Base64InputStreamTest.java
        Base64OutputBufferTest.java
        Base64OutputStreamTest.java
        util
        IOUtilTest.java
- tutorial-intranet
  - src
    - main
      - java
        no
        trank
        openpipe
        tutorial
        intranet
        Main.java
    - test
      - java
        TestMainIntranet.java
- wikipedia
  - src
    - main
      - java
        no
        trank
        openpipe
        wikipedia
        WikipediaDumpHandler.java
        download
        DownloadProgressListener.java
        DownloadProgressLogger.java
        DownloadingWikipediaDumpHandler.java
        HttpDownloader.java
        NullProgressListener.java
        producer
        InputStreamPrefixStripper.java
        WikiDocumentSplitter.java
        WikipediaDocumentProducer.java
        meta
        RssMetaParser.java
        step
        WikipediaUrlBuilder.java
    - test
      - java
        no
        trank
        openpipe
        wikipedia
        download
        HttpDownloaderTest.java
        producer
        WikiDocumentSplitterTest.java
        meta
        RssMetaParserTest.java
        step
        WikipediaUrlBuilderTest.java

package no.trank.openpipe.lemmatizer.util;

/*		 
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2007 Sebastiano Vigna 
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 2.1 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/

import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;

/**
 * A path-compressed ternary search tree mapping non-empty character sequences to values.
 *
 * <p>Every node stores a non-empty run of characters ({@code path}). All characters of the path but
 * the last must match the key exactly; the last character is the one the node branches on
 * (smaller goes left, greater goes right, equal goes to the middle subtree).
 *
 * <p>The tree can be serialized with {@link #write(OutputStream)} and restored with
 * {@link #read(InputStream)}. No synchronization is performed by this class.
 *
 * @param <V> the type of the values stored in the tree.
 */
public class TernarySearchTree<V extends TreeValue> {
   /**
    * Version of the serialized format, stored as the last header byte.
    */
   private static final byte VERSION = (byte) 1;
   /**
    * Magic bytes written at the start of a serialized tree and verified when reading one.
    */
   private static final byte[] HEADER = new byte[]{(byte) 'o', (byte) '|', VERSION};
   /**
    * The root of the tree, or <code>null</code> if the tree is empty.
    */
   private Node<V> root;
   /**
    * The number of keys stored in the tree.
    */
   private int size;
   /**
    * Creates values and reads/writes the value-specific part of the serialized header.
    */
   private final TreeValueFactory<V> factory;
   /**
    * Zeta-code parameter used when (de)serializing the length of a node path.
    */
   private static final int LEN_ZK = 1;
   /**
    * Zeta-code parameter used when (de)serializing the characters of a node path.
    */
   private static final int CHAR_ZK = 3;

   /**
    * Creates a new empty ternary search tree.
    *
    * @param factory the factory used to create values when the tree is read from a stream.
    * @throws NullPointerException if <code>factory</code> is <code>null</code>.
    */
   public TernarySearchTree(TreeValueFactory<V> factory) {
      if (factory == null) {
         throw new NullPointerException("TreeValueFactory cannot be null");
      }
      this.factory = factory;
   }

   /**
    * Creates a new ternary search tree and populates it with the given entries.
    *
    * @param c       the entries (key and value) to insert, in iteration order.
    * @param factory the factory used to create values when the tree is read from a stream.
    * @throws NullPointerException     if <code>factory</code> is <code>null</code>.
    * @throws IllegalArgumentException if an entry has an empty key.
    */
   public TernarySearchTree(final Iterable<TreeEntry<V>> c, TreeValueFactory<V> factory) {
      this(factory);
      for (final TreeEntry<V> entry : c) {
         put(entry.getKey(), entry.getValue());
      }
   }


   /**
    * Returns the value associated with the given key.
    *
    * @param s the key to look up.
    * @return the value stored for <code>s</code>, or <code>null</code> if the key is not present
    *         (this includes the empty key, which is never stored).
    */
   public V get(final CharSequence s) {
      final int l = s.length();

      Node<V> e = root;
      int offset = 0;

      while (e != null) {
         final char[] path = e.path;
         int i = 0;
         // All characters of the path but the last one must match the key exactly.
         for (; i < path.length - 1; i++) {
            if (offset + i == l || s.charAt(offset + i) != path[i]) {
               return null;
            }
         }

         offset += i;
         if (offset == l) {
            // The key ends inside the compressed path, so it is not a stored key.
            return null;
         }

         // The last character of the path is the one we branch on.
         final char c = s.charAt(offset);
         if (c < e.path[i]) {
            e = e.left;
         } else if (c > e.path[i]) {
            e = e.right;
         } else {
            offset++;
            if (offset == l) {
               return e.value;
            }
            e = e.middle;
         }
      }

      return null;
   }

   /**
    * Checks whether a non-<code>null</code> value is associated with the given key.
    *
    * @param s the key to look up.
    * @return <code>true</code> if {@link #get(CharSequence)} returns a non-<code>null</code> value.
    */
   public boolean contains(CharSequence s) {
      return get(s) != null;
   }

   /**
    * True if the last {@link #put(CharSequence, TreeValue)} added a new key to the tree.
    */
   private boolean modified;

   /**
    * Associates a value with a key, replacing the value of a key that is already present.
    *
    * <p>Note that a <code>null</code> value is indistinguishable from "no value" when looked up with
    * {@link #get(CharSequence)} or {@link #contains(CharSequence)}.
    *
    * @param s     the key; must contain at least one character.
    * @param value the value to associate with the key.
    * @return <code>true</code> if the key was not in the tree before this call (the size grew by one),
    *         <code>false</code> if only the value of an existing key was replaced.
    * @throws IllegalArgumentException if <code>s</code> is empty.
    */
   public boolean put(final CharSequence s, final V value) {
      if (s.length() == 0) {
         // Every node needs at least one character; an empty key would create a node with an
         // empty path that breaks every later lookup and insertion.
         throw new IllegalArgumentException("Cannot insert an empty key");
      }
      modified = false;
      root = addRec(s, 0, s.length(), root, value);
      return modified;
   }

   /**
    * Inserts the given character sequence, starting at the given position, in the given subtree.
    *
    * @param s      the character sequence containing the characters to be inserted.
    * @param offset the first character to be inserted.
    * @param length the number of characters to be inserted (at least one).
    * @param e      the subtree in which the characters should be inserted, or <code>null</code> if
    *               a new node should be created.
    * @param value  the value to store at the node terminating the sequence.
    * @return the new node at the top of the subtree.
    */
   private Node<V> addRec(final CharSequence s, final int offset, final int length, final Node<V> e, final V value) {

      if (e == null) {
         // We create a new node containing all the characters and return it.
         modified = true;
         size++;
         return new Node<V>(s, offset, length, value);
      }

      /* We start scanning the path contained in the current node, up to
       * the last character excluded. If we find a mismatch, or if we exhaust our
       * characters, we must fork this node. */

      int i;
      Node<V> n = null;
      final char[] path = e.path;

      for (i = 0; i < path.length - 1; i++) {
         final char c = s.charAt(offset + i);

         if (c < path[i]) {
            /* We fork on the left, keeping just the first i + 1 characters (this is necessary
             * as at least one character must be present in every node). The new
             * node carries no value of its own.
             */
            n = new Node<V>(path, 0, i + 1, null);

            n.middle = e;
            e.removePathPrefix(i + 1);

            n.left = addRec(s, offset + i, length - i, null, value);
            break;
         } else if (c > path[i]) {
            // As before, but on the right.
            n = new Node<V>(path, 0, i + 1, null);

            n.middle = e;
            e.removePathPrefix(i + 1);

            n.right = addRec(s, offset + i, length - i, null, value);
            break;
         } else {
            if (i == length - 1) {
               /* We exhausted the character sequence. We fork in the middle,
                * keeping length characters and storing the value in the new node,
                * which terminates the inserted key. */
               n = new Node<V>(s, offset, length, value);
               n.middle = e;
               e.removePathPrefix(length);
               size++;
               modified = true;
               break;
            }
         }
      }

      if (i < path.length - 1) {
         // We left the loop through one of the forks above.
         return n;
      }

      /* We are positioned on the last character of the path. In this case our
       * behaviour is different, as if we must fork we must not perform any
       * splitting. Moreover, if we exhaust the characters we either found
       * the sequence in the tree, or we just have to store the value in the node. */

      final char c = s.charAt(offset + i);

      if (c < path[i]) {
         // We continue (or create a new node) on the left.
         e.left = addRec(s, offset + i, length - i, e.left, value);
      } else if (c > path[i]) {
         // We continue (or create a new node) on the right.
         e.right = addRec(s, offset + i, length - i, e.right, value);
      } else {
         if (i == length - 1) {
            // This is the node. The key is new only if the node carried no value so far
            // (it existed merely as a fork point); otherwise we just replace the value.
            modified = e.value == null;
            if (modified) {
               size++;
            }
            e.value = value;
         } else {
            // We add a node in the middle, completing the sequence.
            e.middle = addRec(s, offset + i + 1, length - i - 1, e.middle, value);
         }
      }

      return e;
   }

   /**
    * Returns the number of keys stored in the tree.
    *
    * @return the number of keys.
    */
   public int size() {
      return size;
   }

   /**
    * Replaces the content of this tree with a tree read from the given stream.
    *
    * <p>The stream must have been produced by {@link #write(OutputStream)} using a factory with the
    * same serial version UID. If reading fails, the previous content of the tree is left untouched.
    *
    * @param in the stream to read from.
    * @throws IOException if the header or the serial version UID does not match, or if reading fails.
    */
   public void read(InputStream in) throws IOException {
      readHeader(in);
      final InputBitStream inB = new InputBitStream(in);
      final long serialVersionUID = inB.readLongNibble();
      if (serialVersionUID != factory.getSerialVersionUID()) {
         throw new IOException("serialVersionUID missmatch, read " + serialVersionUID + " exptected " +
               factory.getSerialVersionUID());
      }
      // Parse into locals first, so that a failure halfway through does not leave
      // a new size paired with the old root.
      final int newSize = inB.readNibble();
      factory.readHeader(inB);
      final Node<V> newRoot = readNode(inB);
      size = newSize;
      root = newRoot;
   }

   /**
    * Reads a node and, recursively, its left, middle and right subtrees (in this order).
    *
    * @param in the bit stream to read from.
    * @return the node read, or <code>null</code> if the stream contains the empty-subtree marker
    *         (a path length of zero).
    * @throws IOException if reading fails.
    */
   private Node<V> readNode(InputBitStream in) throws IOException {
      final int len = in.readZeta(LEN_ZK);
      if (len > 0) {
         final char[] path = new char[len];
         for (int i = 0; i < len; i++) {
            // Characters are stored as their distance from '0', see writeNode().
            path[i] = (char) (in.readZeta(CHAR_ZK) + '0');
         }
         final V value;
         // A single bit tells whether a value follows.
         if (in.readBit() == 1) {
            value = factory.newValue();
            value.read(in);
         } else {
            value = null;
         }
         final Node<V> node = new Node<V>(path, value);
         node.left = readNode(in);
         node.middle = readNode(in);
         node.right = readNode(in);
         return node;
      }
      return null;
   }

   /**
    * Reads the header from the given stream and verifies that it equals {@link #HEADER}.
    *
    * @param in the stream to read from.
    * @throws IOException if the stream ends before the header is complete, if the header does not
    *                     match, or if reading fails.
    */
   private static void readHeader(InputStream in) throws IOException {
      final byte[] buf = new byte[HEADER.length];
      // A single read() may legally return fewer bytes than requested,
      // so keep reading until the header is complete or the stream ends.
      int len = 0;
      while (len < buf.length) {
         final int read = in.read(buf, len, buf.length - len);
         if (read < 0) {
            break;
         }
         len += read;
      }
      if (len != buf.length) {
         throw new IOException("Could not read header from stream, got " + len + " bytes expected " + HEADER.length);
      } else if (!Arrays.equals(HEADER, buf)) {
         throw new IOException("Could not read header from stream, got " + Arrays.toString(buf) + " expected " +
               Arrays.toString(HEADER));
      }
   }

   /**
    * Writes this tree to the given stream: the header bytes, the serial version UID of the factory,
    * the size, the factory header and finally the nodes.
    *
    * <p>The bit stream wrapping <code>out</code> is flushed even if writing fails; <code>out</code>
    * itself is not closed.
    *
    * @param out the stream to write to.
    * @throws IOException if writing fails.
    */
   public void write(OutputStream out) throws IOException {
      out.write(HEADER);
      final OutputBitStream outB = new OutputBitStream(out);
      try {
         outB.writeLongNibble(factory.getSerialVersionUID());
         outB.writeNibble(size);
         factory.writeHeader(outB);
         writeNode(root, outB);
      } finally {
         outB.flush();
      }
   }

   /**
    * Writes a node and, recursively, its left, middle and right subtrees (in this order).
    * A <code>null</code> node is written as a path length of zero.
    *
    * @param node the node to write, or <code>null</code> for an empty subtree.
    * @param out  the bit stream to write to.
    * @throws IOException if writing fails.
    */
   private void writeNode(final Node<V> node, final OutputBitStream out) throws IOException {
      if (node == null) {
         out.writeZeta(0, LEN_ZK);
      } else {
         final char[] path = node.path;
         final int len = path.length;
         out.writeZeta(len, LEN_ZK);
         for (int i = 0; i < len; i++) {
            // Characters are stored as their distance from '0'.
            // NOTE(review): this is negative for characters below '0' (e.g. ' ' or '-'), which
            // writeZeta presumably rejects - confirm. The offset is part of the version 1 stream
            // format, so it cannot be changed without bumping VERSION.
            out.writeZeta(path[i] - '0', CHAR_ZK);
         }
         final boolean hasValue = node.value != null;
         out.writeBit(hasValue);
         if (hasValue) {
            node.value.write(out);
         }
         writeNode(node.left, out);
         writeNode(node.middle, out);
         writeNode(node.right, out);
      }
   }

   /**
    * A node of the tree.
    */
   private static final class Node<V extends TreeValue> {
      /**
       * A pointer to the left subtree.
       */
      private Node<V> left;
      /**
       * A pointer to the middle subtree.
       */
      private Node<V> middle;
      /**
       * A pointer to the right subtree.
       */
      private Node<V> right;
      /**
       * The nonempty path compressed at this node.
       */
      private char[] path;
      /**
       * The value of the key terminating at this node, or <code>null</code> if no key with a value
       * terminates here.
       */
      private V value;

      /**
       * Creates a new node containing a path specified by a character-sequence fragment.
       *
       * @param s        a character sequence containing the path of the node.
       * @param offset   the starting character of the path.
       * @param length   the length of the path.
       * @param value    the value of this node.
       */
      public Node(final CharSequence s, final int offset, final int length, final V value) {
         this.value = value;
         path = new char[length];
         MutableString.getChars(s, offset, offset + length, path, 0);
      }

      /**
       * Creates a new node containing a path specified by a character-array fragment.
       * The fragment is copied.
       *
       * @param a        a character array containing the path of the node.
       * @param offset   the starting character of the path.
       * @param length   the length of the path.
       * @param value    the value of this node.
       */
      public Node(final char[] a, final int offset, final int length, final V value) {
         this.value = value;
         path = new char[length];
         System.arraycopy(a, offset, path, 0, length);
      }

      /**
       * Creates a new node using the given character array as its path.
       * The array is not copied.
       *
       * @param path     a character array containing the path of the node.
       * @param value    the value of this node.
       */
      public Node(char[] path, V value) {
         this.path = path;
         this.value = value;
      }

      /**
       * Removes a prefix from the path of this node.
       *
       * @param length the length of the prefix to be removed
       */
      public void removePathPrefix(final int length) {
         final char[] a = new char[path.length - length];
         System.arraycopy(path, length, a, 0, a.length);
         path = a;
      }
   }
}