// Tokenizer.java example
//
// Source tree: ProjectLibre-master
/*
 * file:       Tokenizer.java
 * author:     Jon Iles
 * copyright:  (c) Packwood Software 2002-2003
 * date:       03/01/2003
 */

/*
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 */

package net.sf.mpxj.common;

import java.io.IOException;

/**
 * This class implements a tokenizer based loosely on
 * java.io.StreamTokenizer. This tokenizer is designed to parse records from
 * an MPX file correctly. In particular it will handle empty fields,
 * represented by adjacent field delimiters.
 */
public abstract class Tokenizer
{
   /**
    * This method must be implemented to read the next character from the
    * data source.
    *
    * @return next character, or {@link #TT_EOF} once the end of the data
    *         source has been reached
    * @throws IOException if the data source cannot be read
    */
   protected abstract int read() throws IOException;

   /**
    * This method retrieves the next token and returns a constant representing
    * the type of token found. The text of the token is available from
    * {@link #getToken()} once this method returns.
    * <p>
    * A field is terminated by the delimiter character, by a newline or by
    * the end of the input. When a newline or the end of the input terminates
    * a non-empty field, {@link #TT_WORD} is returned first and the
    * {@link #TT_EOL} or {@link #TT_EOF} marker is delivered by the following
    * call. A field terminated by the delimiter is always reported as
    * {@link #TT_WORD}, even when it is empty, which is how adjacent
    * delimiters are surfaced as empty fields.
    * <p>
    * Within a quoted field the delimiter is treated as ordinary text and a
    * doubled quote character represents a single literal quote. Note that
    * a newline or end of input terminates the field even while quoted.
    *
    * @return token type value: {@link #TT_WORD}, {@link #TT_EOL} or
    *         {@link #TT_EOF}
    * @throws IOException if the data source cannot be read
    */
   public int nextToken() throws IOException
   {
      int c;

      // Character read while looking past a quote that turned out not to be
      // the first half of a doubled quote. A separate flag is used rather
      // than a sentinel value, because the character read ahead may itself
      // be TT_EOF (-1); that EOF must be processed, not read a second time.
      int lookahead = 0;
      boolean hasLookahead = false;

      boolean quoted = false;

      // Deliver any EOL/EOF marker deferred by the previous call.
      int result = m_next;
      m_next = NO_TOKEN;

      m_buffer.setLength(0);

      while (result == NO_TOKEN)
      {
         if (hasLookahead)
         {
            c = lookahead;
            hasLookahead = false;
         }
         else
         {
            c = read();
         }

         switch (c)
         {
            case TT_EOF:
            {
               if (m_buffer.length() != 0)
               {
                  // Report the pending text now, the EOF on the next call.
                  result = TT_WORD;
                  m_next = TT_EOF;
               }
               else
               {
                  result = TT_EOF;
               }
               break;
            }

            case TT_EOL:
            {
               int length = m_buffer.length();

               // Treat "\r\n" as a single line terminator.
               if (length != 0 && m_buffer.charAt(length - 1) == '\r')
               {
                  --length;
                  m_buffer.setLength(length);
               }

               if (length == 0)
               {
                  result = TT_EOL;
               }
               else
               {
                  // Report the pending text now, the EOL on the next call.
                  result = TT_WORD;
                  m_next = TT_EOL;
               }
               break;
            }

            default:
            {
               if (c == m_quote)
               {
                  if (!quoted && startQuotedIsValid(m_buffer))
                  {
                     // Opening quote: not part of the token text.
                     quoted = true;
                  }
                  else if (!quoted)
                  {
                     // Quote in a position where quoting cannot start:
                     // keep it as literal text.
                     m_buffer.append((char) c);
                  }
                  else
                  {
                     // Quote inside quoted text: either the first half of a
                     // doubled (escaped) quote, or the closing quote.
                     int following = read();
                     if (following == m_quote)
                     {
                        m_buffer.append((char) c);
                     }
                     else
                     {
                        quoted = false;
                        lookahead = following;
                        hasLookahead = true;
                     }
                  }
               }
               else if (c == m_delimiter && !quoted)
               {
                  result = TT_WORD;
               }
               else
               {
                  m_buffer.append((char) c);
               }
               break;
            }
         }
      }

      m_type = result;

      return (result);
   }

   /**
    * This method allows us to control the behaviour of the tokenizer for
    * quoted text. Normally quoted text begins with a quote character
    * at the first position within a field. As this method is protected,
    * sub classes can alter this behaviour if required.
    *
    * @param buffer the field contents read so far
    * @return true if it is valid to treat the subsequent text as quoted
    */
   protected boolean startQuotedIsValid(StringBuilder buffer)
   {
      return buffer.length() == 0;
   }

   /**
    * This method retrieves the text of the last token found.
    *
    * @return last token text
    */
   public String getToken()
   {
      return (m_buffer.toString());
   }

   /**
    * This method retrieves the type of the last token found.
    *
    * @return last token type
    */
   public int getType()
   {
      return (m_type);
   }

   /**
    * This method is used to set the delimiter character recognised
    * by the tokenizer.
    *
    * @param delimiter delimiter character
    */
   public void setDelimiter(char delimiter)
   {
      m_delimiter = delimiter;
   }

   /** Token type returned when the end of a line has been reached. */
   public static final int TT_EOL = '\n';

   /** Token type returned when the end of the input has been reached. */
   public static final int TT_EOF = -1;

   /** Token type returned when a field (possibly empty) has been read. */
   public static final int TT_WORD = -3;

   /** Internal marker meaning that no token type has been determined yet. */
   private static final int NO_TOKEN = 0;

   /** Character used to open and close quoted text. */
   private final char m_quote = '"';

   /** Character separating fields; a comma unless changed via setDelimiter. */
   private char m_delimiter = ',';

   /** Token type deferred to the next call of nextToken, or NO_TOKEN. */
   private int m_next;

   /** Type of the token most recently returned by nextToken. */
   private int m_type;

   /** Text of the token currently being assembled. */
   private final StringBuilder m_buffer = new StringBuilder();
}