CSVTokenizer.java example

Explorer
pentaho-reporting-master
/*!
* This program is free software; you can redistribute it and/or modify it under the
* terms of the GNU Lesser General Public License, version 2.1 as published by the Free Software
* Foundation.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* program; if not, you can obtain a copy at http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
* or from the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* Copyright (c) 2002-2016 Pentaho Corporation..  All rights reserved.
*/

package org.pentaho.reporting.libraries.base.util;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.NoSuchElementException;

/**
 * The csv tokenizer class allows an application to break a Comma Separated Value format into tokens. The tokenization
 * method is much simpler than the one used by the <code>StringTokenizer</code> class. The <code>CSVTokenizer</code>
 * methods do not distinguish among identifiers, numbers, and quoted strings, nor do they recognize and skip comments.
 * <p/>
 * The set of separator (the characters that separate tokens) may be specified either at creation time or on a per-token
 * basis.
 * <p/>
 * An instance of <code>CSVTokenizer</code> behaves in one of two ways, depending on whether it was created with the
 * <code>returnSeparators</code> flag having the value <code>true</code> or <code>false</code>: <ul> <li>If the flag is
 * <code>false</code>, delimiter characters serve to separate tokens. A token is a maximal sequence of consecutive
 * characters that are not separator. <li>If the flag is <code>true</code>, delimiter characters are themselves
 * considered to be tokens. A token is thus either one delimiter character, or a maximal sequence of consecutive
 * characters that are not separator. </ul><p> A <tt>CSVTokenizer</tt> object internally maintains a current position
 * within the string to be tokenized. Some operations advance this current position past the characters processed.<p> A
 * token is returned by taking a substring of the string that was used to create the <tt>CSVTokenizer</tt> object.
 * <p/>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
 *     while (csvt.hasMoreTokens()) {
 *         println(csvt.nextToken());
 *     }
 * </pre></blockquote>
 * <p/>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * @author abupon
 */
public class CSVTokenizer implements Enumeration {
  /**
   * The complete record that should be separated into elements.
   */
  private String record;
  /**
   * The separator.
   */
  private String separator;
  /**
   * The quoting char.
   */
  private String quate;

  /**
   * the current parsing position.
   */
  private int currentIndex;

  /**
   * A flag indicating that the current parse position is before the start.
   */
  private boolean beforeStart;

  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_COMMA = ",";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_TAB = "\t";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_SPACE = " ";

  /**
   * A possible quote character constant.
   */
  public static final String DOUBLE_QUATE = "\"";
  /**
   * A possible quote character constant.
   */
  public static final String SINGLE_QUATE = "'";

  /**
   * Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for
   * separating tokens.
   * <p/>
   * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as
   * tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and
   * only serve as separator between tokens.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
   */
  public CSVTokenizer( final String aString,
                       final String theSeparator,
                       final String theQuate ) {
    this( aString, theSeparator, theQuate, true );
  }

  /**
   * Constructs a csv tokenizer for the specified string. <code>theSeparator</code> argument is the separator for
   * separating tokens.
   * <p/>
   * If the <code>returnSeparators</code> flag is <code>true</code>, then the separator string is also returned as
   * tokens. separator is returned as a string. If the flag is <code>false</code>, the separator string is skipped and
   * only serve as separator between tokens.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.).
   */
  public CSVTokenizer( final String aString,
                       final String theSeparator,
                       final String theQuate,
                       final boolean trim ) {
    if ( aString == null ) {
      throw new NullPointerException( "The given string is null" );
    }
    if ( theSeparator == null ) {
      throw new NullPointerException( "The given separator is null" );
    }
    if ( trim ) {
      this.record = aString.trim();
    } else {
      this.record = aString;
    }
    this.separator = theSeparator;
    this.quate = theQuate;
    this.currentIndex = 0;
    this.beforeStart = true;
  }

  /**
   * Constructs a csv tokenizer for the specified string. The characters in the <code>theSeparator</code> argument are
   * the separator for separating tokens. Separator string themselves will not be treated as tokens.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.TAB, CSVTokenizer.SPACE, etc.).
   */
  public CSVTokenizer( final String aString, final String theSeparator ) {
    this( aString, theSeparator, CSVTokenizer.DOUBLE_QUATE );
  }

  /**
   * Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is
   * <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.
   *
   * @param aString a string to be parsed.
   */
  public CSVTokenizer( final String aString ) {
    this( aString, CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.DOUBLE_QUATE, true );
  }

  /**
   * Constructs a string tokenizer for the specified string. The tokenizer uses the default separator set, which is
   * <code>CSVTokenizer.SEPARATOR_COMMA</code>. Separator string themselves will not be treated as tokens.
   *
   * @param aString a string to be parsed.
   */
  public CSVTokenizer( final String aString, final boolean trim ) {
    this( aString, CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.DOUBLE_QUATE, trim );
  }

  /**
   * Tests if there are more tokens available from this tokenizer's string. If this method returns <tt>true</tt>, then a
   * subsequent call to <tt>nextToken</tt> with no argument will successfully return a token.
   *
   * @return <code>true</code> if and only if there is at least one token in the string after the current position;
   * <code>false</code> otherwise.
   */
  public boolean hasMoreTokens() {
    return ( ( this.currentIndex < this.record.length() )
        && ( this.separator.length() > 0 ) );
  }

  /**
   * Returns the next token from this string tokenizer.
   *
   * @return the next token from this string tokenizer.
   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's string.
   * @throws IllegalArgumentException if given parameter string format was wrong
   */
  public String nextToken()
    throws NoSuchElementException, IllegalArgumentException {

    if ( !this.hasMoreTokens() ) {
      throw new NoSuchElementException();
    }

    if ( beforeStart == false ) {
      currentIndex += this.separator.length();
    } else {
      beforeStart = false;
    }

    if ( this.quate != null
        && this.record.startsWith( this.quate, this.currentIndex ) ) {
      final int quateLength = this.quate.length();
      int startOffset = this.currentIndex + quateLength;
      final StringBuilder b = new StringBuilder();
      while ( true ) {
        final int end = record.indexOf( this.quate, startOffset );
        if ( end < 0 ) {
          throw new IllegalArgumentException( "Illegal format" );
        }

        if ( record.startsWith( this.quate, end + 1 ) == false ) {
          b.append( record.substring( startOffset, end ) );
          currentIndex = end + 1;
          return b.toString();
        } else {
          b.append( record.substring( startOffset, end + 1 ) );
        }

        startOffset = end + quateLength * 2;
      }
    }

    final int end = this.record.indexOf( this.separator, this.currentIndex );
    if ( end >= 0 ) {
      final int start = this.currentIndex;
      final String token = this.record.substring( start, end );
      this.currentIndex = end;
      return token;
    } else {
      final int start = this.currentIndex;
      final String token = this.record.substring( start );
      this.currentIndex = this.record.length();
      return token;
    }
  }

  /**
   * Returns the next token in this string tokenizer's string. First, the set of characters considered to be separator
   * by this <tt>CSVTokenizer</tt> object is changed to be the characters in the string <tt>separator</tt>. Then the
   * next token in the string after the current position is returned. The current position is advanced beyond the
   * recognized token.  The new delimiter set remains the default after this call.
   *
   * @param theSeparator the new separator.
   * @return the next token, after switching to the new delimiter set.
   * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
   */
  public String nextToken( final String theSeparator ) {
    separator = theSeparator;
    return nextToken();
  }

  /**
   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
   * @see Enumeration
   * @see CSVTokenizer#hasMoreTokens()
   */
  public boolean hasMoreElements() {
    return hasMoreTokens();
  }

  /**
   * Returns the same value as the <code>nextToken</code> method, except that its declared return value is
   * <code>Object</code> rather than <code>String</code>. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return the next token in the string.
   * @throws NoSuchElementException if there are no more tokens in this tokenizer's string.
   * @see Enumeration
   * @see CSVTokenizer#nextToken()
   */
  public Object nextElement() {
    return nextToken();
  }

  /**
   * Calculates the number of times that this tokenizer's <code>nextToken</code> method can be called before it
   * generates an exception. The current position is not advanced.
   *
   * @return the number of tokens remaining in the string using the current delimiter set.
   * @see CSVTokenizer#nextToken()
   */
  public int countTokens() {
    int count = 0;

    final int preserve = this.currentIndex;
    final boolean preserveStart = this.beforeStart;
    while ( this.hasMoreTokens() ) {
      this.nextToken();
      count++;
    }
    this.currentIndex = preserve;
    this.beforeStart = preserveStart;

    return count;
  }

  /**
   * Returns the quate.
   *
   * @return char
   */
  public String getQuate() {
    return this.quate;
  }

  /**
   * Sets the quate.
   *
   * @param quate The quate to set
   */
  public void setQuate( final String quate ) {
    this.quate = quate;
  }

  public String[] toArray() {
    final ArrayList<String> retval = new ArrayList<String>( 20 );
    while ( hasMoreElements() ) {
      retval.add( nextToken() );
    }
    return (String[]) retval.toArray();
  }
}