// DelimitedLineTokenizer.java example
//
// Explorer
// spring-batch-master
/*
 * Copyright 2006-2014 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.springframework.batch.item.file.transform;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

import org.springframework.beans.factory.InitializingBean;
import org.springframework.util.Assert;
import org.springframework.util.StringUtils;

/**
 * A {@link LineTokenizer} implementation that splits the input String on a
 * configurable delimiter. The delimiter may be longer than one character. A
 * quote character (by default {@link #DEFAULT_QUOTE_CHARACTER}) can be used to
 * enclose a field that itself contains the delimiter; inside a quoted field a
 * doubled quote character stands for a single literal quote.
 *
 * @author Rob Harrop
 * @author Dave Syer
 * @author Michael Minella
 */
public class DelimitedLineTokenizer extends AbstractLineTokenizer
	implements InitializingBean {
	/**
	 * Convenient constant for the common case of a tab delimiter.
	 */
	public static final String DELIMITER_TAB = "\t";

	/**
	 * Convenient constant for the common case of a comma delimiter.
	 */
	public static final String DELIMITER_COMMA = ",";

	/**
	 * Convenient constant for the common case of a " character used to escape
	 * delimiters or line endings.
	 */
	public static final char DEFAULT_QUOTE_CHARACTER = '"';

	// the delimiter string used when reading input (may be multi-character).
	private String delimiter;

	private char quoteCharacter = DEFAULT_QUOTE_CHARACTER;

	// quoteCharacter as a one-character String; kept in sync by setQuoteCharacter
	private String quoteString;

	// two consecutive quote characters: the escaped form of a literal quote
	private String escapedQuoteString;

	// positions of the fields to emit; null means "emit every field"
	private Collection<Integer> includedFields = null;

	/**
	 * Create a new instance of the {@link DelimitedLineTokenizer} class for the
	 * common case where the delimiter is a {@link #DELIMITER_COMMA comma}.
	 *
	 * @see #DelimitedLineTokenizer(String)
	 * @see #DELIMITER_COMMA
	 */
	public DelimitedLineTokenizer() {
		this(DELIMITER_COMMA);
	}

	/**
	 * Create a new instance of the {@link DelimitedLineTokenizer} class.
	 *
	 * @param delimiter the desired delimiter.  This is required and must not
	 * be equal to the {@link #DEFAULT_QUOTE_CHARACTER default quote character}
	 */
	public DelimitedLineTokenizer(String delimiter) {
		Assert.notNull(delimiter, "A delimiter is required");
		Assert.state(!delimiter.equals(String.valueOf(DEFAULT_QUOTE_CHARACTER)), "[" + DEFAULT_QUOTE_CHARACTER
				+ "] is not allowed as delimiter for tokenizers.");

		this.delimiter = delimiter;
		setQuoteCharacter(DEFAULT_QUOTE_CHARACTER);
	}

	/**
	 * Setter for the delimiter. The value is not validated here; it is checked
	 * in {@link #afterPropertiesSet()}.
	 *
	 * @param delimiter the String used as a delimiter
	 */
	public void setDelimiter(String delimiter) {
		this.delimiter = delimiter;
	}

	/**
	 * The fields to include in the output by position (starting at 0). By
	 * default all fields are included, but this property can be set to pick out
	 * only a few fields from a larger set. Note that if field names are
	 * provided, their number must match the number of included fields.
	 *
	 * @param includedFields the included fields to set
	 */
	public void setIncludedFields(int... includedFields) {
		this.includedFields = new HashSet<>();
		for (int i : includedFields) {
			this.includedFields.add(i);
		}
	}

	/**
	 * Public setter for the quoteCharacter. The quote character can be used to
	 * extend a field across line endings or to enclose a String which contains
	 * the delimiter. Inside a quoted token the quote character can be used to
	 * escape itself, thus "a""b""c" is tokenized to a"b"c.
	 *
	 * @param quoteCharacter the quoteCharacter to set
	 *
	 * @see #DEFAULT_QUOTE_CHARACTER
	 */
	public void setQuoteCharacter(char quoteCharacter) {
		this.quoteCharacter = quoteCharacter;
		this.quoteString = "" + quoteCharacter;
		this.escapedQuoteString = "" + quoteCharacter + quoteCharacter;
	}

	/**
	 * Yields the tokens resulting from the splitting of the supplied
	 * <code>line</code>.
	 *
	 * @param line the line to be tokenized
	 *
	 * @return the resulting tokens
	 */
	@Override
	protected List<String> doTokenize(String line) {

		List<String> tokens = new ArrayList<>();

		// line is never null in current implementation
		// line is checked in parent: AbstractLineTokenizer.tokenize()
		char[] chars = line.toCharArray();
		boolean inQuoted = false;
		// index of the first character of the field currently being scanned
		int lastCut = 0;
		int length = chars.length;
		// position of the current field, counting excluded fields as well
		int fieldCount = 0;
		// index at which the previous cut happened; stops a multi-character
		// delimiter from being matched against characters already consumed
		int endIndexLastDelimiter = -1;

		for (int i = 0; i < length; i++) {
			char currentChar = chars[i];
			boolean isEnd = (i == (length - 1));

			boolean isDelimiter = endsWithDelimiter(chars, i, endIndexLastDelimiter);

			// cut a field at an unquoted delimiter, and always at the last character
			if ((isDelimiter && !inQuoted) || isEnd) {
				endIndexLastDelimiter = i;
				// despite its name this is the LENGTH of the field starting at lastCut
				int endPosition = (isEnd ? (length - lastCut) : (i - lastCut));

				if (isEnd && isDelimiter) {
					// the line ends with a delimiter: exclude it from the field
					endPosition = endPosition - delimiter.length();
				}
				else if (!isEnd) {
					// i is the LAST character of the delimiter: drop the whole delimiter
					endPosition = (endPosition - delimiter.length()) + 1;
				}

				if (includedFields == null || includedFields.contains(fieldCount)) {
					String value =
							substringWithTrimmedWhitespaceAndQuotesIfQuotesPresent(chars, lastCut, endPosition);
					tokens.add(value);
				}

				fieldCount++;

				// a trailing delimiter is followed by one more, empty, field
				if (isEnd && isDelimiter) {
					if (includedFields == null || includedFields.contains(fieldCount)) {
						tokens.add("");
					}
					fieldCount++;
				}

				lastCut = i + 1;
			}
			else if (isQuoteCharacter(currentChar)) {
				inQuoted = !inQuoted;
			}

		}

		return tokens;
	}

	/**
	 * Extract a field from the specified character array. If, after ignoring
	 * leading and trailing whitespace, the field both starts and ends with the
	 * quote character, the surrounding whitespace and quotes are removed and
	 * escaped (doubled) quotes inside are replaced by a single quote.
	 * Otherwise the field is returned exactly as it appears in the array,
	 * including any surrounding whitespace.
	 * <p>
	 * A field consisting only of whitespace, or of a single quote character,
	 * is not treated as quoted and is returned unchanged.
	 *
	 * @param chars  the character array
	 * @param offset index from which to begin extracting substring
	 * @param count  length of substring
	 * @return the unquoted content if the field is quoted, otherwise the raw
	 * substring of the given offset and count
	 * @see String#trim()
	 */
	private String substringWithTrimmedWhitespaceAndQuotesIfQuotesPresent(char[] chars, int offset, int count) {
		int start = offset;
		int len = count;

		// skip leading whitespace, never stepping past the end of the field
		while (len > 0 && chars[start] <= ' ') {
			start++;
			len--;
		}

		// skip trailing whitespace
		while (len > 0 && chars[start + len - 1] <= ' ') {
			len--;
		}

		// An opening AND a closing quote need at least two characters. Checking
		// the trimmed field length (rather than the line length) avoids reading
		// past the array on an all-whitespace field and building a String of
		// negative length from a lone quote character.
		if (len >= 2 && chars[start] == quoteCharacter && chars[start + len - 1] == quoteCharacter) {
			String value = new String(chars, start + 1, len - 2);
			if (value.contains(escapedQuoteString)) {
				value = StringUtils.replace(value, escapedQuoteString, quoteString);
			}
			return value;
		}

		// not quoted: hand back the field untouched
		return new String(chars, offset, count);
	}

	/**
	 * Do the character(s) in the specified array end, at the specified end
	 * index, with the delimiter character(s)?
	 * <p>
	 * Checks that the specified end index is sufficiently greater than the
	 * specified previous delimiter end index to warrant trying to match
	 * another delimiter.  Also checks that the specified end index is
	 * sufficiently large to be able to match the length of a delimiter.
	 *
	 * @param chars    the character array
	 * @param end      the index up to which the delimiter should be matched
	 * @param previous the index of the end of the last delimiter
	 * @return <code>true</code> if the character(s) ending at the specified
	 * end index match the delimiter character(s), otherwise false
	 * @see DelimitedLineTokenizer#DelimitedLineTokenizer(String)
	 */
	private boolean endsWithDelimiter(char[] chars, int end, int previous) {
		int delimiterLength = delimiter.length();

		// not enough characters since the previous cut, or since the start of
		// the line, to hold a complete delimiter
		if (end - previous < delimiterLength || end < delimiterLength - 1) {
			return false;
		}

		int begin = end - delimiterLength + 1;
		for (int j = 0; j < delimiterLength; j++) {
			if (delimiter.charAt(j) != chars[begin + j]) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Is the supplied character a quote character?
	 *
	 * @param c the character to be checked
	 * @return <code>true</code> if the supplied character is a quote character
	 * @see #setQuoteCharacter(char)
	 */
	protected boolean isQuoteCharacter(char c) {
		return c == quoteCharacter;
	}

	/**
	 * Validates the configuration once all properties have been set: a
	 * non-empty delimiter is required, and it must differ from the configured
	 * quote character (the setters do not enforce this themselves).
	 *
	 * @throws IllegalArgumentException if the delimiter is null or empty
	 * @throws IllegalStateException if the delimiter equals the quote character
	 */
	@Override
	public void afterPropertiesSet() throws Exception {
		Assert.hasLength(this.delimiter, "A delimiter is required");
		Assert.state(!this.delimiter.equals(String.valueOf(this.quoteCharacter)), "[" + this.quoteCharacter
				+ "] is not allowed as delimiter for tokenizers.");
	}
}