/*
* Copyright 2006-2014 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.batch.item.file.transform;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.util.Assert;
import org.springframework.util.StringUtils;
/**
* A {@link LineTokenizer} implementation that splits the input String on a
* configurable delimiter. This implementation also supports the use of an
* escape character to escape delimiters and line endings.
*
* @author Rob Harrop
* @author Dave Syer
* @author Michael Minella
*/
public class DelimitedLineTokenizer extends AbstractLineTokenizer
implements InitializingBean {
/**
* Convenient constant for the common case of a tab delimiter.
*/
public static final String DELIMITER_TAB = "\t";
/**
* Convenient constant for the common case of a comma delimiter.
*/
public static final String DELIMITER_COMMA = ",";
/**
* Convenient constant for the common case of a " character used to escape
* delimiters or line endings.
*/
public static final char DEFAULT_QUOTE_CHARACTER = '"';
// the delimiter character used when reading input.
private String delimiter;
private char quoteCharacter = DEFAULT_QUOTE_CHARACTER;
private String quoteString;
private String escapedQuoteString;
private Collection<Integer> includedFields = null;
/**
* Create a new instance of the {@link DelimitedLineTokenizer} class for the
* common case where the delimiter is a {@link #DELIMITER_COMMA comma}.
*
* @see #DelimitedLineTokenizer(String)
* @see #DELIMITER_COMMA
*/
public DelimitedLineTokenizer() {
this(DELIMITER_COMMA);
}
/**
* Create a new instance of the {@link DelimitedLineTokenizer} class.
*
* @param delimiter the desired delimiter. This is required
*/
public DelimitedLineTokenizer(String delimiter) {
Assert.notNull(delimiter, "A delimiter is required");
Assert.state(!delimiter.equals(String.valueOf(DEFAULT_QUOTE_CHARACTER)), "[" + DEFAULT_QUOTE_CHARACTER
+ "] is not allowed as delimiter for tokenizers.");
this.delimiter = delimiter;
setQuoteCharacter(DEFAULT_QUOTE_CHARACTER);
}
/**
* Setter for the delimiter character.
*
* @param delimiter the String used as a delimiter
*/
public void setDelimiter(String delimiter) {
this.delimiter = delimiter;
}
/**
* The fields to include in the output by position (starting at 0). By
* default all fields are included, but this property can be set to pick out
* only a few fields from a larger set. Note that if field names are
* provided, their number must match the number of included fields.
*
* @param includedFields the included fields to set
*/
public void setIncludedFields(int[] includedFields) {
this.includedFields = new HashSet<>();
for (int i : includedFields) {
this.includedFields.add(i);
}
}
/**
* Public setter for the quoteCharacter. The quote character can be used to
* extend a field across line endings or to enclose a String which contains
* the delimiter. Inside a quoted token the quote character can be used to
* escape itself, thus "a""b""c" is tokenized to a"b"c.
*
* @param quoteCharacter the quoteCharacter to set
*
* @see #DEFAULT_QUOTE_CHARACTER
*/
public void setQuoteCharacter(char quoteCharacter) {
this.quoteCharacter = quoteCharacter;
this.quoteString = "" + quoteCharacter;
this.escapedQuoteString = "" + quoteCharacter + quoteCharacter;
}
/**
* Yields the tokens resulting from the splitting of the supplied
* <code>line</code>.
*
* @param line the line to be tokenized
*
* @return the resulting tokens
*/
@Override
protected List<String> doTokenize(String line) {
List<String> tokens = new ArrayList<>();
// line is never null in current implementation
// line is checked in parent: AbstractLineTokenizer.tokenize()
char[] chars = line.toCharArray();
boolean inQuoted = false;
int lastCut = 0;
int length = chars.length;
int fieldCount = 0;
int endIndexLastDelimiter = -1;
for (int i = 0; i < length; i++) {
char currentChar = chars[i];
boolean isEnd = (i == (length - 1));
boolean isDelimiter = endsWithDelimiter(chars, i, endIndexLastDelimiter);
if ((isDelimiter && !inQuoted) || isEnd) {
endIndexLastDelimiter = i;
int endPosition = (isEnd ? (length - lastCut) : (i - lastCut));
if (isEnd && isDelimiter) {
endPosition = endPosition - delimiter.length();
}
else if (!isEnd){
endPosition = (endPosition - delimiter.length()) + 1;
}
if (includedFields == null || includedFields.contains(fieldCount)) {
String value =
substringWithTrimmedWhitespaceAndQuotesIfQuotesPresent(chars, lastCut, endPosition);
tokens.add(value);
}
fieldCount++;
if (isEnd && (isDelimiter)) {
if (includedFields == null || includedFields.contains(fieldCount)) {
tokens.add("");
}
fieldCount++;
}
lastCut = i + 1;
}
else if (isQuoteCharacter(currentChar)) {
inQuoted = !inQuoted;
}
}
return tokens;
}
/**
* Trim and leading or trailing quotes (and any leading or trailing
* whitespace before or after the quotes) from within the specified character
* array beginning at the specified offset index for the specified count.
* <p/>
* Quotes are escaped with double instances of the quote character.
*
* @param chars the character array
* @param offset index from which to begin extracting substring
* @param count length of substring
* @return a substring from the specified offset within the character array
* with any leading or trailing whitespace trimmed.
* @see String#trim()
*/
private String substringWithTrimmedWhitespaceAndQuotesIfQuotesPresent(char chars[], int offset, int count) {
int start = offset;
int len = count;
while ((start < (start + len)) && (chars[start] <= ' ')) {
start++;
len--;
}
while ((start < (start + len)) && ((start + len - 1 < chars.length) && (chars[start + len - 1] <= ' '))) {
len--;
}
String value;
if ((chars.length > 2) && (chars[start] == quoteCharacter) && (chars[start + len - 1] == quoteCharacter)) {
value = new String(chars, start + 1, len - 2);
if (value.contains(escapedQuoteString)) {
value = StringUtils.replace(value, escapedQuoteString, quoteString);
}
}
else {
value = new String(chars, offset, count);
}
return value;
}
/**
* Do the character(s) in the specified array end, at the specified end
* index, with the delimiter character(s)?
* <p/>
* Checks that the specified end index is sufficiently greater than the
* specified previous delimiter end index to warrant trying to match
* another delimiter. Also checks that the specified end index is
* sufficiently large to be able to match the length of a delimiter.
*
* @param chars the character array
* @param end the index in up to which the delimiter should be matched
* @param previous the index of the end of the last delimiter
* @return <code>true</code> if the character(s) from the specified end
* match the delimiter character(s), otherwise false
* @see DelimitedLineTokenizer#DelimitedLineTokenizer(String)
*/
private boolean endsWithDelimiter(char[] chars, int end, int previous) {
boolean result = false;
if (end - previous >= delimiter.length()) {
if (end >= delimiter.length() - 1) {
result = true;
for (int j = 0; j < delimiter.length() && (((end - delimiter.length() + 1) + j) < chars.length); j++) {
if (delimiter.charAt(j) != chars[(end - delimiter.length() + 1) + j]) {
result = false;
}
}
}
}
return result;
}
/**
* Is the supplied character a quote character?
*
* @param c the character to be checked
* @return <code>true</code> if the supplied character is an quote character
* @see #setQuoteCharacter(char)
*/
protected boolean isQuoteCharacter(char c) {
return c == quoteCharacter;
}
@Override
public void afterPropertiesSet() throws Exception {
Assert.hasLength(this.delimiter, "A delimiter is required");
}
}