/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.util; import java.util.ArrayList; import java.util.List; import com.addthis.basis.util.LessStrings; import com.addthis.codec.annotations.FieldConfig; import com.addthis.codec.codables.Codable; /** * Configurable tokenizer that splits input strings into fields. * <p/> * <p>The separator is used to split the input string into one or more tokens. The * group parameter can be used to combine strings that would otherwise be split * across tokens. For example: if the separator parameter is "," and the group * parameter is the single quote character (') then the input string "a,b,'c,d',e" * produces the four tokens "a", "b", "c,d", and "e". * * @user-reference */ public class Tokenizer implements Codable { /** * This string is the deliminator between two fields in the input string. Default is ",". */ @FieldConfig(codable = true) private String separator = ","; /** * Any of these strings can be used to combine strings that would otherwise be split across tokens. */ @FieldConfig(codable = true) private String[] group; /** * If true then pack all the tokens into a single field. Default is false. */ @FieldConfig(codable = true) private boolean pack; /** * Extract the first character of this string and use it as an escape character. Default is "\". */ @FieldConfig(codable = true) private String escape = "\\"; private int maxColCount; private String quoteOpen; private String quoteClose; private boolean isInitialized; private char esc = '\\'; public Tokenizer() { } public Tokenizer(String separator, String[] group, boolean pack) { this.separator = separator; this.group = group; this.pack = pack; initialize(); } public Tokenizer setEscape(String escape) { this.escape = escape; return this; } public Tokenizer setSeparator(String sep) { this.separator = sep; return this; } public Tokenizer setGrouping(String[] group) { this.group = group; return this; } public Tokenizer setPacking(boolean pack) { this.pack = pack; return this; } public Tokenizer initialize() { if (separator == null) { throw new RuntimeException("separator not set"); } if (group != null) { StringBuilder open = new StringBuilder(); StringBuilder close = new StringBuilder(); for (String q : group) { if (q.length() == 1) { open.append(q.charAt(0)); close.append(q.charAt(0)); } else if (q.length() == 2) { open.append(q.charAt(0)); close.append(q.charAt(1)); } else { throw new RuntimeException("invalid match " + q); } } quoteOpen = open.toString(); quoteClose = close.toString(); } esc = escape.charAt(0); isInitialized = true; return this; } /** * @return the separator string (used to split fields) */ public String getSeparator() { return separator; } /** * @return the grouping strings (used to "quote" text that might contain the * separator) */ public String[] getGrouping() { return group; } /** * @return will all fields be packed into one? */ public boolean isPacked() { return pack; } /** * @return filtered line */ public String filterLine(String line) { return line; } /** * @return filtered value */ public String filterValue(String value) { return value; } /** * @return splits the supplied string and returns the parts */ public List<String> tokenize(String line) { if (!isInitialized) { initialize(); } if (line == null || LessStrings.isEmpty(line.trim())) { return null; } line = filterLine(line); if (line == null) { return null; } List<String> ret = new ArrayList<>(maxColCount); int inGroup = -1; boolean isEscaped = false; boolean isSep = false; int pos = 0; StringBuilder sb = new StringBuilder(); while (true) { boolean eol = pos == line.length(); if ((isSep && inGroup < 0) || eol) { if (sb.length() > 0 || !pack) { ret.add(filterValue(sb.toString())); sb = new StringBuilder(); if (isSep && eol && !pack) { ret.add(filterValue(sb.toString())); } } } if (eol) { break; } char ch = line.charAt(pos++); if (isEscaped) { sb.append(ch); isEscaped = false; continue; } if (ch == esc) { isEscaped = true; isSep = false; continue; } // check for group close if (inGroup >= 0) { if (ch == quoteClose.charAt(inGroup)) { inGroup = -1; } else { sb.append(ch); } continue; } else if (group != null) { // check for group open int qspos = quoteOpen.indexOf(ch); if (qspos >= 0) { isSep = false; inGroup = qspos; continue; } } // check for separator if (isSep = (separator.indexOf(ch) >= 0)) { continue; } sb.append(ch); } maxColCount = Math.max(maxColCount, ret.size()); return ret; } }