/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.data.util;
import javax.annotation.Nonnull;
import java.util.List;
/**
 * Splits a string into plain text and delimited regions.
 *
 * <p>The following regions are recognized while scanning plain text:
 * <ul>
 * <li>single-line comments: opened by {@code //}, closed by a newline</li>
 * <li>multi-line comments: opened by {@code /&#42;}, closed by {@code &#42;/}</li>
 * <li>parameters: opened by {@code %[}, closed by {@code ]%}</li>
 * <li>double-quoted and single-quoted strings</li>
 * </ul>
 *
 * <p>A quote character that is immediately preceded by a backslash is never
 * treated as a delimiter. The check looks at exactly one preceding character,
 * so a quote following an escaped backslash is still considered escaped.
 * While inside a region only that region's terminator is searched for;
 * opening delimiters of other regions are part of the region's content.
 *
 * <p>Instances keep mutable scan state in fields without any synchronization,
 * so a single instance must not be used by several threads at the same time.
 */
public final class CommentTokenizer {

    /** Marks a cached delimiter position that has not been searched for yet. */
    private static final int NOT_SEARCHED = -1;

    /*
     * Cached position of the next opening delimiter of each kind. A value
     * smaller than {@link #offset} is stale and is recomputed on demand;
     * a value equal to {@link #length} means "no further occurrence".
     */
    private int oneLineComment;
    private int multiLineComment;
    private int parameter;
    private int doubleQuote;
    private int singleQuote;

    /** Index of the first character of the input that has not been consumed yet. */
    private int offset;

    @Nonnull
    private final String input;
    private final int length;

    /**
     * Creates a tokenizer for the supplied text.
     *
     * @param input text to split; must not be null
     */
    public CommentTokenizer(@Nonnull String input) {
        this.input = input;
        this.length = input.length();
        reset();
    }

    /** The region the scanner is currently inside of. */
    private enum TokenizerMode {
        NEUTRAL, BEGIN_SINGLE_QUOTE, BEGIN_DOUBLE_QUOTE, BEGIN_PARAMETER,
        BEGIN_SINGLELINE_COMMENT, BEGIN_MULTILINE_COMMENT
    }

    /**
     * Convert -1 values returned by {@link String#indexOf(int)}
     * into the string length.
     *
     * @param offset return value of {@link String#indexOf(int)}
     * @param length string length
     * @return converted offset
     */
    private static int convertMissing(int offset, int length) {
        return (offset == -1) ? length : offset;
    }

    /** Returns the smallest of the five arguments. */
    private static int min(int a, int b, int c, int d, int e) {
        return Math.min(Math.min(Math.min(a, b), Math.min(c, d)), e);
    }

    /** Rewinds the scan to the start of the input and invalidates all cached positions. */
    private void reset() {
        offset = 0;
        oneLineComment = NOT_SEARCHED;
        multiLineComment = NOT_SEARCHED;
        parameter = NOT_SEARCHED;
        doubleQuote = NOT_SEARCHED;
        singleQuote = NOT_SEARCHED;
    }

    /**
     * Finds the first occurrence of {@code search} at or after {@code from}
     * that is not immediately preceded by a backslash.
     *
     * @return index of the occurrence, or -1 if there is none
     */
    private int indexOfUnescaped(String search, int from) {
        int position = input.indexOf(search, from);
        while (position > 0 && input.charAt(position - 1) == '\\') {
            position = input.indexOf(search, position + 1);
        }
        return position;
    }

    /**
     * Refreshes a cached position if the scan has moved past it.
     *
     * @return next occurrence of {@code search} at or after the current
     *         offset, or the input length if there is none
     */
    private int nonEscapingUpdate(int position, String search) {
        if (position < offset) {
            position = convertMissing(input.indexOf(search, offset), length);
        }
        return position;
    }

    /**
     * Same as {@link #nonEscapingUpdate(int, String)} but skips occurrences
     * that are immediately preceded by a backslash.
     */
    private int escapingUpdate(int position, String search) {
        if (position < offset) {
            position = convertMissing(indexOfUnescaped(search, offset), length);
        }
        return position;
    }

    /** Brings every cached opening-delimiter position up to date with the current offset. */
    private void updatePositions() {
        oneLineComment = nonEscapingUpdate(oneLineComment, "//");
        multiLineComment = nonEscapingUpdate(multiLineComment, "/*");
        parameter = nonEscapingUpdate(parameter, "%[");
        doubleQuote = escapingUpdate(doubleQuote, "\"");
        singleQuote = escapingUpdate(singleQuote, "'");
    }

    /**
     * Emits the plain text in front of an opening delimiter together with the
     * delimiter itself and moves the offset behind the delimiter.
     *
     * @return {@code next}, the mode to continue scanning in
     */
    private TokenizerMode openRegion(String delimiter, int position, TokenizerMode next,
            List<String> content, List<String> delimiters) {
        delimiters.add(delimiter);
        content.add(input.substring(offset, position));
        offset = position + delimiter.length();
        return next;
    }

    /**
     * Emits the content of the current region together with its terminator
     * and moves the offset behind the terminator. If the terminator is missing
     * the rest of the input becomes the content and the empty string is
     * emitted as delimiter.
     *
     * @param terminator closing delimiter of the region
     * @param escapable  whether a terminator preceded by a backslash is skipped
     */
    private void closeRegion(String terminator, boolean escapable,
            List<String> content, List<String> delimiters) {
        int found = escapable ? indexOfUnescaped(terminator, offset) : input.indexOf(terminator, offset);
        int end = convertMissing(found, length);
        content.add(input.substring(offset, end));
        delimiters.add((end == length) ? "" : terminator);
        // May step past the end of the input; the caller's loop only tests offset < length.
        offset = end + terminator.length();
    }

    /**
     * Handles one step outside of any region: emits the plain text up to the
     * nearest opening delimiter, or the remaining input if there is none.
     *
     * @return the mode to continue scanning in
     */
    private TokenizerMode scanNeutral(List<String> content, List<String> delimiters) {
        updatePositions();
        int smallest = min(oneLineComment, multiLineComment, parameter, doubleQuote, singleQuote);
        if (smallest == length) {
            delimiters.add("");
            content.add(input.substring(offset));
            offset = length;
            return TokenizerMode.NEUTRAL;
        }
        if (smallest == doubleQuote) {
            return openRegion("\"", doubleQuote, TokenizerMode.BEGIN_DOUBLE_QUOTE, content, delimiters);
        }
        if (smallest == singleQuote) {
            return openRegion("'", singleQuote, TokenizerMode.BEGIN_SINGLE_QUOTE, content, delimiters);
        }
        if (smallest == oneLineComment) {
            return openRegion("//", oneLineComment, TokenizerMode.BEGIN_SINGLELINE_COMMENT, content, delimiters);
        }
        if (smallest == multiLineComment) {
            return openRegion("/*", multiLineComment, TokenizerMode.BEGIN_MULTILINE_COMMENT, content, delimiters);
        }
        if (smallest == parameter) {
            return openRegion("%[", parameter, TokenizerMode.BEGIN_PARAMETER, content, delimiters);
        }
        throw new IllegalStateException("no delimiter found at computed position " + smallest);
    }

    /**
     * Splits the input and appends the parts to the supplied lists.
     *
     * <p>Both lists receive the same number of elements. Element {@code i} of
     * {@code content} is the text that directly precedes element {@code i} of
     * {@code delimiters}, so concatenating the elements alternately, starting
     * with {@code content}, reproduces the input. The delimiter is the empty
     * string where the input ends without one. For an empty input a single
     * empty string is appended to each list.
     *
     * <p>The scan always starts at the beginning of the input, so this method
     * may be called repeatedly on the same instance.
     *
     * @param content    receives the text between delimiters; expected to be
     *                   non-null and empty (checked with {@code assert} only)
     * @param delimiters receives the delimiters; expected to be non-null and
     *                   empty (checked with {@code assert} only)
     */
    public void tokenize(List<String> content, List<String> delimiters) {
        assert (content != null && content.isEmpty());
        assert (delimiters != null && delimiters.isEmpty());
        // Start over so that a previous run on this instance does not leave the scan exhausted.
        reset();
        if (length == 0) {
            delimiters.add("");
            content.add("");
            return;
        }
        TokenizerMode mode = TokenizerMode.NEUTRAL;
        while (offset < length) {
            switch (mode) {
                case NEUTRAL:
                    mode = scanNeutral(content, delimiters);
                    break;
                case BEGIN_SINGLE_QUOTE:
                    closeRegion("'", true, content, delimiters);
                    mode = TokenizerMode.NEUTRAL;
                    break;
                case BEGIN_DOUBLE_QUOTE:
                    closeRegion("\"", true, content, delimiters);
                    mode = TokenizerMode.NEUTRAL;
                    break;
                case BEGIN_SINGLELINE_COMMENT:
                    closeRegion("\n", false, content, delimiters);
                    mode = TokenizerMode.NEUTRAL;
                    break;
                case BEGIN_MULTILINE_COMMENT:
                    closeRegion("*/", false, content, delimiters);
                    mode = TokenizerMode.NEUTRAL;
                    break;
                case BEGIN_PARAMETER:
                    closeRegion("]%", false, content, delimiters);
                    mode = TokenizerMode.NEUTRAL;
                    break;
            }
        }
    }
}