/****************************************************************************** * * Copyright 2014 Paphus Solutions Inc. * * Licensed under the Eclipse Public License, Version 1.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.eclipse.org/legal/epl-v10.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.botlibre.util; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; /** * Provides a useful text stream class. */ public class TextStream { public static final String WHITESPACE =" \t\n\r\f"; public static final String HTTP =" \t\n\r\f\"{}<>"; public static final String TOKENS =" \t\n\r\f.,:;!()?[]{}+=^&*\"`~|/\\<>"; public static final String TERMINATORS =".?!。"; public static Set<String> ABBREVIATIONS = new HashSet<String>(Arrays.asList(new String[]{"mr","ms", "mrs", "dr", "inc", "sr", "jr", "st", "vs", "mt", "ltd", "co"})); public static Set<String> IGNORABLE = new HashSet<String>(Arrays.asList(new String[]{"'","`", "\"", ","})); /** * The text being streamed. */ protected String text; /** * The current stream index into the text. */ protected int index; public TextStream(String text) { this.index = 0; this.text = text; } public void reset() { this.index = 0; } public char last() { if (this.index <= 1) { return (char)0; } return this.text.charAt(this.index - 2); } public char current() { if (atEnd()) { return (char)-1; } if (this.index <= 0) { return (char)0; } return this.text.charAt(this.index - 1); } public char peekPrevious() { if (this.index <= 1) { return (char)0; } return this.text.charAt(this.index - 2); } public char peek() { if (atEnd()) { return (char)-1; } return this.text.charAt(this.index); } public int peekCodePoint() { if (atEnd()) { return (int)-1; } return this.text.codePointAt(this.index); } public char next() { if (this.index >= this.text.length()) { return (char)-1; } char next = this.text.charAt(this.index); this.index++; return next; } public char previous() { this.index--; char previous = current(); return previous; } public void skip() { if (this.index >= this.text.length()) { return; } this.index++; } public void skip(int count) { this.index = Math.min(this.index + count, this.text.length()); } public void backup(int count) { this.index = Math.max(this.index - count, 0); } public void backup() { this.index--; } public String peek(int count) { int start = this.index; int end = Math.min(this.index + count, this.text.length()); return this.text.substring(start, end); } public String previous(int count) { int end = this.index - 2; if (end <= 0) { return ""; } int start = Math.max(end - count, 0); return this.text.substring(start, end); } public String next(int count) { int start = this.index; this.index = Math.min(this.index + count, this.text.length()); return this.text.substring(start, this.index); } public boolean atEnd() { return this.index >= this.text.length(); } public boolean atStart() { return this.index == 0; } public String upTo(char token) { return upTo(token, false); } public void skipTo(char token) { skipTo(token, false); } public String upToEnd() { int start = this.index; this.index = this.text.length(); return this.text.substring(start, this.index); } public String upTo(char token, boolean including) { return upTo(token, including, false); } public String upTo(char token, boolean including, boolean resetIfNotFound) { int start = this.index; boolean found = skipTo(token, including); if (resetIfNotFound && !found) { this.index = start; return ""; } return this.text.substring(start, this.index); } public boolean skipTo(char token, boolean including) { boolean found = false; while (!atEnd()) { if (peek() == token) { found = true; break; } skip(); } if (found && including) { skip(); } return found; } public void backupTo(char token) { backupTo(token, false); } public void backupTo(char token, boolean including) { boolean found = false; while (this.index > 0) { if (current() == token) { found = true; break; } backup(); } if (found && including) { backup(); } } public String upToAny(String tokens) { return upToAny(tokens, false); } public String upToAny(String tokens, boolean including) { int start = this.index; skipToAny(tokens, including); return this.text.substring(start, this.index); } public void skipToAny(String tokens) { skipToAny(tokens, false); } public void skipToAny(String tokens, boolean including) { boolean found = false; while (!atEnd()) { if (tokens.indexOf(peek()) != -1) { found = true; break; } skip(); } if (found && including) { skip(); } } public String upToAll(String tokens) { return upToAll(tokens, false); } public String upToAll(String tokens, boolean including) { return upToAll(tokens, including, false); } public String upToAll(String tokens, boolean including, boolean resetIfNotFound) { int start = this.index; boolean found = skipToAll(tokens, including); if (resetIfNotFound && !found) { this.index = start; return ""; } return this.text.substring(start, this.index); } public void skipToAll(String tokens) { skipToAll(tokens, false); } public boolean skipToAll(String tokens, boolean including) { int tokenIndex = 0; boolean found = false; while (!atEnd()) { if (peek() == tokens.charAt(tokenIndex)) { tokenIndex++; if (tokenIndex == tokens.length()) { found = true; break; } } else { tokenIndex = 0; } skip(); } if (found) { if (including) { skip(); } else { this.index = index - (tokens.length() - 1); } } return found; } public void backupToAll(String tokens) { backupToAll(tokens, false); } public void backupToAll(String tokens, boolean including) { int tokenIndex = 1; int length = tokens.length(); boolean found = false; while (this.index > 0) { if (current() == tokens.charAt(length - tokenIndex)) { if (tokenIndex == tokens.length()) { found = true; break; } tokenIndex++; } else { tokenIndex = 1; } backup(); } if (found) { if (including) { backup(); } else { this.index = index + (length - 1); } } } public void backupToAny(String tokens) { backupToAny(tokens, false); } public void backupToAny(String tokens, boolean including) { boolean found = false; while (!atStart()) { if (tokens.indexOf(peekPrevious()) != -1) { found = true; break; } backup(); } if (found && including) { backup(); } } public String peekWord() { int position = this.index; String word = nextWord(); this.index = position; return word; } public List<String> allWords() { List<String> words = new ArrayList<String>(); while (!atEnd()) { String word = nextWord(); if (word != null) { words.add(word); } } return words; } public List<String> csv() { List<String> words = new ArrayList<String>(); skipWhitespace(); while (!atEnd()) { String word = upTo(','); if (!atEnd()) { skip(); skipWhitespace(); } word = word.trim(); if ((word != null) && !word.isEmpty()) { words.add(word); } } return words; } public String peekPreviousWord() { int original = this.index; backupWhitespace(); int start = this.index; if (atStart()) { return null; } char current = current(); if ((TOKENS.indexOf(current) != -1) || (isWordSymbol(current))) { previous(); String word = String.valueOf(current); this.index = original; return word; } backupToAny(TOKENS); String word = this.text.substring(Math.max(this.index - 1, 0), start); this.index = original; return word; } public boolean isWordSymbol(char character) { Character.UnicodeBlock block = Character.UnicodeBlock.of(character); return (Character.isIdeographic(character) || (block == Character.UnicodeBlock.HIRAGANA) || (block == Character.UnicodeBlock.KATAKANA) || (block == Character.UnicodeBlock.HANGUL_SYLLABLES) || (block == Character.UnicodeBlock.HANGUL_JAMO)) && !Character.isDigit(character); } public String nextQuotes() { if (atEnd()) { return ""; } int start = this.index; skipQuotes(); int end = this.index - 1; if (atEnd() && current() != '"') { end = this.index; } String quotes = this.text.substring(start, end); return quotes; } public String nextStringQuotes() { if (atEnd()) { return ""; } int start = this.index; skipStringQuotes(); int end = this.index - 1; if (atEnd() && current() != '\'') { end = this.index; } String quotes = this.text.substring(start, end); return quotes; } public String nextStringDoubleQuotes() { if (atEnd()) { return ""; } int start = this.index; skipStringDoubleQuotes(); int end = this.index - 1; if (atEnd() && current() != '"') { end = this.index; } String quotes = this.text.substring(start, end); return quotes; } public String nextStringWithBracketsDoubleQuotes() { if (atEnd()) { return ""; } int start = this.index; skipStringWithBracketsDoubleQuotes(); int end = this.index - 1; if (atEnd() && current() != '"') { end = this.index; } String quotes = this.text.substring(start, end); return quotes; } public String nextQuotesExcludeDoubleQuote() { String quotes = nextQuotes(); if (quotes.contains("\"\"")) { quotes = quotes.replace("\"\"", "\""); } return quotes; } public void skipStringQuotes() { if (atEnd()) { return; } char next = next(); while (!atEnd() && (next != '\'')) { if ((next == '\\')) { skip(); } if (next == '{') { skipStringBrackets(); } next = next(); } } public void skipStringDoubleQuotes() { if (atEnd()) { return; } char next = next(); while (!atEnd() && (next != '"')) { if ((next == '\\')) { skip(); } next = next(); } } public void skipStringWithBracketsDoubleQuotes() { if (atEnd()) { return; } char next = next(); while (!atEnd() && (next != '"')) { if ((next == '\\')) { skip(); } if (next == '{') { skipStringDoubleQuoteBrackets(); } next = next(); } } public void skipQuotes() { if (atEnd()) { return; } char next = next(); char peek = peek(); while (!atEnd() && ((next != '"') || (peek == '"'))) { if ((next == '"') && (peek == '"')) { skip(); } if (next == '{') { skipBrackets(); } next = next(); peek = peek(); } } public void skipBrackets() { if (atEnd()) { return; } char next = next(); while (!atEnd() && (next != '}')) { if (next == '"') { skipQuotes(); } next = next(); } } public void skipStringBrackets() { if (atEnd()) { return; } char next = next(); while (!atEnd() && (next != '}')) { if (next == '\'') { skipStringQuotes(); } next = next(); } } public void skipStringDoubleQuoteBrackets() { if (atEnd()) { return; } char next = next(); while (!atEnd() && (next != '}')) { if ((next == '\\')) { skip(); } if (next == '"') { skipStringWithBracketsDoubleQuotes(); } if (next == '{') { skipStringDoubleQuoteBrackets(); } next = next(); } } public String nextWord() { skipWhitespace(); if (atEnd()) { return null; } //int peek = peekCodePoint(); char peek = peek(); boolean isSign = (peek == '-') || (peek == '+'); // Check for "1-1" vs "1 - -1" if (isSign && !atStart()) { backup(); char previous = peek(); if (Character.isLetterOrDigit(previous) || previous == ')') { skip(); skip(); return String.valueOf(peek); } skip(); } if (isWordSymbol(peek) || (!Character.isLetterOrDigit(peek) && peek != '_' && peek != '#' && peek != '@' && !isSign)) { skip(); Character.UnicodeBlock block = Character.UnicodeBlock.of(peek); if (block == Character.UnicodeBlock.HIGH_SURROGATES) { skip(); return this.text.substring(this.index - 2, this.index); } return String.valueOf(peek); } // Check url. if (peek == 'h') { if (peek(7).equals("http://") || peek(8).equals("https://")) { return upToAny(HTTP); } } int start = this.index; boolean wasDigit = Character.isDigit(peek); skip(); while (!atEnd()) { //peek = peekCodePoint(); peek = peek(); boolean isDigit = Character.isDigit(peek); if (isWordSymbol(peek)) { break; } // "-1" is ok if (isSign && !isDigit) { break; } else { // Allow "1.1", "1,000", "ab-ba", "ab_ba", "#abc" if (!Character.isLetter(peek) && !isDigit && (peek != '_') && (peek != '#')) { if (wasDigit) { if ((peek != '.') && (peek != ',') && (peek != '@')) { break; } } else { if ((peek != '-') && (peek != '_') && (peek != '@') && (peek != '.')) { break; } } } } wasDigit = isDigit; skip(); // Could have been "1.", "1.a" if ((((peek == '.') && !Character.isLetter(peek())) || (peek == ',')) && !Character.isDigit(peek())) { backup(); break; } isSign = false; } return this.text.substring(start, this.index); } public String nextSimpleWord() { skipWhitespace(); if (atEnd()) { return null; } char peek = peek(); if (isWordSymbol(peek) || (!Character.isLetterOrDigit(peek))) { skip(); Character.UnicodeBlock block = Character.UnicodeBlock.of(peek); if (block == Character.UnicodeBlock.HIGH_SURROGATES) { skip(); return this.text.substring(this.index - 2, this.index); } return String.valueOf(peek); } int start = this.index; skip(); while (!atEnd()) { peek = peek(); boolean isDigit = Character.isDigit(peek); if (isWordSymbol(peek)) { break; } if (!Character.isLetter(peek) && !isDigit) { break; } skip(); } return this.text.substring(start, this.index); } public void skipWord() { skipWhitespace(); if (atEnd()) { return; } if (TOKENS.indexOf(peek()) != -1) { skip(); } // Check url. if (peek() == 'h') { if (peek(7).equals("http://")) { skipToAny(WHITESPACE); } } skipToAny(TOKENS); } /** * Return the next paragraph text including full sentences up to the max text size. */ public String nextParagraph(int max) { skipWhitespace(); if (atEnd()) { return null; } int start = this.index; int last = this.index; while (!atEnd()) { skipSentence(); if ((this.index - start) > max) { if (start == last) { this.index = start + max; } else { this.index = last; } break; } last = this.index; } return this.text.substring(start, this.index); } public String nextSentence() { skipWhitespace(); if (atEnd()) { return null; } int start = this.index; skipSentence(); return this.text.substring(start, this.index); } public void skipSentence() { skipWhitespace(); if (atEnd()) { return; } skipToAny(TERMINATORS, true); while (!atEnd()) { char peek = peek(); boolean done = WHITESPACE.indexOf(peek) != -1; if (done) { if (current() != '.') { break; } backup(); String word = peekPreviousWord(); skip(); if (word != null && (ABBREVIATIONS.contains(word.toLowerCase()) || ((word.length() == 1) && (Character.isUpperCase(word.charAt(0)))))) { done = false; } else { break; } } else if (isWordSymbol(peek)) { break; } else if (peek == '<') { // Allow HTML as whitespace. break; } skipToAny(TERMINATORS, true); } } public String nextWhitespace() { int start = this.index; while (!atEnd() && (WHITESPACE.indexOf(peek()) != -1)) { skip(); } return this.text.substring(start, this.index); } public boolean skipWhitespace() { boolean found = false; while (!atEnd() && (Character.isWhitespace(peek()))) { skip(); found = true; } return found; } public void backupWhitespace() { while (!atStart() && (WHITESPACE.indexOf(current()) != -1)) { backup(); } } public String nextLine() { return upToAll("\n", true); } public String currentLine() { int position = this.index; backupToAll("\n"); String line = nextLine(); this.index = position; return line; } public int currentLineNumber() { int position = this.index; this.index = 0; int count = 0; int last = this.index; while (this.index < position) { skipLine(); if (this.index < position) { count++; } if (this.index == last) { break; } last = this.index; } this.index = position; return count + 1; } public int currentLineColumn() { int position = this.index; backupToAll("\n"); int column = position - this.index; this.index = position; return column + 1; } public void skipLine() { skipToAll("\n", true); } public int getPosition() { return index; } public void setPosition(int index) { this.index = index; } public String getText() { return text; } public String toString() { return peek(text.length()); } }