package org.jabref.model.strings; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.base.CharMatcher; import org.apache.commons.lang3.StringUtils; public class StringUtil { // Non-letters which are used to denote accents in LaTeX-commands, e.g., in {\"{a}} public static final String SPECIAL_COMMAND_CHARS = "\"`^~'=.|"; // contains all possible line breaks, not omitting any break such as "\\n" private static final Pattern LINE_BREAKS = Pattern.compile("\\r\\n|\\r|\\n"); private static final Pattern BRACED_TITLE_CAPITAL_PATTERN = Pattern.compile("\\{[A-Z]+\\}"); private static final UnicodeToReadableCharMap UNICODE_CHAR_MAP = new UnicodeToReadableCharMap(); public static String booleanToBinaryString(boolean expression) { return expression ? "1" : "0"; } /** * Quote special characters. * * @param toQuote The String which may contain special characters. * @param specials A String containing all special characters except the quoting * character itself, which is automatically quoted. * @param quoteChar The quoting character. * @return A String with every special character (including the quoting * character itself) quoted. */ public static String quote(String toQuote, String specials, char quoteChar) { if (toQuote == null) { return ""; } StringBuilder result = new StringBuilder(); char c; boolean isSpecial; for (int i = 0; i < toQuote.length(); ++i) { c = toQuote.charAt(i); isSpecial = (c == quoteChar); // If non-null specials performs logic-or with specials.indexOf(c) >= 0 isSpecial |= ((specials != null) && (specials.indexOf(c) >= 0)); if (isSpecial) { result.append(quoteChar); } result.append(c); } return result.toString(); } /** * Creates a substring from a text * * @param text * @param startIndex * @param terminateOnEndBraceOnly * @return */ public static String getPart(String text, int startIndex, boolean terminateOnEndBraceOnly) { char c; int count = 0; StringBuilder part = new StringBuilder(); // advance to first char and skip whitespace int index = startIndex + 1; while ((index < text.length()) && Character.isWhitespace(text.charAt(index))) { index++; } // then grab whatever is the first token (counting braces) while (index < text.length()) { c = text.charAt(index); if (!terminateOnEndBraceOnly && (count == 0) && Character.isWhitespace(c)) { // end argument and leave whitespace for further processing break; } if ((c == '}') && (--count < 0)) { break; } else if (c == '{') { count++; } part.append(c); index++; } return part.toString(); } /** * Returns the string, after shaving off whitespace at the beginning and end, * and removing (at most) one pair of braces or " surrounding it. * * @param toShave * @return */ public static String shaveString(String toShave) { if ((toShave == null) || (toShave.isEmpty())) { return ""; } String shaved = toShave.trim(); if (isInCurlyBrackets(shaved) || isInCitationMarks(shaved)) { return shaved.substring(1, shaved.length() - 1); } return shaved; } /** * Concatenate all strings in the array from index 'from' to 'to' (excluding * to) with the given separator. * <p> * Example: * <p> * String[] s = "ab/cd/ed".split("/"); join(s, "\\", 0, s.length) -> * "ab\\cd\\ed" * * @param strings * @param separator * @param from * @param to Excluding strings[to] * @return */ public static String join(String[] strings, String separator, int from, int to) { if ((strings.length == 0) || (from >= to)) { return ""; } int updatedFrom = Math.max(from, 0); int updatedTo = Math.min(strings.length, to); StringBuilder stringBuilder = new StringBuilder(); for (int i = updatedFrom; i < (updatedTo - 1); i++) { stringBuilder.append(strings[i]).append(separator); } return stringBuilder.append(strings[updatedTo - 1]).toString(); } /** * Removes optional square brackets from the string s * * @param toStrip * @return */ public static String stripBrackets(String toStrip) { if (isInSquareBrackets(toStrip)) { return toStrip.substring(1, toStrip.length() - 1); } return toStrip; } /** * extends the filename with a default Extension, if no Extension '.x' could * be found */ public static String getCorrectFileName(String orgName, String defaultExtension) { if (orgName == null) { return ""; } if (orgName.toLowerCase(Locale.ROOT).endsWith("." + defaultExtension.toLowerCase(Locale.ROOT))) { return orgName; } int hiddenChar = orgName.indexOf('.', 1); // hidden files Linux/Unix (?) if (hiddenChar < 1) { return orgName + "." + defaultExtension; } return orgName; } /** * Formats field contents for output. Must be "symmetric" with the parse method above, * so stored and reloaded fields are not mangled. * * @param in * @param wrapAmount * @param newline * @return the wrapped String. */ public static String wrap(String in, int wrapAmount, String newline) { String[] lines = in.split("\n"); StringBuilder result = new StringBuilder(); // remove all whitespace at the end of the string, this especially includes \r created when the field content has \r\n as line separator addWrappedLine(result, CharMatcher.WHITESPACE.trimTrailingFrom(lines[0]), wrapAmount, newline); // See for (int i = 1; i < lines.length; i++) { if (lines[i].trim().isEmpty()) { result.append(newline); result.append('\t'); } else { result.append(newline); result.append('\t'); result.append(newline); result.append('\t'); // remove all whitespace at the end of the string, this especially includes \r created when the field content has \r\n as line separator String line = CharMatcher.WHITESPACE.trimTrailingFrom(lines[i]); addWrappedLine(result, line, wrapAmount, newline); } } return result.toString(); } private static void addWrappedLine(StringBuilder result, String line, int wrapAmount, String newline) { // Set our pointer to the beginning of the new line in the StringBuffer: int length = result.length(); // Add the line, unmodified: result.append(line); while (length < result.length()) { int current = result.indexOf(" ", length + wrapAmount); if ((current < 0) || (current >= result.length())) { break; } result.deleteCharAt(current); result.insert(current, newline + "\t"); length = current + newline.length(); } } /** * Quotes each and every character, e.g. '!' as !. Used for verbatim * display of arbitrary strings that may contain HTML entities. */ public static String quoteForHTML(String toQuote) { StringBuilder result = new StringBuilder(); for (int i = 0; i < toQuote.length(); ++i) { result.append("&#").append((int) toQuote.charAt(i)).append(';'); } return result.toString(); } /** * Decodes an encoded double String array back into array form. The array * is assumed to be square, and delimited by the characters ';' (first dim) and * ':' (second dim). * @param value The encoded String to be decoded. * @return The decoded String array. */ public static String[][] decodeStringDoubleArray(String value) { List<List<String>> newList = new ArrayList<>(); StringBuilder sb = new StringBuilder(); List<String> thisEntry = new ArrayList<>(); boolean escaped = false; for (int i = 0; i < value.length(); i++) { char c = value.charAt(i); if (!escaped && (c == '\\')) { escaped = true; continue; } else if (!escaped && (c == ':')) { thisEntry.add(sb.toString()); sb = new StringBuilder(); } else if (!escaped && (c == ';')) { thisEntry.add(sb.toString()); sb = new StringBuilder(); newList.add(thisEntry); thisEntry = new ArrayList<>(); } else { sb.append(c); } escaped = false; } if (sb.length() > 0) { thisEntry.add(sb.toString()); } if (!thisEntry.isEmpty()) { newList.add(thisEntry); } // Convert to String[][]: String[][] res = new String[newList.size()][]; for (int i = 0; i < res.length; i++) { res[i] = new String[newList.get(i).size()]; for (int j = 0; j < res[i].length; j++) { res[i][j] = newList.get(i).get(j); } } return res; } /** * Wrap all uppercase letters, or sequences of uppercase letters, in curly * braces. Ignore letters within a pair of # character, as these are part of * a string label that should not be modified. * * @param s * The string to modify. * @return The resulting string after wrapping capitals. */ public static String putBracesAroundCapitals(String s) { boolean inString = false; boolean isBracing = false; boolean escaped = false; int inBrace = 0; StringBuilder buf = new StringBuilder(); for (int i = 0; i < s.length(); i++) { // Update variables based on special characters: int c = s.charAt(i); if (c == '{') { inBrace++; } else if (c == '}') { inBrace--; } else if (!escaped && (c == '#')) { inString = !inString; } // See if we should start bracing: if ((inBrace == 0) && !isBracing && !inString && Character.isLetter((char) c) && Character.isUpperCase((char) c)) { buf.append('{'); isBracing = true; } // See if we should close a brace set: if (isBracing && !(Character.isLetter((char) c) && Character.isUpperCase((char) c))) { buf.append('}'); isBracing = false; } // Add the current character: buf.append((char) c); // Check if we are entering an escape sequence: escaped = (c == '\\') && !escaped; } // Check if we have an unclosed brace: if (isBracing) { buf.append('}'); } return buf.toString(); } /** * This method looks for occurrences of capital letters enclosed in an * arbitrary number of pairs of braces, e.g. "{AB}" or "{{T}}". All of these * pairs of braces are removed. * * @param s * The String to analyze. * @return A new String with braces removed. */ public static String removeBracesAroundCapitals(String s) { String current = s; String previous = s; while ((current = removeSingleBracesAroundCapitals(current)).length() < previous.length()) { previous = current; } return current; } /** * This method looks for occurrences of capital letters enclosed in one pair * of braces, e.g. "{AB}". All these are replaced by only the capitals in * between the braces. * * @param s * The String to analyze. * @return A new String with braces removed. */ private static String removeSingleBracesAroundCapitals(String s) { Matcher mcr = BRACED_TITLE_CAPITAL_PATTERN.matcher(s); StringBuffer buf = new StringBuffer(); while (mcr.find()) { String replaceStr = mcr.group(); mcr.appendReplacement(buf, replaceStr.substring(1, replaceStr.length() - 1)); } mcr.appendTail(buf); return buf.toString(); } /** * Replaces all platform-dependent line breaks by OS.NEWLINE line breaks. * * We do NOT use UNIX line breaks as the user explicitly configures its linebreaks and this method is used in bibtex field writing * * <example> * Legacy Macintosh \r -> OS.NEWLINE * Windows \r\n -> OS.NEWLINE * </example> * * @return a String with only OS.NEWLINE as line breaks */ public static String unifyLineBreaks(String s, String newline) { return LINE_BREAKS.matcher(s).replaceAll(newline); } /** * Checks if the given String has exactly one pair of surrounding curly braces <br> * Strings with escaped characters in curly braces at the beginning and end are respected, too * @param toCheck The string to check * @return True, if the check was succesful. False otherwise. */ public static boolean isInCurlyBrackets(String toCheck) { int count = 0; int brackets = 0; if ((toCheck == null) || toCheck.isEmpty()) { return false; } else { if ((toCheck.charAt(0) == '{') && (toCheck.charAt(toCheck.length() - 1) == '}')) { for (char c : toCheck.toCharArray()) { if (c == '{') { if (brackets == 0) { count++; } brackets++; } else if (c == '}') { brackets--; } } return count == 1; } return false; } } public static boolean isInSquareBrackets(String toCheck) { if ((toCheck == null) || toCheck.isEmpty()) { return false; // In case of null or empty string } else { return (toCheck.charAt(0) == '[') && (toCheck.charAt(toCheck.length() - 1) == ']'); } } public static boolean isInCitationMarks(String toCheck) { if ((toCheck == null) || (toCheck.length() <= 1)) { return false; // In case of null, empty string, or a single citation mark } else { return (toCheck.charAt(0) == '"') && (toCheck.charAt(toCheck.length() - 1) == '"'); } } /** * Optimized method for converting a String into an Integer * * From http://stackoverflow.com/questions/1030479/most-efficient-way-of-converting-string-to-integer-in-java * * @param str the String holding an Integer value * @throws NumberFormatException if str cannot be parsed to an int * @return the int value of str */ public static int intValueOf(String str) { int idx = 0; int end; boolean sign = false; char ch; if ((str == null) || ((end = str.length()) == 0) || ((((ch = str.charAt(0)) < '0') || (ch > '9')) && (!(sign = ch == '-') || (++idx == end) || ((ch = str.charAt(idx)) < '0') || (ch > '9')))) { throw new NumberFormatException(str); } int ival = 0; for (;; ival *= 10) { ival += '0' - ch; if (++idx == end) { return sign ? ival : -ival; } if (((ch = str.charAt(idx)) < '0') || (ch > '9')) { throw new NumberFormatException(str); } } } /** * Optimized method for converting a String into an Integer * * From http://stackoverflow.com/questions/1030479/most-efficient-way-of-converting-string-to-integer-in-java * * @param str the String holding an Integer value * @return the int value of str or Optional.empty() if not possible */ public static Optional<Integer> intValueOfOptional(String str) { int idx = 0; int end; boolean sign = false; char ch; if ((str == null) || ((end = str.length()) == 0) || ((((ch = str.charAt(0)) < '0') || (ch > '9')) && (!(sign = ch == '-') || (++idx == end) || ((ch = str.charAt(idx)) < '0') || (ch > '9')))) { return Optional.empty(); } int ival = 0; for (;; ival *= 10) { ival += '0' - ch; if (++idx == end) { return Optional.of(sign ? ival : -ival); } if (((ch = str.charAt(idx)) < '0') || (ch > '9')) { return Optional.empty(); } } } /** * This method ensures that the output String has only * valid XML unicode characters as specified by the * XML 1.0 standard. For reference, please see * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the * standard</a>. This method will return an empty * String if the input is null or empty. * <p> * URL: http://cse-mjmcl.cse.bris.ac.uk/blog/2007/02/14/1171465494443.html * * @param in The String whose non-valid characters we want to remove. * @return The in String, stripped of non-valid characters. */ public static String stripNonValidXMLCharacters(String in) { if ((in == null) || in.isEmpty()) { return ""; // vacancy test. } StringBuilder out = new StringBuilder(); // Used to hold the output. char current; // Used to reference the current character. for (int i = 0; i < in.length(); i++) { current = in.charAt(i); // NOTE: No IndexOutOfBoundsException caught here; it should not happen. if ((current == 0x9) || (current == 0xA) || (current == 0xD) || ((current >= 0x20) && (current <= 0xD7FF)) || ((current >= 0xE000) && (current <= 0xFFFD))) { out.append(current); } } return out.toString(); } /* * @param buf String to be tokenized * @param delimstr Delimiter string * @return list {@link java.util.List} of <tt>String</tt> */ public static List<String> tokenizeToList(String buf, String delimstr) { List<String> list = new ArrayList<>(); String buffer = buf + '\n'; StringTokenizer st = new StringTokenizer(buffer, delimstr); while (st.hasMoreTokens()) { list.add(st.nextToken()); } return list; } public static String limitStringLength(String s, int maxLength) { if (s == null) { return ""; } if (s.length() <= maxLength) { return s; } return s.substring(0, maxLength - 3) + "..."; } /** * Replace non-English characters like umlauts etc. with a sensible letter or letter combination that bibtex can * accept. The basis for replacement is the HashMap UnicodeToReadableCharMap. */ public static String replaceSpecialCharacters(String s) { String result = s; for (Map.Entry<String, String> chrAndReplace : UNICODE_CHAR_MAP.entrySet()) { result = result.replace(chrAndReplace.getKey(), chrAndReplace.getValue()); } return result; } /** * Return a String with n spaces * * @param n Number of spaces * @return String with n spaces */ public static String repeatSpaces(int n) { return repeat(n, ' '); } /** * Return a String with n copies of the char c * * @param n Number of copies * @param c char to copy * @return String with n copies of c */ public static String repeat(int n, char c) { StringBuilder resultSB = new StringBuilder(n); for (int i = 0; i < n; i++) { resultSB.append(c); } return resultSB.toString(); } public static boolean isNullOrEmpty(String toTest) { return ((toTest == null) || toTest.isEmpty()); } public static boolean isBlank(String string) { return !isNotBlank(string); } public static boolean isBlank(Optional<String> string) { return !isNotBlank(string); } public static boolean isNotBlank(String string) { return StringUtils.isNotBlank(string); } public static boolean isNotBlank(Optional<String> string) { return string.isPresent() && isNotBlank(string.get()); } /** * Return string enclosed in HTML bold tags */ public static String boldHTML(String input) { return "<b>" + input + "</b>"; } /** * Return string enclosed in HTML bold tags if not null, otherwise return alternative text in HTML bold tags */ public static String boldHTML(String input, String alternative) { if (input == null) { return "<b>" + alternative + "</b>"; } return "<b>" + input + "</b>"; } /** * Unquote special characters. * * @param toUnquote The String which may contain quoted special characters. * @param quoteChar The quoting character. * @return A String with all quoted characters unquoted. */ public static String unquote(String toUnquote, char quoteChar) { StringBuilder result = new StringBuilder(); char c; boolean quoted = false; for (int i = 0; i < toUnquote.length(); ++i) { c = toUnquote.charAt(i); if (quoted) { // append literally... if (c != '\n') { result.append(c); } quoted = false; } else if (c == quoteChar) { // quote char quoted = true; } else { result.append(c); } } return result.toString(); } public static String stripAccents(String searchQuery) { return StringUtils.stripAccents(searchQuery); } /** * Make first character of String uppercase, and the * rest lowercase. */ public static String capitalizeFirst(String toCapitalize) { if (toCapitalize.length() > 1) { return toCapitalize.substring(0, 1).toUpperCase(Locale.ROOT) + toCapitalize.substring(1, toCapitalize.length()).toLowerCase(Locale.ROOT); } else { return toCapitalize.toUpperCase(Locale.ROOT); } } /** * Returns a list of words contained in the given text. * Whitespace, comma and semicolon are considered as separator between words. * * @param text the input * @return a list of words */ public static List<String> getStringAsWords(String text) { return Arrays.asList(text.split("[\\s,;]+")); } public static boolean containsIgnoreCase(String text, String searchString) { return StringUtils.containsIgnoreCase(text, searchString); } }