/*
 * This file is part of the Sejda source code
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.sejda.core.support.util;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Static helpers for cleaning up and inspecting strings. Not instantiable.
 */
public final class StringUtils {

    /** Matches the line/tab control characters that are dropped entirely. */
    private static final Pattern LINE_CONTROL_CHARS = Pattern.compile("[\\n\\t\\r]");

    /** Matches a Unicode separator immediately followed by a whitespace character. */
    private static final Pattern SEPARATOR_THEN_WHITESPACE = Pattern.compile("\\p{Z}\\s");

    /** Matches any single character in the Unicode "Separator" (Z) categories. */
    private static final Pattern SEPARATOR = Pattern.compile("\\p{Z}");

    private StringUtils() {
        // hide
    }

    /**
     * Normalizes the whitespace of the given string in three passes:
     * <ol>
     * <li>{@code \n}, {@code \t} and {@code \r} are removed (not replaced);</li>
     * <li>a Unicode separator directly followed by a whitespace character is collapsed into a single
     * plain space (so {@code "a  b"} becomes {@code "a b"});</li>
     * <li>every remaining Unicode separator, such as the non-breaking space (U+00A0) or the em space
     * (U+2003), is replaced with a plain space (U+0020).</li>
     * </ol>
     *
     * @param in
     *            the string to normalize, must not be null
     * @return the normalized string
     * @throws NullPointerException
     *             if {@code in} is null
     */
    public static String normalizeWhitespace(String in) {
        String result = LINE_CONTROL_CHARS.matcher(in).replaceAll("");
        // kept as a separate pass so that already-supported inputs produce the same output as before
        result = SEPARATOR_THEN_WHITESPACE.matcher(result).replaceAll(" ");
        // a lone separator never matched the two-character pattern above; previously only U+00A0 was
        // handled here, leaving the other Unicode spaces in place
        return SEPARATOR.matcher(result).replaceAll(" ");
    }

    /**
     * Renders every code point of the given string as {@code \U+XXXX} (upper case hex, no padding).
     * Useful to debug weird strings.
     *
     * @param in
     *            the string to dump, may be null
     * @return the concatenated code point representations, or null if {@code in} is null
     */
    public static String asUnicodes(String in) {
        if (in == null) {
            return null;
        }

        StringBuilder result = new StringBuilder();
        for (int offset = 0; offset < in.length();) {
            int codepoint = in.codePointAt(offset);
            result.append("\\U+").append(Integer.toHexString(codepoint).toUpperCase(Locale.ROOT));
            // supplementary code points span two chars
            offset += Character.charCount(codepoint);
        }
        return result.toString();
    }

    /**
     * Returns the characters that exist in s1 but not in s2. The comparison is done per UTF-16
     * {@code char}; the result has no duplicates and follows the order of first appearance in s1.
     *
     * @param s1
     *            the string whose characters are inspected, must not be null
     * @param s2
     *            the string the characters are looked up in, must not be null when s1 is not empty
     * @return the characters of s1 that do not occur in s2
     */
    public static Set<Character> difference(String s1, String s2) {
        Set<Character> result = new LinkedHashSet<>();
        for (char c : s1.toCharArray()) {
            if (s2.indexOf(c) < 0) {
                result.add(c);
            }
        }
        return result;
    }
}