/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.lmf.transform; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Different useful functions * @author chebotar * */ public class StringUtils { private static final Pattern HTML_ENTITIES = Pattern.compile("^(.*?)&#(\\d+);"); /** Replaces HTML entities such as ̣ with the corresponding * characters. */ public static String replaceHtmlEntities(final String text) { String t = text; StringBuilder result = new StringBuilder(); do { Matcher matcher = HTML_ENTITIES.matcher(t); if (matcher.find()) { result.append(matcher.group(1)); result.append((char) Integer.parseInt(matcher.group(2))); t = matcher.replaceFirst(""); } else break; } while (true); result.append(t); // if (!text.equals(result.toString())) // System.out.println(">" + text + "< to >" + result.toString() + "<"); return result.toString(); } /** * Removes all UTF8 characters that cause errors in MySQL database * @param text * @return */ public static String replaceNonUtf8(String text){ text = text.replaceAll("[^\\u0000-\\uFFFF]", "?"); return text; } /** * Removes all UTF8 characters that cause errors in MySQL database * and trims the text to maxLenth * @param text * @param maxLength * @return */ public static String replaceNonUtf8(String text, int maxLength){ if(text.length() > maxLength) text = text.substring(0, maxLength-1); return replaceNonUtf8(text); } }