/********************************************************************************** * $URL: https://source.sakaiproject.org/svn/search/trunk/search-api/api/src/java/org/sakaiproject/search/api/SearchUtils.java $ * $Id: SearchUtils.java 105078 2012-02-24 23:00:38Z ottenhoff@longsight.com $ *********************************************************************************** * * Copyright (c) 2006, 2007, 2008 The Sakai Foundation * * Licensed under the Educational Community License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.opensource.org/licenses/ECL-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **********************************************************************************/ package org.sakaiproject.search.api; public class SearchUtils { public static String getCleanStringXX(String text) { text = text.replaceAll("[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f\\ud800-\\udfff\\uffff\\ufffe]", ""); return text; } /** * @param string * @param sb * @param minWordLength * @return */ public static StringBuilder filterWordLengthIgnore(String string, StringBuilder sb, int minWordLength) { if (sb == null) { sb = new StringBuilder(); } if (true) { sb.append(string); return sb; } if (minWordLength == -1) { sb.append(string); return sb; } char[] content = string.toCharArray(); int startOfWord = -1; boolean symbol = false; for (int i = 0; i < content.length; i++) { // only take words longer than 3 charaters // if ( isIdiom(content[i]) ) { // symbol = true; // } if (Character.isWhitespace(content[i])) { if (startOfWord != -1 && (symbol || (i - startOfWord) > minWordLength)) { if (!symbol || Character.isWhitespace(content[startOfWord])) { content[startOfWord] = ' '; } else if ((sb.length() > 0) && sb.charAt(sb.length() - 1) != ' ') { sb.append(' '); } String word = new String(content, startOfWord, i - startOfWord); sb.append(word); } symbol = false; startOfWord = i; } else { if (startOfWord == -1) { startOfWord = i - 1; if (startOfWord == -1) { startOfWord = 0; } } } } if (startOfWord != -1 && (content.length - startOfWord - 1) > minWordLength) { if (Character.isWhitespace(content[startOfWord])) { content[startOfWord] = ' '; } String word = new String(content, startOfWord, content.length - startOfWord); sb.append(word).append(" "); } return sb; } /** * @param string * @param sb */ public static StringBuilder appendCleanString(String string, StringBuilder sb) { if ( string == null ) { return sb; } return appendCleanString(string.toCharArray(),sb); } public static StringBuilder appendCleanString(char[] content, StringBuilder sb) { if (sb == null) { sb = new StringBuilder(); } boolean ignore = true; for (int i = 0; i < content.length; i++) { char c = content[i]; if (Character.isWhitespace(c) || Character.isISOControl(c) || (c == 160 ) || (c >= 0x00 && c <= 0x08) || (c == 0x0b) || (c == 0x0c) || (c == 0x0e && c <= 0x1f) || (c >= 0xd800 && c <= 0xdfff) || (c == 0xffff) || (c == 0xfffe)) { ignore = true; } else { if (ignore) { sb.append(" "); ignore = false; } sb.append(c); } } return sb; } }