/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.coreutils;
import java.awt.ComponentOrientation;
/**
* Text utilities
*/
public class TextUtil {
/**
* Determine and return text orientation
*
* @param text text to determine the text orientation in
*
* @return detected text orientation that should be used for this type of
* text
*/
public static ComponentOrientation getTextDirection(String text) {
int rtl_cnt = 0;
for (char c : text.toCharArray()) {
if (Character.isWhitespace(c)) {
continue;
}
// count the RTL chars
byte direction = Character.getDirectionality(c);
if (direction == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC
|| direction == Character.DIRECTIONALITY_RIGHT_TO_LEFT) {
++rtl_cnt;
}
}
ComponentOrientation orientation = ComponentOrientation.LEFT_TO_RIGHT;
if (text.length() > 1024 && rtl_cnt > 50) {
orientation = ComponentOrientation.RIGHT_TO_LEFT;
} else if (text.length() <= 1024 && rtl_cnt > text.length() / 4) {
orientation = ComponentOrientation.RIGHT_TO_LEFT;
}
return orientation;
}
/**
* This method determines if a passed-in Java char (16 bits) is a valid
* UTF-8 printable character, returning true if so, false if not.
*
* Note that this method can have ramifications for characters outside the
* Unicode Base Multilingual Plane (BMP), which require more than 16 bits.
* We are using Java characters (16 bits) to look at the data and this will
* not accurately identify any non-BMP character (larger than 16 bits)
* ending with 0xFFFF and 0xFFFE. In the interest of a fast solution, we
* have chosen to ignore the extended planes above Unicode BMP for the time
* being. The net result of this is some non-BMP characters may be
* interspersed with '^' characters in Autopsy.
*
* @param ch the character to test
*
* @return Returns true if the character is valid UTF-8, false if not.
*/
public static boolean isValidSolrUTF8(char ch) {
return ((ch <= 0xFDD0 || ch >= 0xFDEF) && (ch > 0x1F || ch == 0x9 || ch == 0xA || ch == 0xD) && (ch != 0xFFFF) && (ch != 0xFFFE));
}
}