/*
* Copyright (C) 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.android.talkback;
import android.content.Context;
import android.text.TextUtils;
import android.util.SparseIntArray;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Utilities for cleaning up speech text.
*/
public class SpeechCleanupUtils {
    /**
     * The regular expression used to match consecutive identical characters.
     * <p>
     * Double escaping of regex characters is required because this is a Java
     * string literal. The parenthesized character class is the first capturing
     * group; "\\1" refers back to whatever character that group matched, and
     * "{2,}" requires two or more additional repetitions of it. A match is
     * therefore a run of three or more identical symbol characters.
     */
    private static final String CONSECUTIVE_CHARACTER_REGEX =
            "([\\-\\\\/|!@#$%^&*\\(\\)=_+\\[\\]\\{\\}.?;'\":<>])\\1{2,}";

    /** The compiled Pattern used to match consecutive identical characters. */
    private static final Pattern CONSECUTIVE_CHARACTER_PATTERN =
            Pattern.compile(CONSECUTIVE_CHARACTER_REGEX);

    /**
     * Maps a character (used as an int key) to the string resource id of its
     * spoken description.
     */
    private static final SparseIntArray UNICODE_MAP = new SparseIntArray();

    static {
        UNICODE_MAP.put('&', R.string.symbol_ampersand);
        UNICODE_MAP.put('<', R.string.symbol_angle_bracket_left);
        UNICODE_MAP.put('>', R.string.symbol_angle_bracket_right);
        UNICODE_MAP.put('\'', R.string.symbol_apostrophe);
        UNICODE_MAP.put('*', R.string.symbol_asterisk);
        UNICODE_MAP.put('@', R.string.symbol_at_sign);
        UNICODE_MAP.put('\\', R.string.symbol_backslash);
        UNICODE_MAP.put('\u2022', R.string.symbol_bullet);
        UNICODE_MAP.put('^', R.string.symbol_caret);
        UNICODE_MAP.put('¢', R.string.symbol_cent);
        UNICODE_MAP.put(':', R.string.symbol_colon);
        UNICODE_MAP.put(',', R.string.symbol_comma);
        UNICODE_MAP.put('©', R.string.symbol_copyright);
        UNICODE_MAP.put('{', R.string.symbol_curly_bracket_left);
        UNICODE_MAP.put('}', R.string.symbol_curly_bracket_right);
        UNICODE_MAP.put('°', R.string.symbol_degree);
        UNICODE_MAP.put('\u00F7', R.string.symbol_division);
        UNICODE_MAP.put('$', R.string.symbol_dollar_sign);
        UNICODE_MAP.put('…', R.string.symbol_ellipsis);
        UNICODE_MAP.put('\u2014', R.string.symbol_em_dash);
        UNICODE_MAP.put('\u2013', R.string.symbol_en_dash);
        UNICODE_MAP.put('€', R.string.symbol_euro);
        UNICODE_MAP.put('!', R.string.symbol_exclamation_mark);
        UNICODE_MAP.put('`', R.string.symbol_grave_accent);
        UNICODE_MAP.put('-', R.string.symbol_hyphen_minus);
        UNICODE_MAP.put('„', R.string.symbol_low_double_quote);
        UNICODE_MAP.put('\u00D7', R.string.symbol_multiplication);
        UNICODE_MAP.put('\n', R.string.symbol_new_line);
        UNICODE_MAP.put('¶', R.string.symbol_paragraph_mark);
        UNICODE_MAP.put('(', R.string.symbol_parenthesis_left);
        UNICODE_MAP.put(')', R.string.symbol_parenthesis_right);
        UNICODE_MAP.put('%', R.string.symbol_percent);
        UNICODE_MAP.put('.', R.string.symbol_period);
        UNICODE_MAP.put('π', R.string.symbol_pi);
        UNICODE_MAP.put('#', R.string.symbol_pound);
        UNICODE_MAP.put('£', R.string.symbol_pound_sterling);
        UNICODE_MAP.put('?', R.string.symbol_question_mark);
        UNICODE_MAP.put('"', R.string.symbol_quotation_mark);
        UNICODE_MAP.put('®', R.string.symbol_registered_trademark);
        UNICODE_MAP.put(';', R.string.symbol_semicolon);
        UNICODE_MAP.put('/', R.string.symbol_slash);
        UNICODE_MAP.put(' ', R.string.symbol_space);
        UNICODE_MAP.put('[', R.string.symbol_square_bracket_left);
        UNICODE_MAP.put(']', R.string.symbol_square_bracket_right);
        UNICODE_MAP.put('√', R.string.symbol_square_root);
        UNICODE_MAP.put('™', R.string.symbol_trademark);
        UNICODE_MAP.put('_', R.string.symbol_underscore);
        UNICODE_MAP.put('|', R.string.symbol_vertical_bar);
        UNICODE_MAP.put('\u00a5', R.string.symbol_yen);
        UNICODE_MAP.put('\u00ac', R.string.symbol_not_sign);
        UNICODE_MAP.put('\u00a6', R.string.symbol_broken_bar);
        UNICODE_MAP.put('\u00b5', R.string.symbol_micro_sign);
        UNICODE_MAP.put('\u2248', R.string.symbol_almost_equals);
        UNICODE_MAP.put('\u2260', R.string.symbol_not_equals);
        UNICODE_MAP.put('\u00a4', R.string.symbol_currency_sign);
        UNICODE_MAP.put('\u00a7', R.string.symbol_section_sign);
        UNICODE_MAP.put('\u2191', R.string.symbol_upwards_arrow);
        UNICODE_MAP.put('\u2190', R.string.symbol_leftwards_arrow);
        UNICODE_MAP.put('\u20B9', R.string.symbol_rupee);
        UNICODE_MAP.put('\u2665', R.string.symbol_black_heart);
        UNICODE_MAP.put('\u007e', R.string.symbol_tilde);
        UNICODE_MAP.put('\u003d', R.string.symbol_equal);
        UNICODE_MAP.put('\uffe6', R.string.symbol_won);
        UNICODE_MAP.put('\u203b', R.string.symbol_reference);
        UNICODE_MAP.put('\u2606', R.string.symbol_white_star);
        UNICODE_MAP.put('\u2605', R.string.symbol_black_star);
        UNICODE_MAP.put('\u2661', R.string.symbol_white_heart);
        UNICODE_MAP.put('\u25cb', R.string.symbol_white_circle);
        UNICODE_MAP.put('\u25cf', R.string.symbol_black_circle);
        UNICODE_MAP.put('\u2299', R.string.symbol_solar);
        UNICODE_MAP.put('\u25ce', R.string.symbol_bullseye);
        UNICODE_MAP.put('\u2667', R.string.symbol_white_club_suit);
        UNICODE_MAP.put('\u2664', R.string.symbol_white_spade_suit);
        UNICODE_MAP.put('\u261c', R.string.symbol_white_left_pointing_index);
        UNICODE_MAP.put('\u261e', R.string.symbol_white_right_pointing_index);
        UNICODE_MAP.put('\u25d0', R.string.symbol_circle_left_half_black);
        UNICODE_MAP.put('\u25d1', R.string.symbol_circle_right_half_black);
        UNICODE_MAP.put('\u25a1', R.string.symbol_white_square);
        UNICODE_MAP.put('\u25a0', R.string.symbol_black_square);
        UNICODE_MAP.put('\u25b3', R.string.symbol_white_up_pointing_triangle);
        UNICODE_MAP.put('\u25bd', R.string.symbol_white_down_pointing_triangle);
        UNICODE_MAP.put('\u25c1', R.string.symbol_white_left_pointing_triangle);
        UNICODE_MAP.put('\u25b7', R.string.symbol_white_right_pointing_triangle);
        UNICODE_MAP.put('\u25c7', R.string.symbol_white_diamond);
        UNICODE_MAP.put('\u2669', R.string.symbol_quarter_note);
        UNICODE_MAP.put('\u266a', R.string.symbol_eighth_note);
        UNICODE_MAP.put('\u266c', R.string.symbol_beamed_sixteenth_note);
        UNICODE_MAP.put('\u2640', R.string.symbol_female);
        UNICODE_MAP.put('\u2642', R.string.symbol_male);
        UNICODE_MAP.put('\u3010', R.string.symbol_left_black_lenticular_bracket);
        UNICODE_MAP.put('\u3011', R.string.symbol_right_black_lenticular_bracket);
        UNICODE_MAP.put('\u300c', R.string.symbol_left_corner_bracket);
        UNICODE_MAP.put('\u300d', R.string.symbol_right_corner_bracket);
        UNICODE_MAP.put('\u2192', R.string.symbol_rightwards_arrow);
        UNICODE_MAP.put('\u2193', R.string.symbol_downwards_arrow);
        UNICODE_MAP.put('\u00b1', R.string.symbol_plus_minus_sign);
        UNICODE_MAP.put('\u2113', R.string.symbol_liter);
        UNICODE_MAP.put('\u2103', R.string.symbol_celsius_degree);
        UNICODE_MAP.put('\u2109', R.string.symbol_fahrenheit_degree);
        UNICODE_MAP.put('\u2252', R.string.symbol_approximately_equals);
        UNICODE_MAP.put('\u222b', R.string.symbol_integral);
    }

    /**
     * Cleans up text for speech. When the text consists of exactly one
     * non-whitespace character, or of whitespace only, it is replaced by the
     * spoken equivalent of that single character (for whitespace-only text,
     * of its first character). Any other text is returned unchanged.
     *
     * @param context The context used to resolve string resources.
     * @param text The text to clean up; may be {@code null}.
     * @return Cleaned up text, or {@code text} itself (possibly {@code null})
     *         when no conversion applies.
     */
    public static CharSequence cleanUp(Context context, CharSequence text) {
        if (text == null) {
            return null;
        }

        final int trimmedLength = TextUtils.getTrimmedLength(text);
        if (trimmedLength == 1) {
            // A single meaningful character, possibly padded with whitespace.
            return getCleanValueFor(context, text.toString().trim().charAt(0));
        }
        if (trimmedLength == 0 && text.length() > 0) {
            // Whitespace only (for example, just spaces): speak the first one.
            return getCleanValueFor(context, text.toString().charAt(0));
        }
        return text;
    }

    /**
     * Collapses repeated consecutive characters in a CharSequence by matching
     * against {@link #CONSECUTIVE_CHARACTER_REGEX}. Each run of three or more
     * identical symbol characters is replaced by the
     * {@code R.string.character_collapse_template} string formatted with the
     * run length and the clean value of the repeated character.
     *
     * @param context Context for retrieving resources
     * @param text The text to process
     * @return The text with consecutive identical characters collapsed. Returns
     *         {@code null} when {@code text} is null or empty, and the original
     *         {@code text} instance when it contains nothing to collapse.
     */
    public static CharSequence collapseRepeatedCharacters(Context context, CharSequence text) {
        if (TextUtils.isEmpty(text)) {
            return null;
        }

        // TODO: Add tests
        final Matcher matcher = CONSECUTIVE_CHARACTER_PATTERN.matcher(text);
        if (!matcher.find()) {
            // Nothing to collapse: hand back the caller's own instance.
            return text;
        }

        // Single pass over the input. StringBuffer is what
        // Matcher.appendReplacement accepts on all Java/Android versions.
        final StringBuffer collapsed = new StringBuffer(text.length());
        do {
            final String run = matcher.group();
            final String replacement = context.getString(R.string.character_collapse_template,
                    run.length(), getCleanValueFor(context, run.charAt(0)));
            // The replacement comes from localized resources and must be taken
            // literally; unquoted, any '$' or '\' in it would be interpreted as
            // a group reference or escape by the regex engine.
            matcher.appendReplacement(collapsed, Matcher.quoteReplacement(replacement));
        } while (matcher.find());
        matcher.appendTail(collapsed);

        return collapsed.toString();
    }

    /**
     * Convenience method that feeds the given text through
     * {@link #collapseRepeatedCharacters} and then {@link #cleanUp}.
     *
     * @param context Context for retrieving resources
     * @param text The text to process
     * @return The collapsed and cleaned up text; {@code null} when
     *         {@code text} is null or empty.
     */
    public static CharSequence collapseRepeatedCharactersAndCleanUp(Context context,
            CharSequence text) {
        return cleanUp(context, collapseRepeatedCharacters(context, text));
    }

    /**
     * Returns the "clean" value for the specified character: the mapped
     * spoken description if the character has an entry in the symbol map,
     * the {@code R.string.template_capital_letter} string formatted with the
     * character if it is upper case, and otherwise the character itself as a
     * string.
     *
     * @param context The context used to resolve string resources.
     * @param key The character to convert.
     * @return The text to speak for {@code key}.
     */
    public static String getCleanValueFor(Context context, char key) {
        // A result of 0 is treated as "no entry for this character".
        final int resId = UNICODE_MAP.get(key);
        if (resId != 0) {
            return context.getString(resId);
        }

        final String literal = Character.toString(key);
        if (Character.isUpperCase(key)) {
            return context.getString(R.string.template_capital_letter, literal);
        }
        return literal;
    }
}