/* LanguageTool, a natural language style checker
* Copyright (C) 2009 Ionuț Păduraru
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.wikipedia;
/**
* Helper class for romanian diacritics correction. Many romanian texts
* (including Romanian wikipedia) contains wrong diacritics: <b>ş</b> instead of
* <b>ș</b> and <b>ţ</b> instead of <b>ț</b>.
*
* @author Ionuț Păduraru
*/
public final class RomanianDiacriticsModifier {
private RomanianDiacriticsModifier() {
// private constructor
}
private static final int REPLACEMENT_BUFF_SIZE = 10 * 1024;
private static char[] cCorrectDiacritics = null;
private static char[] replacementBuff = null;
/**
* Initialize internal buffers
*/
private synchronized static void initCharMap() {
if (cCorrectDiacritics == null) {
replacementBuff = new char[REPLACEMENT_BUFF_SIZE];
cCorrectDiacritics = new char[Character.MAX_VALUE - Character.MIN_VALUE];
char c = Character.MIN_VALUE;
for (int i = 0; i < Character.MAX_VALUE - Character.MIN_VALUE; i++) {
final char newC = diac(c);
cCorrectDiacritics[i] = newC;
c++;
}
}
}
/**
* Single character correction. Used internally during buffers
* initialization.
*/
private static char diac(char c) {
char result = c;
switch (c) {
case 'ş':
result = 'ș';
break;
case 'ţ':
result = 'ț';
break;
case 'Ţ':
result = 'Ț';
break;
case 'Ş':
result = 'Ș';
break;
default:
break;
}
return result;
}
/**
* Romanian diacritics correction: replace <b>ş</b> with <b>ș</b> and
* <b>ţ</b> with <b>ț</b> (including upper-case variants).<br/>
* Thread-safe method.
*/
public static synchronized String correctDiacritics(String s) {
if (null == s) {
return null;
}
initCharMap();
final int length = s.length();
// check buffer size
if (length > replacementBuff.length) {
replacementBuff = new char[length];
}
// get current chars
s.getChars(0, length, replacementBuff, 0);
// replace
for (int i = 0; i < length; i++) {
replacementBuff[i] = cCorrectDiacritics[replacementBuff[i]];
}
// return the corrected string
return String.valueOf(replacementBuff, 0, length);
}
}