/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.check;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.wikipediacleaner.api.constants.EnumWikipedia;
/**
* Management of special characters.
*/
public class SpecialCharacters {
/**
* Punctuation characters.
*/
private final static String punctuation = ".,;:!?";
/**
* Characters authorized for every Wiki.
*/
private final static String authorizedCharacters =
"abcdefghijklmnopqrstuvwxyz" +
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
" 0123456789-:,.!?'#/()*";
/**
* Characters authorized for specific Wiki.
*/
private final static Map<EnumWikipedia, String> localAuthorizedCharacters =
new HashMap<EnumWikipedia, String>();
/**
* Possible replacements for unauthorized characters for every Wiki.
*/
private final static Map<Character, String> replacements = new HashMap<Character, String>();
/**
* Possible replacements for unauthorized characters for specific Wiki.
*/
private final static Map<EnumWikipedia, Map<Character, String>> localReplacements =
new HashMap<EnumWikipedia, Map<Character,String>>();
/**
* Set to true to check all replacements.
*/
private final static boolean CHECK_REPLACEMENTS = false;
static {
// Possible replacements for every Wiki
replacements.put((char) 0xFEFF, "");
replacements.put((char) 0x200E, "");
replacements.put((char) 0x200B, "");
addReplacements("ʻ’“”„‟′", "'");
addReplacements("ÀÁÂÃÄÅĀĂĄǍǞǠǺȀȂȦȺАḀẠẢẤẦẨẪẬẮẰẲẴẶ", "A");
addReplacements("àáâãäåāăąǎǟǡǻȁȃȧаḁẚạảấầẩẫậắằẳẵặⱥ", "a");
addReplacements("ÆǢǼ", "Ae");
addReplacements("æǣǽ", "ae");
addReplacements("ƁɃḂḄḆ", "B");
addReplacements("ƀɓбᶀḃḅḇ", "b");
addReplacements("ÇĆĈĊČƇȻḈ", "C");
addReplacements("çćĉċčƈȼḉ", "c");
addReplacements("ÐĎĐƉƊƋḊḌḎḐ", "D");
addReplacements("ðďđƌȡɖɗᶁḋḍḏḑ", "d");
addReplacements("DžDz", "Dz");
addReplacements("ÈÉÊËĒĔĖĘĚƎƏƐȄȆȨɆΕЁЕḘḚẸẺẼẾỀỂỄỆ", "E");
addReplacements("èéêëēĕėęěǝȅȇȩɇəеёḕḗḙḛḝẹẻẽếềểễệ", "e");
addReplacements("ƑḞ", "F");
addReplacements("ƒᶂḟ", "f");
addReplacements("ĜĞĠĢƓǤǦǴḠ", "G");
addReplacements("ĝğġģǥǧǵɠᶃḡ", "g");
addReplacements("ĤĦȞḢḤḦḨḪⱧΗ", "H");
addReplacements("ĥħȟɦḣḥḧḩḫẖⱨ", "h");
addReplacements("ÌÍÎÏĨĪĬĮİƗǏȈȊІỊ", "I");
addReplacements("ìíîïĩīĭįıǐȉȋɨіỉị", "i");
addReplacements("ĴɈ", "J");
addReplacements("ĵǰɉ", "j");
addReplacements("ĶƘǨКḰḲḴⱩ", "K");
addReplacements("ķƙǩкᶄḱḳḵⱪ", "k");
addReplacements("ĹĻĽĿŁȽḶḸḺⱠⱢ", "L");
addReplacements("ĺļľŀłƚȴлᶅḷḹḻⱡ", "l");
addReplacements("Lj", "Lj");
addReplacements("lj", "lj");
addReplacements("МḾṀṂ", "M");
addReplacements("ɱмᶆḿṁṃ", "m");
addReplacements("ÑŃŅŇƝǸȠṄṆṈ", "N");
addReplacements("ñńņňŊŋƞǹȵɲᶇṅṇṉ", "n");
addReplacements("Nj", "Nj");
addReplacements("nj", "nj");
addReplacements("ÒÓÔÕÖØŌŎŐƟƠǑǪǬǾȌȎȪȬȮȰОỌỎỐỒỔỖỘỚỜỞỠỢ", "O");
addReplacements("òóôõöøōŏőơǒǫǭǿȍȏȫȭȯȱɵоọỏốồổỗộớờởỡợ", "o");
addReplacements("Œ", "Oe");
addReplacements("œ", "oe");
addReplacements("ƤṔṖⱣ", "P");
addReplacements("ƥᵽᶈṕṗ", "p");
addReplacements("ʠ", "q");
addReplacements("ŔŖŘȐȒɌṘṚṜṞⱤ", "R");
addReplacements("ŕŗřȑȓɍᶉṙṛṝṟ", "r");
addReplacements("ŚŜŞŠȘṠṢ", "S");
addReplacements("śŝşšșȿᶊṡṣ", "s");
addReplacements("ß", "ss");
addReplacements("ŢŤŦƬƮȚȾṪṬṮṰ", "T");
addReplacements("ţťŧƫƭțȶʈṫṭṯṱẗⱦ", "t");
addReplacements("Þ", "Th");
addReplacements("þ", "th");
addReplacements("ÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖɄṲṶỤỨỪỬỮỰ", "U");
addReplacements("ùúûüũūŭůűųưǔǖǘǚǜȕȗʉṳṷụủứừửữự", "u");
addReplacements("ƲṼṾ", "V");
addReplacements("ʋṽṿⱴ", "v");
addReplacements("ŴƜẀẂẄẆẈⱲ", "W");
addReplacements("ŵẁẃẅẇẉẘⱳ", "w");
addReplacements("ХẊẌ", "X");
addReplacements("хᶍẋẍ", "x");
addReplacements("ÝŶŸƳȲɎӲẎỲỴỶỸ", "Y");
addReplacements("ýÿŷƴȳɏӳᶌẏẙỳỵỷỹ", "y");
addReplacements("ŹŻŽƵȤẐẒẔⱫ", "Z");
addReplacements("źżžƶȥɀʐᶎẑẓⱬ", "z");
addReplacements("²", "2");
addReplacements("–—−", "-");
addReplacements("…", "...");
// Specific configuration for CS
localAuthorizedCharacters.put(EnumWikipedia.CS, "čďěňřšťžČĎŇŘŠŤŽ");
// Specific configuration for DA
localAuthorizedCharacters.put(EnumWikipedia.DA, "ÆØÅæøå");
// Specific configuration for EL
localAuthorizedCharacters.put(EnumWikipedia.EL, "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρσςτυφχψω");
addReplacements(EnumWikipedia.EL, "ά", "α");
addReplacements(EnumWikipedia.EL, "έ", "ε");
addReplacements(EnumWikipedia.EL, "ή", "η");
addReplacements(EnumWikipedia.EL, "ί", "ι");
addReplacements(EnumWikipedia.EL, "ϊ", "ι");
addReplacements(EnumWikipedia.EL, "ό", "ο");
addReplacements(EnumWikipedia.EL, "ύ", "υ");
addReplacements(EnumWikipedia.EL, "ϋ", "υ");
addReplacements(EnumWikipedia.EL, "ώ", "ω");
addReplacements(EnumWikipedia.EL, "Ά", "Α");
addReplacements(EnumWikipedia.EL, "Έ", "Ε");
addReplacements(EnumWikipedia.EL, "Ή", "Η");
addReplacements(EnumWikipedia.EL, "Ί", "Ι");
addReplacements(EnumWikipedia.EL, "Ό", "Ο");
addReplacements(EnumWikipedia.EL, "Ύ", "Υ");
addReplacements(EnumWikipedia.EL, "Ώ", "Ω");
// Specific configuration for EN
// TODO: Correctly handle HTML entity...
// addReplacements(EnumWikipedia.EN, "&", "And");
// Specific configuration for ES
addReplacements(EnumWikipedia.ES, "Ñ", "Nzz");
addReplacements(EnumWikipedia.ES, "ñ", "nzz");
// Specific configuration for FI
localAuthorizedCharacters.put(EnumWikipedia.FI, "ÅÄÖåäö");
// Specific configuration for NN
//localAuthorizedCharacters.put(EnumWikipedia.NN, "ÆØÅæøå");
// Specific configuration for NO
localAuthorizedCharacters.put(EnumWikipedia.NO, "ÆØÅæøå");
// Specific configuration for RO
localAuthorizedCharacters.put(EnumWikipedia.RO, "ăîâşţ");
// Specific configuration for RU
localAuthorizedCharacters.put(EnumWikipedia.RU, "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯабвгдежзийклмнопрстуфхцчшщьыъэюя");
// Specific configuration for SV
localAuthorizedCharacters.put(EnumWikipedia.SV, "ÅÄÖåäö");
// Specific configuration for UK
localAuthorizedCharacters.put(EnumWikipedia.UK, "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯабвгдежзийклмнопрстуфхцчшщьыъэюяіїґ");
if (CHECK_REPLACEMENTS) {
// Compare with AWB
BufferedReader br = null;
GetMethod method = null;
try {
HttpClient httpClient = new HttpClient();
method = new GetMethod("http://sourceforge.net/p/autowikibrowser/code/HEAD/tree/AWB/WikiFunctions/Tools.cs?format=raw");
int statusCode = httpClient.executeMethod(method);
if (statusCode == HttpStatus.SC_OK) {
br = new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream(), "UTF8"));
String line = null;
boolean startDiacritics = false;
boolean endDiacritics = false;
while (!endDiacritics && ((line = br.readLine()) != null)) {
if (!startDiacritics) {
if (line.contains("Diacritics =")) {
startDiacritics = true;
}
} else {
if (line.contains("}")) {
endDiacritics = true;
} else if (line.contains("new KeyValuePair<string, string>")) {
int quote1 = line.indexOf('\"');
int quote2 = line.indexOf('\"', quote1 + 1);
int quote3 = line.indexOf('\"', quote2 + 1);
int quote4 = line.indexOf('\"', quote3 + 1);
if ((quote1 >= 0) && (quote2 >= 0) && (quote3 >= 0) && (quote4 >= 0) &&
(quote2 == quote1 + 2)) {
char awbDiacritic = line.charAt(quote1 + 1);
String awbReplacement = line.substring(quote3 + 1, quote4);
String replacement = replacements.get(awbDiacritic);
if (replacement == null) {
System.err.println("AWB replacement for " + awbDiacritic + "/" + awbReplacement + " not defined");
} else if (!replacement.equals(awbReplacement)) {
System.err.println("AWB replacement for " + awbDiacritic + "/" + awbReplacement + " different than " + replacement);
}
}
}
}
}
} else {
System.err.println("Error when accessing AWB source code: " + statusCode + " - " + HttpStatus.getStatusText(statusCode));
}
} catch (Exception e) {
System.err.println("Exception when accessing AWB source code (" + e.getClass().getName() + "): " + e.getMessage());
if (br != null) {
try {
br.close();
} catch (IOException e1) {
// Do nothing
}
}
if (method != null) {
method.releaseConnection();
}
}
}
}
/**
* Add replacements for some characters.
*
* @param characters All the characters to replace.
* @param replacement Replacement.
*/
private static void addReplacements(String characters, String replacement) {
// Check a few things : characters in double, characters order, several replacements, ...
if (CHECK_REPLACEMENTS) {
for (int i = 0; i < characters.length(); i++) {
char currentChar = characters.charAt(i);
for (int j = i + 1; j < characters.length(); j++) {
if (characters.charAt(j) == currentChar) {
System.err.println(
"Character in double for " + replacement + ": " +
currentChar + "/" + i + " = " +
characters.charAt(j) + "/" + j);
}
if (characters.charAt(j) < currentChar) {
System.err.println(
"Character " + characters.charAt(j) + "/" + j +
" for " + replacement +
" should be before " + currentChar + "/" + i);
}
}
if (replacements.containsKey(currentChar) &&
!replacement.equals(replacements.get(currentChar))) {
System.err.println("Several replacements defined for " + currentChar + ":" + replacements.get(currentChar) + "," + replacement);
}
}
}
// Add the replacement
for (int i = 0; i < characters.length(); i++) {
char currentChar = characters.charAt(i);
replacements.put(currentChar, replacement);
}
}
/**
* Add replacements for some characters for a specific Wiki.
*
* @param wiki Wiki.
* @param characters All the characters to replace.
* @param replacement Replacement.
*/
private static void addReplacements(EnumWikipedia wiki, String characters, String replacement) {
Map<Character, String> localReplacement = localReplacements.get(wiki);
if (localReplacement == null) {
localReplacement = new HashMap<Character, String>();
localReplacements.put(wiki, localReplacement);
}
for (int i = 0; i < characters.length(); i++) {
localReplacement.put(characters.charAt(i), replacement);
}
}
/**
* @param character Character tested.
* @return Flag indicating if the character is a punctuation.
*/
public static boolean isPunctuation(char character) {
if (punctuation.indexOf(character) >= 0) {
return true;
}
return false;
}
/**
* @param character Character tested.
* @param wiki Wiki.
* @return Flag indicating if the character is authorized.
*/
public static boolean isAuthorized(char character, EnumWikipedia wiki) {
if (authorizedCharacters.indexOf(character) >= 0) {
return true;
}
if (wiki == null) {
return false;
}
String local = localAuthorizedCharacters.get(wiki);
if (local == null) {
return false;
}
return (local.indexOf(character) >= 0);
}
/**
* @param character Character to be replaced.
* @param wiki Wiki.
* @return Replacement.
*/
public static String proposeReplacement(char character, EnumWikipedia wiki) {
Map<Character, String> localReplacement = localReplacements.get(wiki);
if (localReplacement != null) {
String replacement = localReplacement.get(Character.valueOf(character));
if (replacement != null) {
return replacement;
}
}
String replacement = replacements.get(Character.valueOf(character));
if (replacement != null) {
return replacement;
}
return Character.toString(character);
}
}