/* * WPCleaner: A tool to help on Wikipedia maintenance tasks. * Copyright (C) 2013 Nicolas Vervelle * * See README.txt file for licensing information. */ package org.wikipediacleaner.api.check.algorithm; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; import org.wikipediacleaner.api.check.CheckErrorResult; import org.wikipediacleaner.api.check.HtmlCharacters; import org.wikipediacleaner.api.data.PageAnalysis; import org.wikipediacleaner.api.data.PageElementTemplate; import org.wikipediacleaner.gui.swing.component.MWPane; import org.wikipediacleaner.i18n.GT; /** * Algorithm for analyzing error 16 of check wikipedia project. * Error 16: Unicode control characters */ public class CheckErrorAlgorithm016 extends CheckErrorAlgorithmBase { /** * Possible global fixes. */ private final static String[] globalFixes = new String[] { GT._("Remove all control characters"), }; public CheckErrorAlgorithm016() { super("Unicode control characters"); } /** * Analyze a page to check if errors are present. * * @param analysis Page analysis. * @param errors Errors found in the page. * @param onlyAutomatic True if analysis could be restricted to errors automatically fixed. * @return Flag indicating if the error was found. */ @Override public boolean analyze( PageAnalysis analysis, Collection<CheckErrorResult> errors, boolean onlyAutomatic) { if (analysis == null) { return false; } // Retrieve configuration boolean onlyTemplates = Boolean.valueOf(getSpecificProperty("only_templates", true, true, false)); boolean result = false; String contents = analysis.getContents(); if (onlyTemplates) { Collection<PageElementTemplate> templates = analysis.getTemplates(); if (templates == null) { return false; } int lastEnd = 0; for (PageElementTemplate template : templates) { int begin = template.getBeginIndex(); int end = template.getEndIndex(); if (begin >= lastEnd) { if (analyzeArea(analysis, contents, begin, end, errors)) { if (errors == null) { return true; } result = true; } lastEnd = end; } } } else { result = analyzeArea(analysis, contents, 0, contents.length(), errors); } return result; } /** * Analyze a page area to check if errors are present. * * @param analysis Page analysis. * @param contents Page contents. * @param beginArea Begin index of the area. * @param endArea End index of the area. * @param errors Errors found in the page. * @return Flag indicating if the error was found. */ public boolean analyzeArea( PageAnalysis analysis, String contents, int beginArea, int endArea, Collection<CheckErrorResult> errors) { boolean result = false; // Check every character int index = beginArea; while (index < endArea) { int codePoint = contents.codePointAt(index); ControlCharacter control = getControlCharacter(codePoint); if (control != null) { if (errors == null) { return true; } result = true; // Find extent of the area to highlight int begin = index; if (begin > beginArea) { begin = Math.max(index - Character.charCount(contents.codePointBefore(index)), 0); } List<Integer> controls = new ArrayList<Integer>(); controls.add(Integer.valueOf(codePoint)); int end = index + Character.charCount(codePoint); while ((end < endArea) && ((getControlCharacter(contents.codePointBefore(end)) != null) || (getControlCharacter(contents.codePointAt(end)) != null))) { Integer controlNum = Integer.valueOf(contents.codePointAt(end)); if ((controlNum != null) && (!controls.contains(controlNum))) { controls.add(controlNum); } end += Character.charCount(contents.codePointAt(end)); } // Report error CheckErrorResult errorResult = createCheckErrorResult(analysis, begin, end); for (Integer controlFound : controls) { ControlCharacter found = getControlCharacter(controlFound.intValue()); if (found != null) { errorResult.addText( Integer.toHexString(controlFound.intValue()) + " - " + GT._(found.description)); } } StringBuilder replacementB = new StringBuilder(); List<String> otherReplacements = new ArrayList<String>(); boolean unsafeCharacter = false; boolean checkUnsafe = false; int i = begin; while (i < end) { codePoint = contents.codePointAt(i); control = getControlCharacter(codePoint); if (control == null) { replacementB.appendCodePoint(codePoint); unsafeCharacter |= (automaticChars.indexOf(codePoint) < 0); } else { if (!control.removable) { int replaceBy = 0; if (control == ControlCharacter.NON_BREAKING_SPACE) { if ((i > 0) && (contents.codePointBefore(i) == '«')) { replaceBy = ' '; } int next = i + Character.charCount(codePoint); if (next < end) { int codePointAfter = contents.codePointAt(next); if ((codePointAfter == '»') || (codePointAfter == ':')) { replaceBy = ' '; } } } if (replaceBy == 0) { replacementB.appendCodePoint(codePoint); checkUnsafe = true; } else { replacementB.appendCodePoint(replaceBy); } } checkUnsafe |= !control.safe; List<String> replacements = ControlCharacter.getReplacements(codePoint); if (replacements != null) { for (String replacement : replacements) { StringBuilder otherReplacement = new StringBuilder(); int j = begin; while (j < end) { int codePointJ = contents.codePointAt(j); if ((i != j) && (codePoint != codePointJ)) { otherReplacement.appendCodePoint(codePointJ); } else { otherReplacement.append(replacement); } j += Character.charCount(codePointJ); } if (!otherReplacements.contains(otherReplacement.toString())) { otherReplacements.add(otherReplacement.toString()); } } } } i += Character.charCount(codePoint); } boolean automatic = (!unsafeCharacter || !checkUnsafe) && !analysis.getPage().isRedirect(); String original = contents.substring(begin, end); String replacement = replacementB.toString(); if (!replacement.equals(original)) { errorResult.addReplacement( replacement, GT._("Remove all control characters"), automatic); } for (String otherReplacement : otherReplacements) { if ((!automatic || replacement.equals(original)) && !otherReplacement.equals(original) && !otherReplacement.equals(replacement)) { errorResult.addReplacement(otherReplacement); } } errors.add(errorResult); index = end; } else { index += Character.charCount(codePoint); } } return result; } /** * Authorized characters for automatic replacement. */ private final static String automaticChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "áàâäåãÀ" + "éèêëÉ" + "íìîïĩ" + "óôöōŌ" + "úùûü" + "ý" + "ćč" + "ńň" + "š" + "ź" + "0123456789" + " []|(){}<>,.!?;:--–=+*#/%'\"«»\n\t"; /** * @param codePoint Code point. * @return Control character related to the code point. */ private ControlCharacter getControlCharacter(int codePoint) { ControlCharacter control = ControlCharacter.getControlCharacter(codePoint); if (control != null) { String propertyName = "use_" + Integer.toHexString(codePoint).toUpperCase(); String filter = getSpecificProperty(propertyName, true, true, false); if ((filter != null) && (Boolean.FALSE.equals(Boolean.valueOf(filter)))) { control = null; } } return control; } /** * Automatic fixing of all the errors in the page. * * @param analysis Page analysis. * @return Page contents after fix. */ @Override protected String internalAutomaticFix(PageAnalysis analysis) { return fix(globalFixes[0], analysis, null); } /** * @return List of possible global fixes. */ @Override public String[] getGlobalFixes() { return globalFixes; } /** * Fix all the errors in the page. * * @param fixName Fix name (extracted from getGlobalFixes()). * @param analysis Page analysis. * @param textPane Text pane. * @return Page contents after fix. */ @Override public String fix(String fixName, PageAnalysis analysis, MWPane textPane) { return fixUsingAutomaticReplacement(analysis); } /** * @return Map of parameters (Name -> description). * @see org.wikipediacleaner.api.check.algorithm.CheckErrorAlgorithmBase#getParameters() */ @Override public Map<String, String> getParameters() { Map<String, String> parameters = super.getParameters(); parameters.put("only_templates", GT._("To report control characters only in templates")); return parameters; } /** * Control characters characteristics. */ private enum ControlCharacter { DELETE(0x007F, 0x007F, true, false, GT._No("Delete")), NON_BREAKING_SPACE(0x00A0, 0x00A0, false, true, GT._No("Non-breaking space")), SOFT_HYPHEN(0x00AD, 0x00AD, true, false, GT._No("Soft hyphen")), THREE_PER_EM_SPACE(0x2004, 0x2004, true, false, GT._No("Thee-per-em space")), FOUR_PER_EM_SPACE(0x2005, 0x2005, true, false, GT._No("Four-per-em space")), SIX_PER_EM_SPACE(0x2006, 0x2006, true, false, GT._No("Six-per-em space")), FIGURE_SPACE(0x2007, 0x2007, true, false, GT._No("Figure space")), PUNCTUATION_SPACE(0x2008, 0x2008, true, false, GT._No("Punctuation space")), ZERO_WIDTH_SPACE(0x200B, 0x200B, true, false, GT._No("Zero-width space")), LEFT_TO_RIGHT_MARK(0x200E, 0x200E, true, false, GT._No("Left-to-right mark")), LINE_SEPARATOR(0x2028, 0x2028, true, false, GT._No("Line separator")), LEFT_TO_RIGHT_EMBEDDING(0x202A, 0x202A, true, false, GT._No("Left-to-right embedding")), POP_DIRECTIONAL_FORMATTING(0x202C, 0x202C, true, false, GT._No("Pop directional formatting")), BYTE_ORDER_MARK(0xFEFF, 0xFEFF, true, false, GT._No("Byte order mark")), OBJECT_REPLACEMENT_CHARACTER(0xFFFC, 0xFFFC, true, false, GT._No("Object replacement character")), PUA(0xE000, 0xF8FF, false, false, GT._No("Private use area")), PUA_A(0XF0000, 0xFFFFD, false, false, GT._No("Private use area A")), PUA_B(0x100000, 0x10FFFD, false, false, GT._No("Private use area B")); public final int begin; public final int end; public final boolean removable; public final boolean safe; public final String description; /** * @param begin Begin of the range of control characters. * @param end End of the range of control characters. * @param removable True if the control character can be removed. * @param safe True if removing the control character is safe. * @param description Description of the control character. */ private ControlCharacter( int begin, int end, boolean removable, boolean safe, String description) { this.begin = begin; this.end = end; this.removable = removable; this.safe = safe; this.description = description; } /** * @param codePoint Code point. * @return Control character for the given code point. */ public static ControlCharacter getControlCharacter(int codePoint) { for (ControlCharacter control : values()) { if ((codePoint >= control.begin) && (codePoint <= control.end)) { return control; } } return null; } /** * @param codePoint Character. * @param control Control characters set. * @return True if the character is in the control character set. */ public static boolean isIncluded(int codePoint, ControlCharacter control) { if (control == null) { return false; } if ((codePoint >= control.begin) && (codePoint <= control.end)) { return true; } return false; } /** * @param codePoint Code point. * @return Potential replacement. */ public static List<String> getReplacements(int codePoint) { List<String> replacements = null; // TODO: Test replacing left to right mark with HTML character /*if (codePoint == HtmlCharacters.LEFT_TO_RIGHT_MARK.getValue()) { replacements = Collections.singletonList(HtmlCharacters.LEFT_TO_RIGHT_MARK.getFullEntity()); }*/ if (codePoint == HtmlCharacters.SYMBOL_NON_BREAKING_SPACE.getValue()) { replacements = new ArrayList<>(); replacements.add(" "); replacements.add(HtmlCharacters.SYMBOL_NON_BREAKING_SPACE.getFullEntity()); } if (codePoint == HtmlCharacters.SYMBOL_SOFT_HYPHEN.getValue()) { replacements = new ArrayList<>(); replacements.add(HtmlCharacters.SYMBOL_SOFT_HYPHEN.getFullEntity()); replacements.add("-"); replacements.add(""); } if (isIncluded(codePoint, THREE_PER_EM_SPACE) || isIncluded(codePoint, FOUR_PER_EM_SPACE) || isIncluded(codePoint, SIX_PER_EM_SPACE) || isIncluded(codePoint, FIGURE_SPACE) || isIncluded(codePoint, PUNCTUATION_SPACE)) { replacements = new ArrayList<>(); replacements.add(" "); replacements.add(""); } if (codePoint == 0xF0FC) { replacements = new ArrayList<>(); replacements.add("*"); replacements.add("✓"); } return replacements; } } }