/* * WPCleaner: A tool to help on Wikipedia maintenance tasks. * Copyright (C) 2013 Nicolas Vervelle * * See README.txt file for licensing information. */ package org.wikipediacleaner.api.check.algorithm; import java.util.Collection; import java.util.List; import org.wikipediacleaner.api.check.CheckErrorResult; import org.wikipediacleaner.api.check.HtmlCharacters; import org.wikipediacleaner.api.check.CheckErrorResult.ErrorLevel; import org.wikipediacleaner.api.data.PageAnalysis; import org.wikipediacleaner.api.data.PageElementExternalLink; import org.wikipediacleaner.api.data.PageElementTag; import org.wikipediacleaner.gui.swing.component.MWPane; import org.wikipediacleaner.i18n.GT; /** * Algorithm for analyzing errors based on HTML named entities. */ public abstract class CheckErrorAlgorithmHtmlNamedEntities extends CheckErrorAlgorithmBase { /** * Possible global fixes. */ private final static String[] globalFixes = new String[] { GT._("Replace all"), }; /** * @param name Name of the error. */ public CheckErrorAlgorithmHtmlNamedEntities(String name) { super(name); } /** * @return List of HTML characters managed by this error. */ protected abstract List<HtmlCharacters> getHtmlCharacters(); /** * Analyze a page to check if errors are present. * * @param analysis Page analysis. * @param errors Errors found in the page. * @param onlyAutomatic True if analysis could be restricted to errors automatically fixed. * @return Flag indicating if the error was found. */ @Override public boolean analyze( PageAnalysis analysis, Collection<CheckErrorResult> errors, boolean onlyAutomatic) { if (analysis == null) { return false; } // Analyzing the text from the beginning boolean result = false; String contents = analysis.getContents(); int ampersandIndex = contents.indexOf('&'); int maxLength = contents.length(); while ((ampersandIndex >= 0) && (ampersandIndex + 2 < maxLength)) { // Check if we should look for a match at this position boolean shouldMatch = true; if (shouldMatch && ((analysis.isInComment(ampersandIndex) != null) || (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_SOURCE, ampersandIndex) != null) || (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_SYNTAXHIGHLIGHT, ampersandIndex) != null))) { shouldMatch = false; } if (shouldMatch) { PageElementExternalLink link = analysis.isInExternalLink(ampersandIndex); if (link != null) { int offset = link.getTextOffset(); if ((offset < 0) || (ampersandIndex < link.getBeginIndex() + offset)) { shouldMatch = false; } } } if (shouldMatch) { for (HtmlCharacters htmlCharacter : getHtmlCharacters()) { String name = htmlCharacter.getName(); if ((name != null) && contents.startsWith(name, ampersandIndex + 1) && htmlCharacter.shouldReplaceName()) { ErrorLevel errorLevel = ErrorLevel.ERROR; // Analyze semicolon after the name int colonIndex = ampersandIndex + name.length() + 1; boolean found = false; if (useSemiColon()) { if ((colonIndex < maxLength) && (contents.charAt(colonIndex) == ';')) { found = true; } } else { if ((colonIndex >= maxLength) || (contents.charAt(colonIndex) != ';')) { if (Character.isLetterOrDigit(contents.charAt(colonIndex))) { errorLevel = ErrorLevel.WARNING; } found = true; colonIndex--; } } // Report error if (found) { if (errors == null) { return true; } result = true; // Analyze for possible semicolon afterwards int endIndex = colonIndex + 1; if (!useSemiColon()) { int tmpIndex = endIndex; while ((tmpIndex < contents.length()) && (contents.charAt(tmpIndex) == ' ')) { tmpIndex++; } if (contents.charAt(tmpIndex) == ';') { endIndex = tmpIndex + 1; } } CheckErrorResult errorResult = createCheckErrorResult( analysis, ampersandIndex, endIndex, errorLevel); errorResult.addReplacement("" + htmlCharacter.getValue()); if (endIndex > colonIndex + 1) { errorResult.addReplacement( "" + htmlCharacter.getValue() + contents.substring(colonIndex + 1, endIndex)); } if (!useSemiColon()) { errorResult.addReplacement( "&" + contents.substring(ampersandIndex + 1, endIndex)); } errors.add(errorResult); } } } } ampersandIndex = contents.indexOf('&', ampersandIndex + 1); } return result; } /** * @return True if full HTML named entities should be searched. */ protected boolean useSemiColon() { return true; } /** * Bot fixing of all the errors in the page. * * @param analysis Page analysis. * @return Page contents after fix. */ @Override protected String internalBotFix(PageAnalysis analysis) { return fix(globalFixes[0], analysis, null); } /** * @return List of possible global fixes. */ @Override public String[] getGlobalFixes() { return globalFixes; } /** * Fix all the errors in the page. * * @param fixName Fix name (extracted from getGlobalFixes()). * @param analysis Page analysis. * @param textPane Text pane. * @return Page contents after fix. */ @Override public String fix(String fixName, PageAnalysis analysis, MWPane textPane) { return fixUsingFirstReplacement(fixName, analysis); } }