/* * WPCleaner: A tool to help on Wikipedia maintenance tasks. * Copyright (C) 2013 Nicolas Vervelle * * See README.txt file for licensing information. */ package org.wikipediacleaner.api.check.algorithm; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; import org.wikipediacleaner.api.check.CheckErrorResult; import org.wikipediacleaner.api.check.HtmlCharacters; import org.wikipediacleaner.api.check.SpecialCharacters; import org.wikipediacleaner.api.constants.WPCConfiguration; import org.wikipediacleaner.api.constants.WPCConfigurationStringList; import org.wikipediacleaner.api.data.PageAnalysis; import org.wikipediacleaner.api.data.PageElementComment; import org.wikipediacleaner.api.data.PageElementTag; import org.wikipediacleaner.i18n.GT; /** * Algorithm for analyzing error 67 of check wikipedia project. * Error 67: Reference after punctuation. */ public class CheckErrorAlgorithm067 extends CheckErrorAlgorithmBase { public CheckErrorAlgorithm067() { super("Reference after punctuation"); } /** * Analyze a page to check if errors are present. * * @param analysis Page analysis. * @param errors Errors found in the page. * @param onlyAutomatic True if analysis could be restricted to errors automatically fixed. * @return Flag indicating if the error was found. */ @Override public boolean analyze( PageAnalysis analysis, Collection<CheckErrorResult> errors, boolean onlyAutomatic) { if (analysis == null) { return false; } // Retrieve possible abbreviations before <ref> tag String abbreviations = getSpecificProperty( "abbreviations", true, false, false); List<String> abbreviationsList = null; if (abbreviations != null) { abbreviationsList = WPCConfiguration.convertPropertyToStringList(abbreviations); } WPCConfiguration config = analysis.getWPCConfiguration(); List<String[]> generalAbbreviations = null; List<String> tmpGeneralAbbreviations = config.getStringList(WPCConfigurationStringList.ABBREVIATIONS); if ((tmpGeneralAbbreviations != null) && (tmpGeneralAbbreviations.size() > 0)) { generalAbbreviations = new ArrayList<String[]>(); for (String tmp : tmpGeneralAbbreviations) { int pipeIndex1 = tmp.indexOf('|'); if (pipeIndex1 > 0) { int pipeIndex2 = tmp.indexOf('|', pipeIndex1 + 1); if (pipeIndex2 > 0) { String[] abbreviation = new String[3]; abbreviation[0] = tmp.substring(0, pipeIndex1).trim(); abbreviation[1] = tmp.substring(pipeIndex1 + 1, pipeIndex2).trim(); abbreviation[2] = tmp.substring(pipeIndex2 + 1).trim(); generalAbbreviations.add(abbreviation); } } } } // Retrieve separator between several <ref> tags String separator = getSpecificProperty( "separator", true, false, false); if (separator == null) { separator = ""; } // Analyze from the beginning List<PageElementTag> tags = analysis.getTags(PageElementTag.TAG_WIKI_REF); if (tags == null) { return false; } boolean result = false; String contents = analysis.getContents(); int tagIndex = 0; int maxTags = tags.size(); while (tagIndex < maxTags) { // Group tags separated only by punctuation characters int firstTagIndex = tagIndex; PageElementTag firstTag = tags.get(firstTagIndex); int lastTagIndex = PageElementTag.groupTags(tags, firstTagIndex, contents, ",;.\'", separator); PageElementTag lastTag = tags.get(lastTagIndex); tagIndex = lastTagIndex + 1; // Check if previous character is a punctuation int tmpIndex = firstTag.getBeginIndex() - 1; String previousComment = ""; boolean punctuationFoundBefore = false; boolean punctuationFoundBetween = false; char punctuation = ' '; if (firstTag.isComplete() || !firstTag.isEndTag()) { for (int currentTagIndex = firstTagIndex; currentTagIndex <= lastTagIndex; currentTagIndex++) { PageElementTag currentTag = tags.get(currentTagIndex); if ((currentTagIndex == firstTagIndex) || (currentTag.isFullTag()) || (!currentTag.isEndTag())) { int testIndex = currentTag.getBeginIndex() - 1; boolean done = false; while (!done) { done = true; if (testIndex >= 0) { char testChar = contents.charAt(testIndex); if (Character.isWhitespace(testChar)) { testIndex--; done = false; } else if ((testChar == '>')) { PageElementComment comment = analysis.isInComment(testIndex); if ((comment != null) && (comment.getEndIndex() == testIndex + 1)) { if (currentTagIndex == firstTagIndex) { previousComment += contents.substring(comment.getBeginIndex(), comment.getEndIndex()); } testIndex = comment.getBeginIndex() - 1; done = false; } } } } if (currentTagIndex == firstTagIndex) { tmpIndex = testIndex; } if (testIndex >= 0) { char currentPunctuation = contents.charAt(testIndex); if (SpecialCharacters.isPunctuation(currentPunctuation)) { boolean punctuationFound = true; if (punctuation == ';') { int punctuationIndex = testIndex; testIndex--; while((testIndex >= 0) && (Character.isLetterOrDigit(contents.charAt(testIndex)))) { testIndex--; } if ((testIndex >= 0) && (contents.charAt(testIndex) == '&')) { String name = contents.substring(testIndex + 1, punctuationIndex); for (HtmlCharacters htmlCharacter : HtmlCharacters.values()) { if (name.equals(htmlCharacter.getName())) { punctuationFound = false; } } } } if (punctuationFound) { if (currentTagIndex == firstTagIndex) { punctuationFoundBefore = true; punctuation = currentPunctuation; } else { punctuationFoundBetween = true; } } } } } } } int beginIndex = tmpIndex; // Check for possible abbreviations before punctuation boolean abbreviationFound = false; if ((punctuationFoundBefore && (abbreviationsList != null))) { for (String abbreviation : abbreviationsList) { if (abbreviation != null) { if (contents.startsWith(abbreviation, tmpIndex - abbreviation.length() + 1)) { abbreviationFound = true; } } } } // Punctuation found if ((punctuationFoundBefore && !abbreviationFound) || punctuationFoundBetween) { if (errors == null) { return true; } result = true; // Construct list of tags String replace = PageElementTag.createListOfTags( tags, firstTagIndex, lastTagIndex, contents, separator); String textReplace = PageElementTag.createReducedListOfTags( tags, firstTagIndex, lastTagIndex, separator); if (punctuationFoundBefore && !abbreviationFound) { // Search for general abbreviations int beginRefIndex = firstTag.getBeginIndex(); int firstAbbreviationIndex = beginRefIndex; List<String[]> generalAbbreviationFound = new ArrayList<String[]>(); if ((punctuationFoundBefore && (generalAbbreviations != null))) { for (String[] abbreviation : generalAbbreviations) { if ((abbreviation != null) && (abbreviation.length > 2) && (abbreviation[0] != null)) { String abbreviationText = abbreviation[0]; int abbreviationStart = tmpIndex - abbreviationText.length() + 1; if (contents.startsWith(abbreviationText, abbreviationStart)) { generalAbbreviationFound.add(abbreviation); firstAbbreviationIndex = Math.min(firstAbbreviationIndex, abbreviationStart); } } } } // Check if the punctuation before is multiple int lastPunctuationIndex = tmpIndex; while ((tmpIndex >= 0) && (contents.charAt(tmpIndex) == punctuation)) { tmpIndex--; } tmpIndex++; beginIndex = tmpIndex; String allPunctuations = contents.substring(tmpIndex, lastPunctuationIndex + 1); while ((tmpIndex > 0) && (contents.charAt(tmpIndex - 1) == ' ')) { tmpIndex--; } String moveablePrefix = contents.substring(tmpIndex, beginIndex); String prefix = ""; if (firstAbbreviationIndex < tmpIndex) { prefix = contents.substring(firstAbbreviationIndex, tmpIndex); } beginIndex = Math.min(tmpIndex, firstAbbreviationIndex); // Check for possible punctuation after tags tmpIndex = lastTag.getEndIndex(); int endIndex = tmpIndex; while ((tmpIndex < contents.length()) && (contents.charAt(tmpIndex) == ' ')) { tmpIndex++; } boolean punctuationFoundAfter = false; int punctuationAfterIndex = tmpIndex; while ((tmpIndex < contents.length()) && SpecialCharacters.isPunctuation(contents.charAt(tmpIndex))) { punctuationFoundAfter = true; tmpIndex++; } String punctuationAfter = contents.substring(punctuationAfterIndex, tmpIndex); if (punctuationFoundAfter) { endIndex = tmpIndex; } // Create error CheckErrorResult errorResult = createCheckErrorResult( analysis, beginIndex, endIndex); boolean automatic = false; if (allPunctuations.equals(".") && !punctuationFoundAfter) { tmpIndex = endIndex; while ((tmpIndex < contents.length()) && (contents.charAt(tmpIndex) == ' ')) { tmpIndex++; } if (contents.startsWith("\n\n", tmpIndex) || contents.startsWith("\n*", tmpIndex)) { if (previousComment.isEmpty()) { automatic = true; } } } for (String[] generalAbbreviation : generalAbbreviationFound) { if ((generalAbbreviation.length > 2)) { String abbreviation = generalAbbreviation[2]; String meaning = ""; if (generalAbbreviation[1].length() > 0) { meaning = " (" + generalAbbreviation[1] + ")"; } errorResult.addReplacement( abbreviation + previousComment + replace + punctuationAfter, abbreviation + textReplace + punctuationAfter + meaning); } } errorResult.addReplacement( prefix + previousComment + replace + moveablePrefix + allPunctuations, prefix + textReplace + moveablePrefix + allPunctuations, automatic); if (punctuationFoundAfter && !allPunctuations.equals(punctuationAfter)) { errorResult.addReplacement( prefix + previousComment + replace + moveablePrefix + punctuationAfter, prefix + textReplace + moveablePrefix + punctuationAfter); } errors.add(errorResult); } else { // Create error CheckErrorResult errorResult = createCheckErrorResult( analysis, firstTag.getBeginIndex(), lastTag.getEndIndex()); errorResult.addReplacement(replace, textReplace); errors.add(errorResult); } } } return result; } /** * Return the parameters used to configure the algorithm. * * @return Map of parameters (Name -> description). */ @Override public Map<String, String> getParameters() { Map<String, String> parameters = super.getParameters(); parameters.put( "abbreviations", GT._("A list of abbreviations that generate false positives when placed before {0}", "<ref>")); parameters.put( "separator", GT._("Used as a separator between consecutive {0} tags", "<ref>")); return parameters; } /** * Automatic fixing of all the errors in the page. * * @param analysis Page analysis. * @return Page contents after fix. */ @Override protected String internalAutomaticFix(PageAnalysis analysis) { return fixUsingAutomaticReplacement(analysis); } }