/*
 *  WPCleaner: A tool to help on Wikipedia maintenance tasks.
 *  Copyright (C) 2013  Nicolas Vervelle
 *
 *  See README.txt file for licensing information.
 */

package org.wikipediacleaner.api.data;

import java.util.ArrayList;
import java.util.List;

import org.wikipediacleaner.api.constants.WPCConfiguration;
import org.wikipediacleaner.api.constants.WPCConfigurationStringList;
import org.wikipediacleaner.api.data.PageElementTemplate.Parameter;


/**
 * Class containing information about a RFC (Requests for Comments).
 *
 * Instances are created by {@link #analyzePage(PageAnalysis)}, which looks for RFC
 * numbers in configured templates, in template parameters named "RFC" (optionally
 * followed by digits) and in plain text following a "RFC" prefix.
 */
public class PageElementRFC extends PageElement {

  /** RFC prefix */
  private final static String RFC_PREFIX = "RFC";

  /** RFC incorrect prefixes */
  private final static String[] RFC_INCORRECT_PREFIX = {
    "rfc"
  };

  /** RFC possible meaningful characters */
  private final static String POSSIBLE_CHARACTERS = "0123456789";

  /** RFC possible extraneous characters */
  private final static String EXTRA_CHARACTERS = "";

  /** RFC incorrect characters */
  private final static String INCORRECT_CHARACTERS = "- :‐\t—=–\n";

  /** RFC incorrect characters at the beginning */
  private final static String INCORRECT_BEGIN_CHARACTERS = "- :‐\t—=–\n";

  /** Name of the template parameter used when the configuration doesn't provide one */
  private final static String DEFAULT_PARAMETER_NAME = "1";

  /**
   * Find all the RFC in a page.
   *
   * The search order matters: elements found by an earlier step prevent the
   * plain text search from reporting the same location a second time.
   *
   * @param analysis Page analysis.
   * @return List of RFC.
   */
  public static List<PageElementRFC> analyzePage(
      PageAnalysis analysis) {
    List<PageElementRFC> rfcs = new ArrayList<PageElementRFC>();

    // Search for RFC templates
    WPCConfiguration config = analysis.getWPCConfiguration();
    analyzeConfiguredTemplates(
        analysis, rfcs,
        config.getStringArrayList(WPCConfigurationStringList.RFC_TEMPLATES),
        false);

    // Search for RFC templates where help is requested
    analyzeConfiguredTemplates(
        analysis, rfcs,
        config.getStringArrayList(WPCConfigurationStringList.RFC_HELP_NEEDED_TEMPLATES),
        true);

    // Search for RFC in template parameters
    List<PageElementTemplate> templates = analysis.getTemplates();
    if (templates != null) {
      for (PageElementTemplate template : templates) {
        analyzeTemplateParams(analysis, rfcs, template, "RFC", true, true, true, false);
      }
    }

    // Search for RFC in plain texts
    analyzePlainText(analysis, rfcs, RFC_PREFIX, true, true);
    for (String prefix : RFC_INCORRECT_PREFIX) {
      analyzePlainText(analysis, rfcs, prefix, false, false);
    }

    return rfcs;
  }

  /**
   * Analyze the templates listed in a configuration entry.
   *
   * Each entry is an array whose first element is the template name and whose
   * optional second element is the name of the parameter holding the RFC.
   * When the parameter name is missing or empty, the first unnamed parameter
   * ("1") is used.
   *
   * @param analysis Page analysis.
   * @param rfcs Current list of RFC.
   * @param rfcTemplates Configured templates (may be null).
   * @param helpRequested True if the templates are requests for help.
   */
  private static void analyzeConfiguredTemplates(
      PageAnalysis analysis, List<PageElementRFC> rfcs,
      List<String[]> rfcTemplates, boolean helpRequested) {
    if (rfcTemplates == null) {
      return;
    }
    for (String[] rfcTemplate : rfcTemplates) {
      if ((rfcTemplate == null) || (rfcTemplate.length == 0)) {
        continue;
      }
      List<PageElementTemplate> templates = analysis.getTemplates(rfcTemplate[0]);
      if (templates == null) {
        continue;
      }
      String argumentName = DEFAULT_PARAMETER_NAME;
      if ((rfcTemplate.length > 1) &&
          (rfcTemplate[1] != null) &&
          (rfcTemplate[1].length() > 0)) {
        argumentName = rfcTemplate[1];
      }
      for (PageElementTemplate template : templates) {
        analyzeTemplateParams(
            analysis, rfcs, template, argumentName,
            false, false, false, helpRequested);
      }
    }
  }

  /**
   * @param index Current index.
   * @param rfcs List of RFC.
   * @return True if the current index is already in a RFC.
   */
  private static boolean isInRFC(int index, List<PageElementRFC> rfcs) {
    if (rfcs != null) {
      for (PageElementRFC tmpRfc : rfcs) {
        if ((tmpRfc.getBeginIndex() <= index) &&
            (tmpRfc.getEndIndex() > index)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * @param analysis Page analysis.
   * @param rfcs Current list of RFC.
   * @param index Current index.
   * @return True if a RFC prefix found at this index should be ignored:
   *         inside a comment, a tag, a nowiki/pre/source/syntaxhighlight area
   *         or an already detected RFC.
   */
  private static boolean isIgnoredLocation(
      PageAnalysis analysis, List<PageElementRFC> rfcs, int index) {
    if (analysis.isInComment(index) != null) {
      return true;
    }
    if (analysis.isInTag(index) != null) {
      return true;
    }
    if ((analysis.getSurroundingTag(PageElementTag.TAG_WIKI_NOWIKI, index) != null) ||
        (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_PRE, index) != null) ||
        (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_SOURCE, index) != null) ||
        (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_SYNTAXHIGHLIGHT, index) != null)) {
      return true;
    }
    return isInRFC(index, rfcs);
  }

  /**
   * Analyze plain text for RFC.
   *
   * @param analysis Page analysis.
   * @param rfcs Current list of RFC.
   * @param prefix RFC prefix.
   * @param correct True if RFC should be considered correct by default.
   * @param caseSensitive True if RFC prefix is case sensitive.
   */
  private static void analyzePlainText(
      PageAnalysis analysis, List<PageElementRFC> rfcs,
      String prefix, boolean correct, boolean caseSensitive) {
    String contents = analysis.getContents();
    if ((contents == null) || (prefix == null)) {
      return;
    }
    int prefixLength = prefix.length();
    if (prefixLength == 0) {
      // An empty prefix matches everywhere without consuming any character,
      // which could prevent the scan from making progress
      return;
    }
    int contentsLength = contents.length();
    int index = 0;
    int maxIndex = contentsLength - prefixLength;
    while (index < maxIndex) {

      // Check if it's a potential RFC
      if (!contents.regionMatches(!caseSensitive, index, prefix, 0, prefixLength) ||
          isIgnoredLocation(analysis, rfcs, index)) {
        index++;
        continue;
      }

      // Ignore prefix found in a template name
      PageElementTemplate template = analysis.isInTemplate(index);
      if ((template != null) &&
          ((template.getParameterCount() == 0) ||
           (index < template.getParameterPipeIndex(0)))) {
        index++;
        continue;
      }

      // Ignore prefix found in an image name
      PageElementImage image = analysis.isInImage(index);
      if ((image != null) &&
          (index < image.getBeginIndex() + image.getFirstPipeOffset())) {
        index++;
        continue;
      }

      // A RFC in the target of an external link is not in a valid location
      boolean isValid = true;
      PageElementExternalLink link = analysis.isInExternalLink(index);
      if (link != null) {
        if (!link.hasSquare() ||
            (index < link.getBeginIndex() + link.getTextOffset()) ||
            (link.getText() == null)) {
          isValid = false;
        }
      }

      // Check if it's a template parameter name
      boolean parameter = false;
      if (template != null) {
        for (int paramNum = 0; paramNum < template.getParameterCount(); paramNum++) {
          if ((template.getParameterPipeIndex(paramNum) < index) &&
              (template.getParameterValueStartIndex(paramNum) > index)) {
            parameter = true;
          }
        }
      }

      int beginIndex = index;
      index += prefixLength;
      if (parameter) {
        // Parameter names are handled by the template parameters analysis
        continue;
      }

      // Prefix written as an internal link: [[RFC]] 1234
      boolean isCorrect = correct;
      if ((beginIndex >= 2) && (index + 2 < contentsLength)) {
        if (contents.startsWith("[[", beginIndex - 2) &&
            contents.startsWith("]]", index)) {
          isCorrect = false;
          beginIndex -= 2;
          index += 2;
        }
      }

      // Separator between the prefix and the number
      boolean spaceFound = false;
      if (analysis.isInComment(index) == null) {
        while ((index < contentsLength) &&
               (" \u00A0".indexOf(contents.charAt(index)) >= 0)) {
          index++;
          spaceFound = true;
        }
        while ((index < contentsLength) &&
               (INCORRECT_BEGIN_CHARACTERS.indexOf(contents.charAt(index)) >= 0)) {
          index++;
          isCorrect = false;
        }
      }

      // Number itself
      int beginNumber = -1;
      int endNumber = beginNumber;
      boolean finished = false;
      isCorrect &= spaceFound;
      // Correctness that will apply if another digit is found later:
      // incorrect characters only matter when they are followed by a digit
      boolean nextCorrect = isCorrect;
      while (!finished && (index < contentsLength)) {
        char currentChar = contents.charAt(index);
        if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
          if (beginNumber < 0) {
            beginNumber = index;
          }
          endNumber = index + 1;
          index++;
          isCorrect = nextCorrect;
        } else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
          if (beginNumber < 0) {
            nextCorrect = false;
          }
          index++;
        } else if (INCORRECT_CHARACTERS.indexOf(currentChar) >= 0) {
          index++;
          nextCorrect = false;
        } else {
          // A letter directly after the last digit makes the number suspicious
          if ((endNumber == index) && (Character.isLetter(currentChar))) {
            isCorrect = false;
          }
          finished = true;
        }
      }
      if (endNumber > beginNumber) {
        String number = contents.substring(beginNumber, endNumber);
        rfcs.add(new PageElementRFC(
            beginIndex, endNumber, number,
            isValid, isCorrect, false, false));
        index = endNumber;
      }
    }
  }

  /**
   * @param expected Expected name.
   * @param actual Actual name.
   * @param ignoreCase True if names should be compared ignoring case.
   * @return True if both names are the same.
   */
  private static boolean isSameName(
      String expected, String actual, boolean ignoreCase) {
    return ignoreCase ? expected.equalsIgnoreCase(actual) : expected.equals(actual);
  }

  /**
   * @param argumentName Expected parameter name.
   * @param paramName Actual parameter name.
   * @param ignoreCase True if names should be compared ignoring case.
   * @param acceptNumbers True if digits are accepted after the expected name.
   * @return True if the actual parameter name matches the expected one.
   */
  private static boolean isMatchingParameterName(
      String argumentName, String paramName,
      boolean ignoreCase, boolean acceptNumbers) {
    if (isSameName(argumentName, paramName, ignoreCase)) {
      return true;
    }
    if (!acceptNumbers || (paramName.length() <= argumentName.length())) {
      return false;
    }

    // The beginning of the name must match, the rest must be digits
    String shortParamName = paramName.substring(0, argumentName.length());
    if (!isSameName(argumentName, shortParamName, ignoreCase)) {
      return false;
    }
    for (int i = argumentName.length(); i < paramName.length(); i++) {
      if (!Character.isDigit(paramName.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Check if template parameter is a RFC.
   *
   * @param analysis Page analysis.
   * @param rfcs Current list of RFC.
   * @param template Template.
   * @param argumentName Template parameter name.
   * @param ignoreCase True if parameter name should compared ignoring case.
   * @param acceptNumbers True if numbers are accepted after parameter name.
   * @param acceptAllValues True if all values are accepted, even if not compatible with RFC.
   * @param helpRequested True if help has been requested for this RFC.
   */
  private static void analyzeTemplateParams(
      PageAnalysis analysis, List<PageElementRFC> rfcs,
      PageElementTemplate template,
      String argumentName,
      boolean ignoreCase, boolean acceptNumbers,
      boolean acceptAllValues, boolean helpRequested) {

    // Name given to the next unnamed parameter
    int paramDefaultName = 1;
    for (int paramNum = 0; paramNum < template.getParameterCount(); paramNum++) {

      // Check parameter name
      Parameter param = template.getParameter(paramNum);
      String paramName = param.getComputedName();
      if ((paramName == null) || (paramName.trim().length() == 0)) {
        paramName = Integer.toString(paramDefaultName);
        paramDefaultName++;
      }
      if (!isMatchingParameterName(argumentName, paramName, ignoreCase, acceptNumbers)) {
        continue;
      }

      // Scan the value: offsets are relative to the beginning of the value
      String paramValue = param.getStrippedValue();
      boolean ok = true;
      boolean hasDigit = false;
      int i = 0;
      int beginIndex = -1;
      int endIndex = -1;
      boolean correct = true;
      while (ok && (i < paramValue.length())) {
        char currentChar = paramValue.charAt(i);
        if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
          if (beginIndex < 0) {
            beginIndex = i;
          }
          endIndex = i + 1;
          hasDigit = true;
          i++;
        } else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
          i++;
        } else if (INCORRECT_CHARACTERS.indexOf(currentChar) >= 0) {
          i++;
          correct = false;
        } else {
          ok = false;
        }
      }

      // Convert offsets into positions in the page
      // NOTE(review): assumes the stripped value maps one-to-one onto the page
      // contents starting at the value start index, as the substring below does
      int delta = template.getParameterValueStartIndex(paramNum);
      if (beginIndex < 0) {
        beginIndex = 0;
      }
      beginIndex += delta;
      if (endIndex < 0) {
        endIndex = 0;
      }
      endIndex += delta;
      if (beginIndex < 0) {
        ok = false;
      } else {
        // Accept a number followed only by comments and whitespace.
        // When ok is false, the scan stopped on the character at offset i.
        if (!ok && hasDigit && (paramValue.charAt(i) == '<')) {
          PageElementComment comment = analysis.isInComment(delta + i);
          if ((comment != null) &&
              (comment.getBeginIndex() == delta + i)) {
            ok = true;
            i += comment.getEndIndex() - comment.getBeginIndex();
            while (ok && (i < paramValue.length())) {
              char currentChar = paramValue.charAt(i);
              if (currentChar == '<') {
                comment = analysis.isInComment(delta + i);
                if (comment != null) {
                  i += comment.getEndIndex() - comment.getBeginIndex();
                } else {
                  ok = false;
                }
              } else if ((currentChar != ' ') && (currentChar != '\n')) {
                ok = false;
              } else {
                i++;
              }
            }
          }
        }
      }

      if (paramValue.length() == 0) {
        continue;
      }
      if (ok) {
        String value = analysis.getContents().substring(beginIndex, endIndex);
        rfcs.add(new PageElementRFC(
            beginIndex, endIndex, value, true, correct, helpRequested, true));
      } else if (acceptAllValues) {
        rfcs.add(new PageElementRFC(
            delta, delta + paramValue.length(),
            paramValue, true, false, false, true));
      }
    }
  }

  /**
   * RFC not trimmed.
   */
  private final String rfcNotTrimmed;

  /**
   * RFC (trimmed).
   */
  private final String rfc;

  /**
   * True if RFC is in a valid location.
   */
  private final boolean isValid;

  /**
   * True if RFC syntax is correct.
   */
  private final boolean isCorrect;

  /**
   * True if RFC is a template parameter (RFC=...)
   */
  private final boolean isTemplateParameter;

  /**
   * True if help has been requested for this RFC
   */
  private final boolean helpRequested;

  /**
   * @param beginIndex Begin index.
   * @param endIndex End index.
   * @param rfc RFC.
   * @param isValid True if RFC is in a valid location.
   * @param isCorrect True if RFC syntax is correct.
   * @param helpRequested True if help has been requested for this RFC.
   * @param isTemplateParameter True if RFC is a template parameter.
   */
  private PageElementRFC(
      int beginIndex, int endIndex, String rfc,
      boolean isValid, boolean isCorrect,
      boolean helpRequested, boolean isTemplateParameter) {
    super(beginIndex, endIndex);
    this.rfcNotTrimmed = rfc;
    this.rfc = cleanRFC(rfc);
    this.isValid = isValid;
    this.isCorrect = isCorrect;
    this.helpRequested = helpRequested;
    this.isTemplateParameter = isTemplateParameter;
  }

  /**
   * @return RFC not trimmed.
   */
  public String getRFCNotTrimmed() {
    return rfcNotTrimmed;
  }

  /**
   * @return RFC (trimmed).
   */
  public String getRFC() {
    return rfc;
  }

  /**
   * @return True if RFC is in a valid location.
   */
  public boolean isValid() {
    return isValid;
  }

  /**
   * @return True if RFC syntax is correct.
   */
  public boolean isCorrect() {
    return isCorrect;
  }

  /**
   * @return True if help has been requested for this RFC.
   */
  public boolean helpRequested() {
    return helpRequested;
  }

  /**
   * @return True if RFC is a template parameter.
   */
  public boolean isTemplateParameter() {
    return isTemplateParameter;
  }

  /**
   * Build the possible replacements for this RFC.
   *
   * The "RFC " prefix is added unless the RFC is a template parameter value.
   *
   * @return List of possible RFC.
   */
  public List<String> getCorrectRFC() {
    List<String> result = new ArrayList<String>();
    String prefix = isTemplateParameter ? "" : "RFC ";

    // Construct a basic RFC number
    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < rfcNotTrimmed.length(); i++) {
      char currentChar = rfcNotTrimmed.charAt(i);
      if ((POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) ||
          (EXTRA_CHARACTERS.indexOf(currentChar) >= 0)) {
        buffer.append(currentChar);
      } else if ((currentChar == '‐') ||
                 (currentChar == '.')) {
        buffer.append("-");
      } else if (currentChar == '\t') {
        buffer.append(" ");
      } else {
        buffer.append(currentChar);
      }
    }
    String cleanedRFC = buffer.toString().trim();

    // Basic replacement
    result.add(prefix + cleanedRFC);

    return result;
  }

  /**
   * Build the text of a template requesting help for this RFC.
   *
   * The template description is an array: template name, name of the parameter
   * for the RFC (optional), name of the parameter for the reason (optional),
   * then extra parameters added as they are.
   *
   * @param helpNeededTemplate Name of template for asking for help.
   * @param reason Reason of the request.
   * @return Text for requesting for help
   *         (null if there's no template or if the RFC is a template parameter).
   */
  public String askForHelp(
      String[] helpNeededTemplate, String reason) {
    if ((helpNeededTemplate == null) ||
        (helpNeededTemplate.length == 0)) {
      return null;
    }
    if (isTemplateParameter) {
      return null;
    }

    // Template name
    StringBuilder replacement = new StringBuilder();
    replacement.append("{{");
    replacement.append(helpNeededTemplate[0]);

    // RFC
    replacement.append("|");
    if ((helpNeededTemplate.length > 1) &&
        (helpNeededTemplate[1].length() > 0)) {
      replacement.append(helpNeededTemplate[1]);
      replacement.append("=");
    }
    replacement.append(getRFCNotTrimmed());

    // Reason
    if ((reason != null) &&
        (helpNeededTemplate.length > 2) &&
        (helpNeededTemplate[2].length() > 0)) {
      replacement.append("|");
      replacement.append(helpNeededTemplate[2]);
      replacement.append("=");
      replacement.append(reason);
    }

    // Extra parameters
    for (int i = 3; i < helpNeededTemplate.length; i++) {
      replacement.append("|");
      replacement.append(helpNeededTemplate[i]);
    }

    replacement.append("}}");
    return replacement.toString();
  }

  /**
   * Build the text of a comment requesting help.
   *
   * @param comment Comment for asking for help.
   * @param reason Reason of the request.
   * @return Text for requesting for help (null if the comment is empty).
   */
  public String askForHelp(
      String comment, String reason) {
    if ((comment == null) ||
        (comment.trim().length() == 0)) {
      return null;
    }
    StringBuilder replacement = new StringBuilder();
    replacement.append("<!-- ");
    replacement.append(comment);
    if ((reason != null) && (reason.trim().length() > 0)) {
      replacement.append(" - ");
      replacement.append(reason);
    }
    replacement.append(" -->");
    return replacement.toString();
  }

  /**
   * Keep only the meaningful characters of a RFC number.
   *
   * Comments and references starting on a '&lt;' character are skipped entirely,
   * every other character which is not a digit is dropped.
   *
   * @param rfc RFC number.
   * @return Cleaned up RFC number (null if the RFC number is null).
   */
  public static String cleanRFC(String rfc) {
    if (rfc == null) {
      return null;
    }
    rfc = rfc.trim();
    if (rfc.length() == 0) {
      return rfc;
    }

    // The analysis is only needed to skip comments and references:
    // it is created when the first '<' is found
    PageAnalysis analysis = null;
    StringBuilder result = new StringBuilder();
    int i = 0;
    while (i < rfc.length()) {
      char current = rfc.charAt(i);
      if (current == '<') {
        if (analysis == null) {
          analysis = new PageAnalysis(null, rfc);
        }
        PageElementComment comment = analysis.isInComment(i);
        if ((comment != null) && (comment.getBeginIndex() == i)) {
          i = comment.getEndIndex() - 1;
        } else {
          PageElementTag refTag = analysis.isInTag(i, PageElementTag.TAG_WIKI_REF);
          if ((refTag != null) && (refTag.getBeginIndex() == i)) {
            i = refTag.getCompleteEndIndex() - 1;
          }
        }
      } else if (POSSIBLE_CHARACTERS.indexOf(current) >= 0) {
        result.append(current);
      }
      i++;
    }
    return result.toString();
  }

  /**
   * @param rfcValue RFC value.
   * @return True if RFC value is valid (contains at least one meaningful character).
   */
  public static boolean isValid(String rfcValue) {
    if (rfcValue == null) {
      return false;
    }
    return cleanRFC(rfcValue).length() > 0;
  }
}