/*
 *  WPCleaner: A tool to help on Wikipedia maintenance tasks.
 *  Copyright (C) 2013  Nicolas Vervelle
 *
 *  See README.txt file for licensing information.
 */

package org.wikipediacleaner.api.data;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.wikipediacleaner.api.constants.WPCConfiguration;
import org.wikipediacleaner.api.constants.WPCConfigurationStringList;


/**
 * Class containing information about a tag (&lt;<i>tag</i>&gt;).
 *
 * A tag is parsed from the page contents by {@link #analyzeBlock(String, int)},
 * which extracts its name, its parameters and whether it is an opening tag,
 * a closing tag or a full (self-closing) tag.
 */
public class PageElementTag extends PageElement {

  // =========================================================================
  // HTML tags
  // =========================================================================
  public static final String TAG_HTML_A          = "a";
  public static final String TAG_HTML_ABBR       = "abbr";
  public static final String TAG_HTML_B          = "b";
  public static final String TAG_HTML_BIG        = "big";
  public static final String TAG_HTML_BLOCKQUOTE = "blockquote";
  public static final String TAG_HTML_BR         = "br";
  public static final String TAG_HTML_CENTER     = "center";
  public static final String TAG_HTML_CITE       = "cite";
  public static final String TAG_HTML_CODE       = "code";
  public static final String TAG_HTML_DEL        = "del";
  public static final String TAG_HTML_DIV        = "div";
  public static final String TAG_HTML_EM         = "em";
  public static final String TAG_HTML_FONT       = "font";
  public static final String TAG_HTML_H1         = "h1";
  public static final String TAG_HTML_H2         = "h2";
  public static final String TAG_HTML_H3         = "h3";
  public static final String TAG_HTML_H4         = "h4";
  public static final String TAG_HTML_H5         = "h5";
  public static final String TAG_HTML_H6         = "h6";
  public static final String TAG_HTML_H7         = "h7";
  public static final String TAG_HTML_H8         = "h8";
  public static final String TAG_HTML_H9         = "h9";
  public static final String TAG_HTML_HR         = "hr";
  public static final String TAG_HTML_I          = "i";
  public static final String TAG_HTML_LI         = "li";
  public static final String TAG_HTML_OL         = "ol";
  public static final String TAG_HTML_P          = "p";
  public static final String TAG_HTML_S          = "s";
  public static final String TAG_HTML_SMALL      = "small";
  public static final String TAG_HTML_SPAN       = "span";
  public static final String TAG_HTML_STRIKE     = "strike";
  public static final String TAG_HTML_SUB        = "sub";
  public static final String TAG_HTML_SUP        = "sup";
  public static final String TAG_HTML_TABLE      = "table";
  public static final String TAG_HTML_TD         = "td";
  public static final String TAG_HTML_TH         = "th";
  public static final String TAG_HTML_TR         = "tr";
  public static final String TAG_HTML_TT         = "tt";
  public static final String TAG_HTML_U          = "u";
  public static final String TAG_HTML_UL         = "ul";

  // =========================================================================
  // Wiki tags
  // =========================================================================
  public static final String TAG_WIKI_GALLERY         = "gallery";
  public static final String TAG_WIKI_GRAPH           = "graph";
  public static final String TAG_WIKI_HIERO           = "hiero";
  public static final String TAG_WIKI_IMAGEMAP        = "imagemap";
  public static final String TAG_WIKI_INCLUDEONLY     = "includeonly";
  public static final String TAG_WIKI_MATH            = "math";
  public static final String TAG_WIKI_MATH_CHEM       = "ce"; // Shortcut for math chem
  public static final String TAG_WIKI_NOINCLUDE       = "noinclude";
  public static final String TAG_WIKI_NOWIKI          = "nowiki";
  public static final String TAG_WIKI_ONLYINCLUDE     = "onlyinclude";
  public static final String TAG_WIKI_PRE             = "pre";
  public static final String TAG_WIKI_REF             = "ref";
  public static final String TAG_WIKI_REFERENCES      = "references";
  public static final String TAG_WIKI_SCORE           = "score";
  public static final String TAG_WIKI_SOURCE          = "source";
  public static final String TAG_WIKI_SYNTAXHIGHLIGHT = "syntaxhighlight";
  public static final String TAG_WIKI_TEMPLATEDATA    = "templatedata";
  public static final String TAG_WIKI_TIMELINE        = "timeline";

  // =========================================================================
  // Other tags
  // =========================================================================
  public static final String TAG_OTHER_TYPO = "typo";

  /** Possible characters for parameter name */
  private static final String PARAM_NAME_CHARS =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
      "abcdefghijklmnopqrstuvwxyz" +
      "0123456789" +
      "-";

  /** Possible non alphanumeric characters in an unquoted parameter value */
  private static final String PARAM_VALUE_UNQUOTED_CHARS =
      "!$%&()*,-.:;<@[]^_`{|}~"; // List from https://en.wikipedia.org/wiki/Wikipedia:REFNAME

  /** Possible separation characters after tag name */
  private static final String SEP_CHARS_AFTER_TAG_NAME = " \n";

  /** Possible separation characters at the end */
  private static final String SEP_CHARS_END = " \n" + '\u00A0';

  /** Tag name */
  private final String name;

  /** Normalized tag name */
  private final String normalizedName;

  /** Tag parameters */
  private final List<PageElementTag.Parameter> parameters;

  /** Flag indicating if this is a closing tag */
  private final boolean endTag;

  /** Flag indicating if this is a full tag (opening + closing) */
  private final boolean fullTag;

  /** Flag indicating if there are white space characters at the end */
  private final boolean endWithSpace;

  /** Matching tag */
  private PageElementTag matchingTag;

  /**
   * Analyze contents to check if it matches a tag.
   *
   * @param contents Contents.
   * @param index Block start index.
   * @return Block details if there's a block,
   *         null otherwise (including for a null contents or an index out of range).
   */
  public static PageElementTag analyzeBlock(
      String contents, int index) {

    // Verify arguments
    if ((contents == null) || (index < 0)) {
      return null;
    }
    int maxLength = contents.length();

    // Check for '<'
    int tmpIndex = index;
    if ((tmpIndex >= maxLength) ||
        (contents.charAt(tmpIndex) != '<')) {
      return null;
    }
    tmpIndex++;

    // No whitespace characters between '<' and the tag name
    if ((tmpIndex >= maxLength) ||
        (" \u00A0\n".indexOf(contents.charAt(tmpIndex)) >= 0)) {
      return null;
    }

    // Check for possible end tag
    boolean endTag = false;
    if (contents.charAt(tmpIndex) == '/') {
      endTag = true;
      tmpIndex++;
      while ((tmpIndex < maxLength) &&
             (contents.charAt(tmpIndex) == ' ')) {
        tmpIndex++;
      }
    }
    int beginIndex = tmpIndex;

    // Retrieve tag name: letters and digits, starting with a letter.
    // An empty name (including reaching the end of contents) is rejected below.
    while ((tmpIndex < maxLength) &&
           (Character.isLetterOrDigit(contents.charAt(tmpIndex))) &&
           ((tmpIndex > beginIndex) || (Character.isLetter(contents.charAt(tmpIndex))))) {
      tmpIndex++;
    }
    if (tmpIndex == beginIndex) {
      return null;
    }
    String name = contents.substring(beginIndex, tmpIndex);

    // Possible whitespace characters
    while ((tmpIndex < maxLength) &&
           (SEP_CHARS_AFTER_TAG_NAME.indexOf(contents.charAt(tmpIndex)) >= 0)) {
      tmpIndex++;
    }

    // Find end of tag
    int endIndex = contents.indexOf('>', tmpIndex);
    if (endIndex < 0) {
      return null;
    }

    // Possible whitespace characters
    int tmpIndex2 = endIndex - 1;
    boolean endWithSpace = false;
    while ((tmpIndex2 > tmpIndex) &&
           (SEP_CHARS_END.indexOf(contents.charAt(tmpIndex2)) >= 0)) {
      tmpIndex2--;
      endWithSpace = true;
    }

    // Check for possible full tag
    boolean fullTag = false;
    if (contents.charAt(tmpIndex2) == '/') {
      if (endTag) {
        return null; // Tag with / at the beginning and at the end
      }
      fullTag = true;
      tmpIndex2--;
      while ((tmpIndex2 > tmpIndex) &&
             (contents.charAt(tmpIndex2) == ' ')) {
        tmpIndex2--;
      }
    }

    // Check for parameters
    List<Parameter> parameters = null;
    if (tmpIndex2 > tmpIndex) {
      parameters = new ArrayList<PageElementTag.Parameter>();
      if (!analyzeParameters(contents.substring(tmpIndex, tmpIndex2 + 1), parameters)) {
        return null;
      }
    }

    // Create tag
    return new PageElementTag(
        index, endIndex + 1,
        name, parameters,
        endTag, fullTag, endWithSpace);
  }

  /**
   * Analyze tag parameters.
   *
   * The first parameter found in the string is added to the list,
   * then the rest of the string is analyzed recursively.
   *
   * @param paramString String containing the parameters.
   * @param parameters Parameters (parameters found are added to this list).
   * @return True if analyze is correct.
   */
  private static boolean analyzeParameters(
      String paramString, List<Parameter> parameters) {
    if (paramString == null) {
      return true;
    }
    int maxLength = paramString.length();

    // Find parameter name
    int startNameIndex = 0;
    while ((startNameIndex < maxLength) &&
           (paramString.charAt(startNameIndex) == ' ')) {
      startNameIndex++;
    }
    if (startNameIndex >= maxLength) {
      return true;
    }
    int endNameIndex = startNameIndex;
    while ((endNameIndex < maxLength) &&
           (PARAM_NAME_CHARS.indexOf(paramString.charAt(endNameIndex)) >= 0)) {
      endNameIndex++;
    }
    // The name must be followed by a space, an equal sign or the end of the string
    if ((endNameIndex < maxLength) &&
        (paramString.charAt(endNameIndex) != ' ') &&
        (paramString.charAt(endNameIndex) != '=')) {
      return false;
    }
    String name = paramString.substring(startNameIndex, endNameIndex);

    // Find equal sign
    int equalIndex = endNameIndex;
    while ((equalIndex < maxLength) &&
           (paramString.charAt(equalIndex) == ' ')) {
      equalIndex++;
    }
    if (equalIndex >= maxLength) {
      // Last parameter, without value
      parameters.add(new Parameter(name));
      return true;
    }
    if (paramString.charAt(equalIndex) != '=') {
      // Parameter without value, followed by something else
      parameters.add(new Parameter(name));
      return analyzeParameters(paramString.substring(equalIndex), parameters);
    }

    // Find beginning of parameter value
    int startValueIndex = equalIndex + 1;
    while ((startValueIndex < maxLength) &&
           (paramString.charAt(startValueIndex) == ' ')) {
      startValueIndex++;
    }
    if (startValueIndex >= maxLength) {
      // Equal sign without anything after it
      parameters.add(new Parameter(name));
      return analyzeParameters(paramString.substring(startValueIndex), parameters);
    }

    // Find parameter value
    String value = null;
    int endValueIndex = startValueIndex;
    char startValueChar = paramString.charAt(startValueIndex);
    String beforeMarker = null;
    String preferredAfterMarker = null;
    if (startValueChar == '"') {
      beforeMarker = "\"";
      preferredAfterMarker = beforeMarker;
    } else if (startValueChar == '\'') {
      beforeMarker = "'";
      preferredAfterMarker = beforeMarker;
    } else if (startValueChar == '«') {
      beforeMarker = "«";
      preferredAfterMarker = "»";
    }
    String afterMarker = null;
    if (beforeMarker != null) {
      // Quoted value: search for the closing marker,
      // the value extends to the end of the string if there's none
      endValueIndex = startValueIndex + beforeMarker.length();
      while ((endValueIndex < maxLength) &&
             (afterMarker == null)) {
        if (paramString.startsWith(preferredAfterMarker, endValueIndex)) {
          afterMarker = preferredAfterMarker;
        } else {
          endValueIndex++;
        }
      }
      startValueIndex++;
      value = paramString.substring(startValueIndex, endValueIndex);
      if (afterMarker != null) {
        endValueIndex += afterMarker.length();
      }
    } else {
      // Unquoted value: extends to the next space, with a restricted set of characters
      while ((endValueIndex < maxLength) &&
             (paramString.charAt(endValueIndex) != ' ')) {
        char currentChar = paramString.charAt(endValueIndex);
        if (!Character.isLetterOrDigit(currentChar) &&
            (PARAM_VALUE_UNQUOTED_CHARS.indexOf(currentChar) < 0)) {
          return false;
        }
        endValueIndex++;
      }
      value = paramString.substring(startValueIndex, endValueIndex);
    }
    parameters.add(new Parameter(name, value, beforeMarker, afterMarker));

    // Deal with next parameter
    if (endValueIndex < maxLength) {
      return analyzeParameters(paramString.substring(endValueIndex), parameters);
    }

    return true;
  }

  /**
   * @return Tag name.
   */
  public String getName() {
    return name;
  }

  /**
   * @return Normalized tag name.
   */
  public String getNormalizedName() {
    return normalizedName;
  }

  /**
   * @return Number of parameters.
   */
  public int getParametersCount() {
    if (parameters != null) {
      return parameters.size();
    }
    return 0;
  }

  /**
   * @param index Index of parameter.
   * @return Parameter, or null if the index is out of range.
   */
  public PageElementTag.Parameter getParameter(int index) {
    if (parameters == null) {
      return null;
    }
    if ((index < 0) || (index >= parameters.size())) {
      return null;
    }
    return parameters.get(index);
  }

  /**
   * @param parameterName Parameter name.
   * @return First parameter with this exact name, or null if there's none.
   */
  public PageElementTag.Parameter getParameter(String parameterName) {
    if (parameters == null) {
      return null;
    }
    for (PageElementTag.Parameter param : parameters) {
      if (param.getName().equals(parameterName)) {
        return param;
      }
    }
    return null;
  }

  /**
   * @return Is it an end tag ?
   */
  public boolean isEndTag() {
    return endTag;
  }

  /**
   * @return Is it a full tag ?
   */
  public boolean isFullTag() {
    return fullTag;
  }

  /**
   * @return Ends with extra space characters ?
   */
  public boolean endWithSpace() {
    return endWithSpace;
  }

  /**
   * @return Beginning of the complete tag.
   */
  public int getCompleteBeginIndex() {
    if (isEndTag() && (matchingTag != null)) {
      return matchingTag.getBeginIndex();
    }
    return getBeginIndex();
  }

  /**
   * @return Beginning of the value.
   */
  public int getValueBeginIndex() {
    if (isFullTag() || !isComplete()) {
      return getEndIndex();
    }
    if (isEndTag()) {
      return getMatchingTag().getEndIndex();
    }
    return getEndIndex();
  }

  /**
   * @return End of the value.
   */
  public int getValueEndIndex() {
    if (isFullTag() || !isComplete()) {
      return getEndIndex();
    }
    if (isEndTag()) {
      return getBeginIndex();
    }
    return getMatchingTag().getBeginIndex();
  }

  /**
   * @return End of the complete tag.
   */
  public int getCompleteEndIndex() {
    if (isEndTag() || (matchingTag == null)) {
      return getEndIndex();
    }
    return matchingTag.getEndIndex();
  }

  /**
   * @return True if the tag is complete (either full or with matching tag).
   */
  public boolean isComplete() {
    return (fullTag || (matchingTag != null));
  }

  /**
   * @return Matching tag.
   */
  public PageElementTag getMatchingTag() {
    return matchingTag;
  }

  /**
   * Define the matching tag, keeping the link symmetric:
   * the previous matching tag (if any) is unlinked from this tag,
   * and the new matching tag (if any) is linked back to this tag.
   *
   * @param tag Matching tag (null to remove the current matching tag).
   */
  void setMatchingTag(PageElementTag tag) {
    if (tag == matchingTag) {
      return;
    }

    // Unlink the previous matching tag.
    // Fields are updated directly: going through setMatchingTag() recursively
    // would come back to this tag and erase the new link.
    PageElementTag oldMatchingTag = matchingTag;
    matchingTag = null;
    if ((oldMatchingTag != null) && (oldMatchingTag.matchingTag == this)) {
      oldMatchingTag.matchingTag = null;
    }

    // Link the new matching tag, unlinking it from its own previous matching tag
    if (tag != null) {
      PageElementTag previous = tag.matchingTag;
      if ((previous != null) && (previous.matchingTag == tag)) {
        previous.matchingTag = null;
      }
      tag.matchingTag = this;
      matchingTag = tag;
    }
  }

  /**
   * @param beginIndex Begin index.
   * @param endIndex End index.
   * @param name Tag name.
   * @param parameters Parameters.
   * @param endTag Is it a closing tag ?
   * @param fullTag Is it a full tag ?
   * @param endWithSpace Extra white space characters at the end ?
   */
  private PageElementTag(
      int beginIndex, int endIndex,
      String name, List<PageElementTag.Parameter> parameters,
      boolean endTag, boolean fullTag, boolean endWithSpace) {
    super(beginIndex, endIndex);
    this.name = name;
    // Locale.ROOT: normalization must not depend on the default locale
    // (for example, "I" is not converted to "i" with a Turkish locale)
    this.normalizedName = (name != null) ? name.trim().toLowerCase(Locale.ROOT) : null;
    this.parameters = parameters;
    this.endTag = endTag;
    this.fullTag = fullTag;
    this.endWithSpace = endWithSpace;
  }

  /**
   * Create a tag.
   *
   * @param name Tag name.
   * @param closing True if it's a closing tag.
   * @param full True if it's a full tag.
   * @return Tag.
   */
  public static String createTag(String name, boolean closing, boolean full) {
    StringBuilder sb = new StringBuilder();
    sb.append("<");
    if (closing && !full) {
      sb.append("/");
    }
    if (name != null) {
      sb.append(name);
    }
    if (full) {
      sb.append("/");
    }
    sb.append(">");
    return sb.toString();
  }

  /**
   * Retrieve the group name of a ref tag.
   *
   * The group is searched in the following order:
   * in the "group" parameter of the tag itself,
   * in the "group" parameter of the surrounding references tag,
   * in the arguments of the surrounding references template.
   *
   * @param analysis Page analysis.
   * @return Group of the ref tag, or null if there's no group (or an empty one).
   */
  public String getGroupOfRef(PageAnalysis analysis) {
    String result = null;

    // Check for a group parameter in the tag
    Parameter group = getParameter("group");
    if (group != null) {
      result = group.getValue();
    } else {

      // Check for a group parameter in the references tag
      PageElementTag references = analysis.getSurroundingTag(
          PageElementTag.TAG_WIKI_REFERENCES, getBeginIndex());
      if (references != null) {
        group = references.getParameter("group");
        if (group != null) {
          result = group.getValue();
        }
      } else {

        // Check for a group parameter in the references templates
        WPCConfiguration config = analysis.getWPCConfiguration();
        List<String[]> templates = config.getStringArrayList(WPCConfigurationStringList.REFERENCES_TEMPLATES);
        if (templates != null) {
          PageElementTemplate template = analysis.isInTemplate(getBeginIndex());
          if (template != null) {
            for (String[] elements : templates) {
              if ((elements.length > 1) &&
                  (Page.areSameTitle(template.getTemplateName(), elements[0]))) {
                String[] argNames = elements[1].split(",");
                for (String argName : argNames) {
                  String tmp = template.getParameterValue(argName);
                  if ((result == null) && (tmp != null)) {
                    result = tmp;
                    // Remove surrounding double quotes:
                    // the end index of substring() is exclusive,
                    // so length - 1 removes only the closing quote
                    if ((result.length() >= 2) &&
                        (result.charAt(0) == '"') &&
                        (result.charAt(result.length() - 1) == '"')) {
                      result = result.substring(1, result.length() - 1);
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
    if ((result == null) || (result.trim().length() == 0)) {
      return null;
    }
    return result.trim();
  }

  /**
   * Find the main reference tag in a list of reference tags.
   *
   * Only tags having both a non empty name and a non empty value are considered.
   * A tag inside a references tag is preferred,
   * then a tag inside a references template,
   * then the first tag found.
   *
   * @param refs List of reference tags.
   * @param references List of references tags (can be null).
   * @param analysis Page analysis.
   * @return Main reference tag in the list, or null if there's none.
   */
  public static PageElementTag getMainRef(
      List<PageElementTag> refs,
      List<PageElementTag> references,
      PageAnalysis analysis) {
    if (refs == null) {
      return null;
    }

    // Configuration
    WPCConfiguration config = analysis.getWPCConfiguration();
    List<String[]> templates = config.getStringArrayList(WPCConfigurationStringList.REFERENCES_TEMPLATES);

    // Search for a named reference tag
    PageElementTag namedTag = null;
    PageElementTag namedTagInReferences = null;
    PageElementTag namedTagInTemplate = null;
    for (PageElementTag tag : refs) {

      // Check that the tag has a name
      boolean hasName = false;
      Parameter name = tag.getParameter("name");
      if ((name != null) &&
          (name.getTrimmedValue() != null) &&
          (!name.getTrimmedValue().isEmpty())) {
        hasName = true;
      }

      // Check that the tag has a value
      boolean hasValue = false;
      int beginValue = tag.getValueBeginIndex();
      int endValue = tag.getValueEndIndex();
      String value = analysis.getContents().substring(beginValue, endValue);
      if ((value != null) && (!value.trim().isEmpty())) {
        hasValue = true;
      }

      // Check if the tag can be the main tag
      if (hasName && hasValue) {

        // Direct tag
        if (namedTag == null) {
          namedTag = tag;
        }

        // Tag inside <references />
        if (references != null) {
          for (PageElementTag reference : references) {
            if ((tag.getCompleteBeginIndex() > reference.getCompleteBeginIndex()) &&
                (tag.getCompleteEndIndex() < reference.getCompleteEndIndex())) {
              if (namedTagInReferences == null) {
                namedTagInReferences = tag;
              }
            }
          }
        }

        // Tag inside references template
        if (templates != null) {
          PageElementTemplate template = analysis.isInTemplate(tag.getCompleteBeginIndex());
          if (template != null) {
            for (String[] elements : templates) {
              if ((elements.length > 0) &&
                  (Page.areSameTitle(template.getTemplateName(), elements[0])) &&
                  (namedTagInTemplate == null)) {
                namedTagInTemplate = tag;
              }
            }
          }
        }
      }
    }

    // Deal with named reference tag inside <references/>
    if (namedTagInReferences != null) {
      return namedTagInReferences;
    }

    // Deal with named reference tag inside template
    if (namedTagInTemplate != null) {
      return namedTagInTemplate;
    }

    // Deal with named references tag outside <references/>
    if (namedTag != null) {
      return namedTag;
    }

    return null;
  }

  /**
   * Retrieve index of matching end tag.
   *
   * @param tags List of tags.
   * @param tagIndex Index of opening tag.
   * @return Index of matching end tag,
   *         or tagIndex itself if there's no matching tag after it in the list.
   */
  public static int getMatchingTagIndex(List<PageElementTag> tags, int tagIndex) {
    PageElementTag tag = tags.get(tagIndex);
    if (!tag.isFullTag() && tag.isComplete()) {
      int endIndex = tags.indexOf(tag.getMatchingTag());
      if (endIndex > tagIndex) {
        return endIndex;
      }
    }
    return tagIndex;
  }

  /**
   * Group consecutive tags.
   *
   * Tags are consecutive when the text between them contains only
   * whitespace (including non-breaking spaces), punctuation characters
   * and at most one occurrence of the separator.
   *
   * @param tags List of tags.
   * @param firstTagIndex Index of first tag in the list.
   * @param contents Page contents.
   * @param punctuation Possible punctuation elements between tags.
   * @param separator Possible separator between tags.
   * @return Index of last tag in the group of consecutive tags.
   */
  public static int groupTags(
      List<PageElementTag> tags, int firstTagIndex,
      String contents,
      String punctuation, String separator) {
    // TODO: Check if still necessary after refactoring of <ref> errors.
    if (tags == null) {
      return firstTagIndex;
    }
    int tagIndex = firstTagIndex;
    while (tagIndex < tags.size()) {

      // Search for matching end tag
      int lastTagIndex = getMatchingTagIndex(tags, tagIndex);
      tagIndex = lastTagIndex + 1;

      // Check text before next tag
      if (tagIndex >= tags.size()) {
        return lastTagIndex;
      }
      int nextBeginIndex = tags.get(tagIndex).getBeginIndex();
      int currentIndex = tags.get(lastTagIndex).getEndIndex();
      boolean separatorFound = false;
      while (currentIndex < nextBeginIndex) {
        if (!separatorFound &&
            (separator != null) &&
            contents.startsWith(separator, currentIndex)) {
          // Only one separator is accepted between two tags
          separatorFound = true;
          currentIndex += separator.length();
        } else {
          char currentChar = contents.charAt(currentIndex);
          // Non-breaking space is tested explicitly:
          // Character.isWhitespace() returns false for it
          boolean skippable =
              (currentChar == '\u00A0') ||
              Character.isWhitespace(currentChar) ||
              ((punctuation != null) && (punctuation.indexOf(currentChar) >= 0));
          if (!skippable) {
            return lastTagIndex;
          }
          currentIndex++;
        }
      }
    }
    return tagIndex;
  }

  /**
   * Create a textual representation of a list of tags.
   *
   * @param tags List of tags.
   * @param firstTagIndex Index of first tag in the list.
   * @param lastTagIndex Index of last tag in the list.
   * @param contents Page contents.
   * @param separator Separator (can be null for no separator).
   * @return Textual representation of a list of tags.
   */
  public static String createListOfTags(
      List<PageElementTag> tags,
      int firstTagIndex, int lastTagIndex,
      String contents, String separator) {
    // TODO: Check if still necessary after refactoring of <ref> errors.
    StringBuilder buffer = new StringBuilder();
    int tagIndex = firstTagIndex;
    while (tagIndex <= lastTagIndex) {
      if ((tagIndex > firstTagIndex) && (separator != null)) {
        buffer.append(separator);
      }
      // Copy the text from the opening tag to its matching end tag
      int beginIndex = tags.get(tagIndex).getBeginIndex();
      tagIndex = getMatchingTagIndex(tags, tagIndex);
      int endIndex = tags.get(tagIndex).getEndIndex();
      tagIndex++;
      buffer.append(contents.substring(beginIndex, endIndex));
    }
    return buffer.toString();
  }

  /**
   * Create a reduced textual representation of a list of tags.
   *
   * @param tags List of tags.
   * @param firstTagIndex Index of first tag in the list.
   * @param lastTagIndex Index of last tag in the list.
   * @param separator Separator (can be null for no separator).
   * @return Reduced textual representation of a list of tags.
   */
  public static String createReducedListOfTags(
      List<PageElementTag> tags,
      int firstTagIndex, int lastTagIndex,
      String separator) {
    // TODO: Check if still necessary after refactoring of <ref> errors.

    // Count the tags, each opening tag being counted with its matching end tag
    int tagIndex = firstTagIndex;
    int count = 0;
    while (tagIndex <= lastTagIndex) {
      count++;
      tagIndex = getMatchingTagIndex(tags, tagIndex);
      tagIndex++;
    }

    // A null separator is handled as no separator, like in createListOfTags()
    String sep = (separator != null) ? separator : "";
    if (count > 2) {
      return "<ref>...</ref>" + sep + "..." + sep + "<ref>...</ref>";
    }
    if (count > 1) {
      return "<ref>...</ref>" + sep + "<ref>...</ref>";
    }
    return "<ref>...</ref>";
  }

  /**
   * Class for managing a parameter
   */
  public static class Parameter {

    /**
     * Parameter name.
     */
    private final String name;

    /**
     * Parameter value (null if the parameter has no value).
     */
    private final String value;

    /**
     * Marker (like quote) before the parameter value.
     */
    private final String beforeMarker;

    /**
     * Marker (like quote) after the parameter value.
     */
    private final String afterMarker;

    /**
     * @param name Parameter name.
     */
    Parameter(String name) {
      this(name, null, null, null);
    }

    /**
     * @param name Parameter name.
     * @param value Parameter value.
     * @param beforeMarker Marker (like quote) before the parameter value.
     * @param afterMarker Marker (like quote) after the parameter value.
     */
    Parameter(
        String name, String value,
        String beforeMarker, String afterMarker) {
      this.name = name;
      this.value = value;
      this.beforeMarker = beforeMarker;
      this.afterMarker = afterMarker;
    }

    /**
     * @return Parameter name.
     */
    public String getName() {
      return name;
    }

    /**
     * @return Parameter value.
     */
    public String getValue() {
      return value;
    }

    /**
     * @return Parameter value without leading and trailing whitespace
     *         (null if the parameter has no value).
     */
    public String getTrimmedValue() {
      if (value == null) {
        return null;
      }
      return value.trim();
    }

    /**
     * @return True if parameter has unbalanced quotes.
     */
    public boolean hasUnbalancedQuotes() {
      if ("\"".equals(beforeMarker) && (afterMarker == null)) {
        return true;
      }
      if ((beforeMarker == null) && "\"".equals(afterMarker)) {
        return true;
      }
      return false;
    }

    /**
     * @return Text equivalent to the parameter
     *         (only the name for a parameter without value).
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
      StringBuilder builder = new StringBuilder();
      builder.append(name);
      if (value == null) {
        // Avoid producing the literal text "name=null"
        return builder.toString();
      }
      builder.append('=');
      if (beforeMarker != null) {
        builder.append(beforeMarker);
      }
      builder.append(value);
      if (afterMarker != null) {
        builder.append(afterMarker);
      }
      return builder.toString();
    }
  }
}