/* * WPCleaner: A tool to help on Wikipedia maintenance tasks. * Copyright (C) 2013 Nicolas Vervelle * * See README.txt file for licensing information. */ package org.wikipediacleaner.api.data; import java.util.Collection; import java.util.List; import org.wikipediacleaner.api.check.CheckError; import org.wikipediacleaner.api.check.algorithm.CheckErrorAlgorithm; import org.wikipediacleaner.api.check.algorithm.CheckErrorAlgorithms; import org.wikipediacleaner.api.constants.EnumWikipedia; import org.wikipediacleaner.api.constants.WPCConfiguration; import org.wikipediacleaner.api.constants.WPCConfigurationBoolean; import org.wikipediacleaner.api.constants.WPCConfigurationString; /** * An utility class for automatic formatting of articles. */ public class AutomaticFormatter { /** * Tidy up an article. * * @param page Page. * @param contents Current contents. * @param algorithms List of Check Wiki algorithms. * @param botFix True to use bot fixes. * @param usedAlgorithms Algorithms used to tidy up the article. * @return New contents. */ public static String tidyArticle( Page page, String contents, Collection<CheckErrorAlgorithm> algorithms, boolean botFix, List<CheckError.Progress> usedAlgorithms) { if ((page == null) || (contents == null)) { return contents; } EnumWikipedia wiki = page.getWikipedia(); WPCConfiguration config = wiki.getConfiguration(); // Fix Check Wiki errors if (algorithms != null) { for (CheckErrorAlgorithm algorithm : algorithms) { if (algorithm.isAvailable() && CheckErrorAlgorithms.isAlgorithmActive(wiki, algorithm.getErrorNumber())) { String currentContents = contents; PageAnalysis analysis = page.getAnalysis(currentContents, true); contents = botFix ? algorithm.botFix(analysis) : algorithm.automaticFix(analysis); if ((usedAlgorithms != null) && (!contents.equals(currentContents))) { // TODO: compute if fix is complete ? usedAlgorithms.add(new CheckError.Progress(algorithm, true)); } } } } // Auto formatting options if (!page.isInMainNamespace()) { return contents; } if (!config.getBoolean(WPCConfigurationBoolean.AUTO_ACTIVE)) { return contents; } contents = fixSpaceAroundTitle(page, contents); contents = fixLinkDefaultsortCategory(page, contents); contents = fixLangLinksAfterCategory(page, contents); contents = fixCrBeforeCategory(page, contents); contents = fixCrDefaultsortCategory(page, contents); contents = fixCrBetweenCategory(page, contents); contents = fixEndOfArticle(page, contents); return contents; } /** * Auto formatting options: link default sort and categories. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixLinkDefaultsortCategory(Page page, String contents) { // Check configuration WPCConfiguration config = page.getWikipedia().getConfiguration(); boolean option = config.getBoolean(WPCConfigurationBoolean.AUTO_LINK_DEFAULTSORT_CATEGORY); if (!option) { return contents; } PageAnalysis analysis = page.getAnalysis(contents, true); // Retrieve default sort List<PageElementFunction> defaultSorts = analysis.getDefaultSorts(); if ((defaultSorts == null) || (defaultSorts.isEmpty())) { return contents; } PageElementFunction defaultSort = defaultSorts.get(0); int beginDefaultSort = defaultSort.getBeginIndex(); int endDefaultSort = defaultSort.getEndIndex(); // Retrieve categories List<PageElementCategory> categories = analysis.getCategories(); if ((categories == null) || (categories.isEmpty())) { return contents; } PageElementCategory category = categories.get(0); int beginCategory = category.getBeginIndex(); boolean defaultSortFirst = beginDefaultSort < beginCategory; // Analyze text between category and default sort if (!defaultSortFirst) { int index = category.getEndIndex(); while (index < beginDefaultSort) { char currentChar = contents.charAt(index); if ((currentChar == ' ') || (currentChar == '\n')) { index++; } else if (currentChar == '[') { PageElementCategory currentCategory = analysis.isInCategory(index); if (currentCategory == null) { return contents; } index = currentCategory.getEndIndex(); } else { return contents; } } } else { int index = endDefaultSort; boolean ok = true; while (ok && (index < beginCategory)) { char currentChar = contents.charAt(index); if ((currentChar != ' ') && (currentChar != '\n')) { ok = false; } index++; } if (ok) { return contents; } index = beginCategory; while ((index > 0) && ok) { char currentChar = contents.charAt(index); if (currentChar == '\n') { ok = false; } else if (currentChar != ' ') { return contents; } index--; } } // Fix default sort position int delta = 0; if ((beginDefaultSort <= 0) || (contents.charAt(beginDefaultSort - 1) == '\n')) { if ((endDefaultSort < contents.length()) && (contents.charAt(endDefaultSort) == '\n')) { delta = 1; if ((beginDefaultSort <= 1) || (contents.charAt(beginDefaultSort - 2) == '\n')) { if ((endDefaultSort + 1 < contents.length() && (contents.charAt(endDefaultSort + 1) == '\n'))) { delta = 2; } } } } StringBuilder sb = new StringBuilder(contents.substring( 0, defaultSortFirst ? beginDefaultSort : beginCategory)); if (defaultSortFirst) { sb.append(contents.substring(endDefaultSort + delta, beginCategory)); } sb.append(contents.substring(beginDefaultSort, endDefaultSort)); sb.append("\n"); if (defaultSortFirst) { if (beginCategory < contents.length()) { sb.append(contents.substring(beginCategory)); } } else { sb.append(contents.substring(beginCategory, beginDefaultSort)); if (endDefaultSort + delta < contents.length()) { sb.append(endDefaultSort + delta); } } contents = sb.toString(); return contents; } /** * Auto formatting options: number of carriage returns before categories. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixCrBeforeCategory(Page page, String contents) { // Check configuration WPCConfiguration config = page.getWikipedia().getConfiguration(); String option = config.getString(WPCConfigurationString.AUTO_CR_BEFORE_CATEGORY); if (!isValidCountOption(option)) { return contents; } int min = getMinCountOption(option); int max = getMaxCountOption(option); if ((min <= 0) && (max == Integer.MAX_VALUE)) { return contents; } PageAnalysis analysis = page.getAnalysis(contents, true); // Retrieve last title int lastTitle = 0; List<PageElementTitle> titles = analysis.getTitles(); if ((titles != null) && (titles.size() > 0)) { lastTitle = titles.get(titles.size() - 1).getEndIndex(); } // Analyze default sort List<PageElementFunction> defaultSorts = analysis.getDefaultSorts(); if ((defaultSorts != null) && (defaultSorts.size() > 0)) { int beginSort = defaultSorts.get(0).getBeginIndex(); if (beginSort < lastTitle) { return contents; } int nbCr = 0; int index = beginSort; boolean finished = false; while (!finished && (index > 0)) { index--; char currentChar = contents.charAt(index); if (currentChar == '\n') { nbCr++; } else if (currentChar != ' ') { finished = true; } } if ((nbCr == 0) || ((nbCr >= min) && (nbCr <= max))) { return contents; } contents = changeCharacters( contents, index + 1, '\n', normalizeValue(nbCr, min, max), beginSort); return contents; } // Analyze first category List<PageElementCategory> categories = analysis.getCategories(); if ((categories == null) || (categories.size() == 0)) { return contents; } int beginCat = categories.get(0).getBeginIndex(); if (beginCat < lastTitle) { return contents; } int nbCr = 0; int index = beginCat; boolean finished = false; while (!finished && (index > 0)) { index--; char currentChar = contents.charAt(index); if (currentChar == '\n') { nbCr++; } else if (currentChar != ' ') { finished = true; } } if ((nbCr == 0) || ((nbCr >= min) && (nbCr <= max))) { return contents; } contents = changeCharacters( contents, index + 1, '\n', normalizeValue(nbCr, min, max), beginCat); return contents; } /** * Auto formatting options: number of carriage returns between default sort and categories. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixCrDefaultsortCategory(Page page, String contents) { // Check configuration WPCConfiguration config = page.getWikipedia().getConfiguration(); String option = config.getString(WPCConfigurationString.AUTO_CR_DEFAULTSORT_CATEGORY); if (!isValidCountOption(option)) { return contents; } int min = getMinCountOption(option); int max = getMaxCountOption(option); if ((min <= 0) && (max == Integer.MAX_VALUE)) { return contents; } PageAnalysis analysis = page.getAnalysis(contents, true); // Analyze each default sort for (PageElementFunction function : analysis.getDefaultSorts()) { // Count carriage returns after default sort int nbCr = 0; int index = function.getEndIndex(); boolean finished = false; while (!finished && (index < contents.length())) { char currentChar = contents.charAt(index); if (currentChar == '\n') { nbCr++; index++; } else if (currentChar == ' ') { index++; } else { finished = true; } } // Update carriage returns after default sort if ((index < contents.length()) && (contents.charAt(index) == '[') && ((nbCr < min) || (nbCr > max))) { PageElementCategory category = analysis.isInCategory(index); if (category != null) { contents = changeCharacters( contents, function.getEndIndex(), '\n', normalizeValue(nbCr, min, max), category.getBeginIndex()); } } } return contents; } /** * Auto formatting options: number of carriage returns between each category. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixCrBetweenCategory(Page page, String contents) { // Check configuration WPCConfiguration config = page.getWikipedia().getConfiguration(); String option = config.getString(WPCConfigurationString.AUTO_CR_BETWEEN_CATEGORY); if (!isValidCountOption(option)) { return contents; } int min = getMinCountOption(option); int max = getMaxCountOption(option); if ((min <= 0) && (max == Integer.MAX_VALUE)) { return contents; } PageAnalysis analysis = page.getAnalysis(contents, true); // Analyze each category List<PageElementCategory> categories = analysis.getCategories(); if ((categories == null) || (categories.isEmpty())) { return contents; } StringBuilder sb = new StringBuilder(); int lastIndex = 0; for (int numCategory = 1; numCategory < categories.size(); numCategory++) { PageElementCategory previousCategory = categories.get(numCategory - 1); PageElementCategory category = categories.get(numCategory); // Check what is between the two categories int index = previousCategory.getEndIndex(); boolean ok = true; int nbCr = 0; while (ok && (index < category.getBeginIndex())) { char currentChar = contents.charAt(index); if (currentChar == '\n') { nbCr++; } else if (currentChar != ' ') { ok = false; } index++; } // Update text if needed if (ok && ((nbCr < min) || (nbCr > max))) { if (lastIndex < previousCategory.getEndIndex()) { sb.append(contents.substring(lastIndex, previousCategory.getEndIndex())); lastIndex = previousCategory.getEndIndex(); } nbCr = normalizeValue(nbCr, min, max); for (int i = 0; i < nbCr; i++) { sb.append('\n'); } lastIndex = category.getBeginIndex(); } } if (lastIndex > 0) { if (lastIndex < contents.length()) { sb.append(contents.substring(lastIndex)); } contents = sb.toString(); } return contents; } /** * Auto formatting options: language links after categories. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixLangLinksAfterCategory(Page page, String contents) { // Check configuration WPCConfiguration config = page.getWikipedia().getConfiguration(); boolean option = config.getBoolean(WPCConfigurationBoolean.AUTO_LANGLINK_AFTER_CATEGORY); if (!option) { return contents; } PageAnalysis analysis = page.getAnalysis(contents, true); // Check if language links are already after categories List<PageElementCategory> categories = analysis.getCategories(); if ((categories == null) || (categories.isEmpty())) { return contents; } List<PageElementLanguageLink> links = analysis.getLanguageLinks(); if ((links == null) || (links.isEmpty())) { return contents; } if (links.get(0).getBeginIndex() >= categories.get(categories.size() - 1).getEndIndex()) { return contents; } // Analyze each language link StringBuilder sb = new StringBuilder(); int lastIndex = 0; int numLink = 0; while (numLink < links.size()) { // Find first element before link int beginNum = numLink; PageElementLanguageLink begin = links.get(beginNum); boolean done = false; int beginIndex = begin.getBeginIndex(); while ((beginIndex > 0) && ((contents.charAt(beginIndex - 1) == ' ') || (contents.charAt(beginIndex - 1) == '\n'))) { beginIndex--; } // Group language links done = false; while ((numLink + 1 < links.size()) && !done) { int index = links.get(numLink).getEndIndex(); int max = links.get(numLink + 1).getBeginIndex(); while ((index < max) && !done) { char current = contents.charAt(index); if ((current != ' ') && (current != '\n')) { done = true; } index++; } if (!done) { numLink++; } } int endNum = numLink; PageElementLanguageLink end = links.get(endNum); // Check if the group is before default sort/categories int index = end.getEndIndex(); done = false; while ((index < contents.length()) && !done) { char current = contents.charAt(index); if ((current != ' ') && (current != '\n')) { done = true; } else { index++; } } PageElement firstElement = null; if (done) { char current = contents.charAt(index); if (current == '{') { firstElement = analysis.isInDefaultSort(index); } else if (current == '[') { firstElement = analysis.isInCategory(index); } } // Group default sort/categories if (firstElement != null) { index = firstElement.getEndIndex(); PageElement lastElement = firstElement; done = false; while ((index < contents.length()) && !done) { char current = contents.charAt(index); if ((current == ' ') || (current == '\n')) { index++; } else { PageElement element = null; if (current == '{') { element = analysis.isInDefaultSort(index); } else if (current == '[') { element = analysis.isInCategory(index); } if (element != null) { lastElement = element; index = element.getEndIndex(); } else { done = true; } } } // Modify contents if (lastIndex < begin.getBeginIndex()) { sb.append(contents.substring(lastIndex, beginIndex)); lastIndex = beginIndex; } sb.append(contents.substring(end.getEndIndex(), lastElement.getEndIndex())); sb.append(contents.substring(beginIndex, end.getEndIndex())); lastIndex = lastElement.getEndIndex(); } numLink++; } if (lastIndex > 0) { if (lastIndex < contents.length()) { sb.append(contents.substring(lastIndex)); } contents = sb.toString(); } return contents; } /** * Auto formatting options: end of article. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixEndOfArticle(Page page, String contents) { int index = contents.length(); int nbCr = 0; boolean finished = false; while (!finished && (index > 0)) { index--; char currentChar = contents.charAt(index); if (currentChar == '\n') { nbCr++; } else if (currentChar != ' ') { finished = true; } } if (index < contents.length() - 2) { if (nbCr >= 1) { return contents.substring(0, index + 1) + "\n"; } return contents.substring(0, index + 1); } return contents; } /** * Auto formatting options: number of space characters around titles. * * @param page Page. * @param contents Current contents. * @return New contents. */ public static String fixSpaceAroundTitle(Page page, String contents) { // Check configuration WPCConfiguration config = page.getWikipedia().getConfiguration(); String option = config.getString(WPCConfigurationString.AUTO_SPACE_AROUND_TITLE); if (!isValidCountOption(option)) { return contents; } int min = getMinCountOption(option); int max = getMaxCountOption(option); if ((min <= 0) && (max == Integer.MAX_VALUE)) { return contents; } PageAnalysis analysis = page.getAnalysis(contents, true); // Analyze each title List<PageElementTitle> titles = analysis.getTitles(); if ((titles == null) || (titles.isEmpty())) { return contents; } StringBuilder sb = new StringBuilder(); int lastIndex = 0; for (PageElementTitle title : titles) { String titleValue = title.getTitleNotTrimmed(); String titleAfter = title.getAfterTitle(); if ((titleValue != null) && ((titleAfter == null) || (titleAfter.equals(""))) && (title.getFirstLevel() == title.getSecondLevel())) { // Count space characters int nbSpaceBefore = 0; while ((nbSpaceBefore < titleValue.length()) && (" \u00A0".indexOf(titleValue.charAt(nbSpaceBefore)) >= 0)) { nbSpaceBefore++; } int nbSpaceAfter = 0; while ((nbSpaceAfter < titleValue.length()) && (" \u00A0".indexOf(titleValue.charAt(titleValue.length() - nbSpaceAfter - 1)) >= 0)) { nbSpaceAfter++; } // Update title if needed if ((nbSpaceBefore < min) || (nbSpaceBefore > max) || (nbSpaceAfter < min) || (nbSpaceAfter > max)) { if (lastIndex < title.getBeginIndex()) { sb.append(contents.substring(lastIndex, title.getBeginIndex())); lastIndex = title.getBeginIndex(); } nbSpaceBefore = normalizeValue(nbSpaceBefore, min, max); nbSpaceAfter = normalizeValue(nbSpaceAfter, min, max); StringBuilder newTitle = new StringBuilder(); for (int i = 0; i < nbSpaceBefore; i++) { newTitle.append(' '); } newTitle.append(titleValue.trim()); for (int i = 0; i < nbSpaceAfter; i++) { newTitle.append(' '); } sb.append(PageElementTitle.createUntrimmedTitle(title.getLevel(), newTitle.toString(), titleAfter)); lastIndex = title.getEndIndex(); } } } if (lastIndex > 0) { if (lastIndex < contents.length()) { sb.append(contents.substring(lastIndex)); } contents = sb.toString(); } return contents; } // Utility functions /** * Insert characters. * * @param contents Contents. * @param begin Index where to start inserting characters. * @param character Character to insert. * @param count Number of characters to insert. * @param end Index where to end inserting characters. * @return Modified string. */ private static String changeCharacters( String contents, int begin, char character, int count, int end) { StringBuilder sb = new StringBuilder(contents.substring(0, begin)); for (int i = 0; i < count; i++) { sb.append(character); } sb.append(contents.substring(end)); return sb.toString(); } /** * @param option Option for number of characters. * @return True if option is valid. */ private static boolean isValidCountOption(String option) { if ((option == null) || (option.length() == 0)) { return false; } int minusIndex = option.indexOf('-'); try { if (minusIndex < 0) { int value = Integer.parseInt(option); if (value < 0) { return false; } } else { if (minusIndex + 1 >= option.length()) { return false; } int min = Integer.parseInt(option.substring(0, minusIndex)); int max = Integer.parseInt(option.substring(minusIndex + 1)); if ((min < 0) || (max < min)) { return false; } } } catch (NumberFormatException e) { return false; } return true; } /** * @param option Option for number of characters. * @return Minimum number of characters. */ private static int getMinCountOption(String option) { if ((option == null) || (option.length() == 0)) { return 0; } int minusIndex = option.indexOf('-'); if (minusIndex < 0) { return Integer.parseInt(option); } return Integer.parseInt(option.substring(0, minusIndex)); } /** * @param option Option for number of characters. * @return Maximum number of characters. */ private static int getMaxCountOption(String option) { if ((option == null) || (option.length() == 0)) { return Integer.MAX_VALUE; } int minusIndex = option.indexOf('-'); if (minusIndex < 0) { return Integer.parseInt(option); } return Integer.parseInt(option.substring(minusIndex + 1)); } /** * @param value Current value. * @param min Minimum possible value. * @param max Maximum possible value. * @return Normalized value. */ private static int normalizeValue(int value, int min, int max) { if (value < min) { return min; } if (value > max) { return max; } return value; } }