/* * WPCleaner: A tool to help on Wikipedia maintenance tasks. * Copyright (C) 2013 Nicolas Vervelle * * See README.txt file for licensing information. */ package org.wikipediacleaner.api.data; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.wikipediacleaner.api.constants.WPCConfiguration; import org.wikipediacleaner.api.constants.WPCConfigurationStringList; import org.wikipediacleaner.api.data.PageElementTemplate.Parameter; /** * Various utilities for page analysis. */ public class PageAnalysisUtils { // ========================================================================== // Comments management // ========================================================================== /** * @param pageAnalysis Page analysis. * @param currentIndex Current index. * @return First index after comments. */ public static int getFirstIndexAfterComments( PageAnalysis pageAnalysis, int currentIndex) { if (pageAnalysis == null) { return currentIndex; } List<PageElementComment> comments = pageAnalysis.getComments(); for (PageElementComment comment : comments) { if (currentIndex < comment.getEndIndex()) { if (currentIndex >= comment.getBeginIndex()) { return comment.getEndIndex(); } } else { return currentIndex; } } return currentIndex; } // ========================================================================== // Internal link management // ========================================================================== /** * Find internal links in a page. * * @param pageAnalysis Page analysis. * @param links Links that are requested. * @param notification For notifying when a link is found. */ public static void findInternalLinks( PageAnalysis pageAnalysis, List<Page> links, InternalLinkNotification notification) { if ((pageAnalysis == null) || (links == null) || (notification == null)) { return; } // Search for simple internal links [[link]], [[link|text]], [[link#anchor|text]], ... List<PageElementInternalLink> internalLinks = pageAnalysis.getInternalLinks(); WPCConfiguration wpcConfiguration = pageAnalysis.getWPCConfiguration(); List<String> templatesAfter = wpcConfiguration.getStringList( WPCConfigurationStringList.TEMPLATES_AFTER_HELP_ASKED); List<String> commentsAfter = wpcConfiguration.getStringList( WPCConfigurationStringList.COMMENTS_FOR_DAB_LINK); List<String[]> templatesIgnoreDab = wpcConfiguration.getStringArrayList( WPCConfigurationStringList.TEMPLATES_IGNORE_DAB); MagicWord redirect = pageAnalysis.getWikiConfiguration().getMagicWordByName(MagicWord.REDIRECT); String contents = pageAnalysis.getContents(); int maxSize = contents.length(); boolean firstLink = true; for (PageElementInternalLink internalLink : internalLinks) { for (Page link : links) { if (Page.areSameTitle(link.getTitle(), internalLink.getLink())) { int currentPos = internalLink.getEndIndex(); while ((currentPos < maxSize) && (contents.charAt(currentPos) == ' ')) { currentPos++; } // Check if link is marked as needing help boolean helpNeeded = false; if (templatesAfter != null) { if ((currentPos < maxSize) && (contents.charAt(currentPos) == '{')) { PageElementTemplate nextTemplate = pageAnalysis.isInTemplate(currentPos); if (nextTemplate != null) { for (String templateAfter : templatesAfter) { if (Page.areSameTitle(templateAfter, nextTemplate.getTemplateName())) { helpNeeded = true; } } } } } // Check if link is marked as normal boolean good = false; if ((currentPos < maxSize) && (contents.charAt(currentPos) == '<')) { PageElementComment nextComment = pageAnalysis.isInComment(currentPos); if ((nextComment != null) && (nextComment.getComment() != null)) { if (commentsAfter != null) { for (String commentAfter : commentsAfter) { if (nextComment.getComment().length() >= commentAfter.length()) { String comment = nextComment.getComment().substring(0, commentAfter.length()); if (comment.equalsIgnoreCase(commentAfter)) { good = true; } } } } } } if (!good && (templatesIgnoreDab != null) && !templatesIgnoreDab.isEmpty()) { PageElementTemplate template = pageAnalysis.isInTemplate(currentPos); if (template != null) { for (String[] currentTemplate : templatesIgnoreDab) { if ((currentTemplate != null) && (currentTemplate.length > 1) && Page.areSameTitle(currentTemplate[0], template.getTemplateName())) { Parameter parameter = template.getParameterAtIndex(currentPos); if (parameter != null) { for (int index = 1; index < currentTemplate.length; index++) { if (parameter.getComputedName().equals(currentTemplate[index])) { good = true; } } } } } } } // Check if link is in fact a redirection if (firstLink && (redirect != null)) { int tmpPos = 0; while ((contents.charAt(tmpPos) == ' ') && (tmpPos < internalLink.getBeginIndex())) { tmpPos++; } String redirectTag = null; for (String alias : redirect.getAliases()) { if (contents.startsWith(alias, tmpPos)) { char next = contents.charAt(tmpPos + alias.length()); if ((next == ' ') || (next == '[')) { redirectTag = alias; } } } if (redirectTag != null) { tmpPos += redirectTag.length(); while (contents.charAt(tmpPos) == ' ') { tmpPos++; } if (tmpPos == internalLink.getBeginIndex()) { good = true; } } } notification.linkFound(link, internalLink, good, helpNeeded); } } firstLink = false; } // Search for internal links created by templates WPCConfiguration configuration = pageAnalysis.getWPCConfiguration(); if (configuration.hasTemplateMatchers()) { List<PageElementTemplate> templates = pageAnalysis.getTemplates(); for (PageElementTemplate template : templates) { List<? extends TemplateMatcher> matchers = configuration.getTemplateMatchers(template.getTemplateName()); if (matchers != null) { for (TemplateMatcher matcher : matchers) { String linkTo = matcher.linksTo(pageAnalysis.getPage(), template); if (linkTo != null) { for (Page link : links) { if (Page.areSameTitle(link.getTitle(), linkTo)) { notification.linkFound(link, template, matcher); } } } } } } } } /** * Get anchors in internal links. * * @param pageAnalysis Page analysis. * @param pageLinks Page links. * @param anchors Anchors (OUT) */ public static void getAnchors(PageAnalysis pageAnalysis, List<Page> pageLinks, Map<Page, List<String>> anchors) { if (pageAnalysis == null) { return; } String pageContents = pageAnalysis.getContents(); if ((pageContents == null) || (pageContents.length() == 0) || (anchors == null)) { return; } // Check each internal link List<PageElementInternalLink> links = pageAnalysis.getInternalLinks(); for (PageElementInternalLink internalLink : links) { String anchor = internalLink.getAnchor(); if ((anchor != null) && (anchor.trim().length() > 0)) { String fullAnchor = internalLink.getFullLink(); // Check if the internal link is for one of the links for (Page link : pageLinks) { if ((link != null) && (Page.areSameTitle(link.getTitle(), internalLink.getLink()))) { List<String> listAnchors = anchors.get(link); if (listAnchors == null) { listAnchors = new ArrayList<String>(); anchors.put(link, listAnchors); } if (!listAnchors.contains(fullAnchor)) { listAnchors.add(fullAnchor); } } } } } } // ========================================================================== // Titles management // ========================================================================== /** * @param position Position in the text. * @return All titles leading to the given position. */ public static List<PageElementTitle> getCurrentTitles( PageAnalysis pageAnalysis, int position) { if (pageAnalysis == null) { return null; } // Analyze hierarchy of titles List<PageElementTitle> titles = pageAnalysis.getTitles(); List<PageElementTitle> currentTitles = new ArrayList<PageElementTitle>(); for (PageElementTitle title : titles) { if (title.getBeginIndex() < position) { while ((!currentTitles.isEmpty()) && (currentTitles.get(currentTitles.size() - 1).getLevel() >= title.getLevel())) { currentTitles.remove(currentTitles.size() - 1); } currentTitles.add(title); } } return currentTitles; } /** * Retrieve current chapter. * * @param pageAnalysis Page analysis. * @param position Position in the text. * @return Current title. */ public static PageElementTitle getCurrentChapter( PageAnalysis pageAnalysis, int position) { if (pageAnalysis == null) { return null; } // Analyze each title List<PageElementTitle> titles = pageAnalysis.getTitles(); PageElementTitle lastTitle = null; for (PageElementTitle title : titles) { if (title.getBeginIndex() < position) { lastTitle = title; } else { return lastTitle; } } return lastTitle; } /** * Retrieve the identifier of the current chapter. * * @param pageAnalysis Page analysis. * @param position Position in the text. * @return Identifier of the current chapter. */ public static String getCurrentChapterId( PageAnalysis pageAnalysis, int position) { if (pageAnalysis == null) { return null; } PageElementTitle title = getCurrentChapter(pageAnalysis, position); return pageAnalysis.getPage().getTitle() + "#" + ((title != null) ? title.getTitle() : ""); } }