/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.data;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.wikipediacleaner.api.check.CheckErrorResult;
import org.wikipediacleaner.api.constants.EnumWikipedia;
import org.wikipediacleaner.api.constants.WPCConfiguration;
import org.wikipediacleaner.api.constants.WikiConfiguration;
import org.wikipediacleaner.api.constants.wiki.AbstractWikiSettings;
import org.wikipediacleaner.utils.Configuration;
import org.wikipediacleaner.utils.ConfigurationValueBoolean;
import org.wikipediacleaner.utils.Performance;
/**
* An analysis of a page.
*/
public class PageAnalysis {
/** Flag for tracing time taken by execution: when true, each analysis level creates a Performance tracer. */
private static boolean traceTime = false;
/** Threshold given to each Performance tracer (unit is not visible here; presumably milliseconds -- TODO confirm). */
private final static long TRACE_THRESHOLD = 100;
/** Page currently analyzed (the accessors of this class test it against null before using it). */
private final Page page;
/** Current version of the text: the contents given at construction, or the page's own contents when none was given. */
private final String contents;
/** True if spelling should be checked; initialized from the SPELLING configuration value. */
private boolean checkSpelling;
/**
 * Creates an analysis for a page.
 *
 * @param page Page (may be null: accessors such as getWikipedia() then return null).
 * @param contents Page contents (may differ from page.getContents()).
 *                 When null, the contents of the page are used if a page is available.
 */
PageAnalysis(Page page, String contents) {
  this.page = page;
  // Fall back on the page's own contents, but never dereference a missing page:
  // the analysis loops already treat null contents as an empty text.
  if (contents != null) {
    this.contents = contents;
  } else if (page != null) {
    this.contents = page.getContents();
  } else {
    this.contents = null;
  }
  this.areas = new PageElementAreas();

  // Default configuration
  Configuration config = Configuration.getConfiguration();
  checkSpelling = config.getBoolean(
      null, ConfigurationValueBoolean.SPELLING);
}
/**
 * @return Page being analyzed (as given at construction).
 */
public Page getPage() {
  return this.page;
}
/**
 * @return Wikipedia of the analyzed page, or null when there is no page.
 */
public EnumWikipedia getWikipedia() {
  return (page == null) ? null : page.getWikipedia();
}
/**
 * @return Wiki settings, or null when the wiki is unknown.
 */
public AbstractWikiSettings getSettings() {
  EnumWikipedia wiki = getWikipedia();
  if (wiki == null) {
    return null;
  }
  return wiki.getSettings();
}
/**
 * @return Wiki configuration, or null when the wiki is unknown.
 */
public WikiConfiguration getWikiConfiguration() {
  EnumWikipedia wiki = getWikipedia();
  if (wiki == null) {
    return null;
  }
  return wiki.getWikiConfiguration();
}
/**
 * @return WPCleaner configuration, or null when the wiki is unknown.
 */
public WPCConfiguration getWPCConfiguration() {
  EnumWikipedia wiki = getWikipedia();
  if (wiki == null) {
    return null;
  }
  return wiki.getConfiguration();
}
/**
 * Tells if the analyzed page belongs to a namespace.
 *
 * @param namespace Namespace.
 * @return true if the page is in the namespace;
 *         false when there is no page or when its namespace is unknown.
 */
public boolean isInNamespace(int namespace) {
  if (page == null) {
    return false;
  }
  if (page.getNamespace() == null) {
    return false;
  }
  return namespace == page.getNamespace().intValue();
}
/**
 * @return Text on which the analysis is performed.
 */
public String getContents() {
  return this.contents;
}
/**
 * Overrides the spelling flag initialized from the configuration.
 *
 * @param check True if spelling should be checked.
 */
public void shouldCheckSpelling(boolean check) {
  checkSpelling = check;
}
/**
 * @return True if spelling should be checked.
 */
public boolean shouldCheckSpelling() {
  return this.checkSpelling;
}
/**
* Perform page analysis.
*
* Runs the five analysis levels in order. Each level returns immediately
* when its results are already available, so calling this method several
* times does not repeat the work.
*/
public void performFullPageAnalysis() {
// Level 1: comments
firstLevelAnalysis();
// Level 2: tags
secondLevelAnalysis();
// Level 3: links, templates, ...
thirdLevelAnalysis();
// Level 4: external links
fourthLevelAnalysis();
// Level 5: ISBN, ISSN, PMID, RFC
fifthLevelAnalysis();
}
// ==========================================================================
// Elements management
// ==========================================================================
/**
 * Management of non wiki text areas (filled progressively by each analysis level).
 */
private final PageElementAreas areas;
/**
 * Runs the analysis up to the fifth level, then gives access to the areas.
 *
 * @return Manager of non wiki text areas.
 */
public PageElementAreas getAreas() {
  fifthLevelAnalysis();
  return this.areas;
}
/**
 * Collects the requested kinds of elements in a single sorted list.
 * Only the requested kinds trigger the corresponding analysis.
 *
 * @param withCategories True to include categories.
 * @param withComments True to include comments.
 * @param withExternalLinks True to include external links.
 * @param withFunctions True to include functions.
 * @param withImages True to include images.
 * @param withInternalLinks True to include internal links.
 * @param withInterwikiLinks True to include interwiki links.
 * @param withLanguageLinks True to include language links.
 * @param withMagicWords True to include magic words.
 * @param withParameters True to include parameters.
 * @param withTags True to include tags.
 * @param withTemplates True to include templates.
 * @param withTitles True to include titles.
 * @return New list of the requested elements, sorted with a PageElementComparator.
 */
public List<PageElement> getElements(
    boolean withCategories, boolean withComments,
    boolean withExternalLinks, boolean withFunctions,
    boolean withImages, boolean withInternalLinks,
    boolean withInterwikiLinks, boolean withLanguageLinks,
    boolean withMagicWords, boolean withParameters,
    boolean withTags, boolean withTemplates, boolean withTitles) {
  List<PageElement> result = new ArrayList<PageElement>();
  if (withCategories) { result.addAll(getCategories()); }
  if (withComments) { result.addAll(getComments()); }
  if (withExternalLinks) { result.addAll(getExternalLinks()); }
  if (withFunctions) { result.addAll(getFunctions()); }
  if (withImages) { result.addAll(getImages()); }
  if (withInternalLinks) { result.addAll(getInternalLinks()); }
  if (withInterwikiLinks) { result.addAll(getInterwikiLinks()); }
  if (withLanguageLinks) { result.addAll(getLanguageLinks()); }
  if (withMagicWords) { result.addAll(getMagicWords()); }
  if (withParameters) { result.addAll(getParameters()); }
  if (withTags) { result.addAll(getTags()); }
  if (withTemplates) { result.addAll(getTemplates()); }
  if (withTitles) { result.addAll(getTitles()); }

  // Elements were added kind by kind: put them back in comparator order
  PageElementComparator comparator = new PageElementComparator();
  Collections.sort(result, comparator);
  return result;
}
/**
 * Finds the element containing an index.
 *
 * A comment containing the index is returned immediately. Otherwise, among
 * the other kinds of elements containing the index (internal link, template,
 * image, category, interwiki link, language link, external link, tag,
 * parameter, function, ISBN), the one beginning the latest is returned.
 *
 * @param currentIndex Index.
 * @return Element at the specified index, or null if there is none.
 */
public PageElement isInElement(int currentIndex) {
  // Check if in comment
  PageElement comment = isInComment(currentIndex);
  if (comment != null) {
    return comment;
  }

  // Check every other kind of element, keeping the one that begins the latest
  PageElement innermost = isInInternalLink(currentIndex);
  innermost = keepInnermostElement(innermost, isInTemplate(currentIndex));
  innermost = keepInnermostElement(innermost, isInImage(currentIndex));
  innermost = keepInnermostElement(innermost, isInCategory(currentIndex));
  innermost = keepInnermostElement(innermost, isInInterwikiLink(currentIndex));
  innermost = keepInnermostElement(innermost, isInLanguageLink(currentIndex));
  innermost = keepInnermostElement(innermost, isInExternalLink(currentIndex));
  innermost = keepInnermostElement(innermost, isInTag(currentIndex));
  innermost = keepInnermostElement(innermost, isInParameter(currentIndex));
  innermost = keepInnermostElement(innermost, isInFunction(currentIndex));
  innermost = keepInnermostElement(innermost, isInISBN(currentIndex));
  return innermost;
}

/**
 * Chooses between two elements the one that begins the latest.
 *
 * @param current Element kept so far (may be null).
 * @param candidate Other element (may be null).
 * @return candidate if it exists and begins strictly after current (or if current is null),
 *         otherwise current.
 */
private static PageElement keepInnermostElement(PageElement current, PageElement candidate) {
  if (candidate == null) {
    return current;
  }
  if ((current == null) || (current.getBeginIndex() < candidate.getBeginIndex())) {
    return candidate;
  }
  return current;
}
// ==========================================================================
// Content analysis
// ==========================================================================
// One lock per analysis level. Each level takes its own lock and, while holding it,
// calls the previous level, so the locks are always acquired from the highest level
// down to the first one.
/** Internal lock for first level analysis. */
private final Object firstLevelLock = new Object();
/** Internal lock for second level analysis. */
private final Object secondLevelLock = new Object();
/** Internal lock for third level analysis. */
private final Object thirdLevelLock = new Object();
/** Internal lock for fourth level analysis. */
private final Object fourthLevelLock = new Object();
/** Internal lock for fifth level analysis. */
private final Object fifthLevelLock = new Object();
/**
 * Perform a first level analysis of the page (comments).
 *
 * Does nothing when the comments have already been computed.
 */
private void firstLevelAnalysis() {
  synchronized (firstLevelLock) {
    if (comments != null) {
      return;
    }
    Performance perf = traceTime ?
        new Performance("PageAnalysis.firstLevelAnalysis", TRACE_THRESHOLD) : null;

    // Initialize
    comments = new ArrayList<PageElementComment>();

    // Jump from one comment opening to the next through all the text
    int maxIndex = (contents != null) ? contents.length() : 0;
    int searchFrom = 0;
    while (searchFrom < maxIndex) {
      int openIndex = contents.indexOf("<!--", searchFrom);
      if (openIndex < 0) {
        // No more comment opening
        break;
      }
      PageElementComment comment = PageElementComment.analyzeBlock(
          getWikipedia(), contents, openIndex);
      if (comment == null) {
        // Not a comment: resume just after the opening
        searchFrom = openIndex + 1;
      } else {
        comments.add(comment);
        searchFrom = comment.getEndIndex();
      }
    }

    // Update areas of non wiki text
    areas.addComments(comments);

    if (perf != null) {
      perf.printEnd();
    }
  }
}
/**
* Perform a second level analysis of the page (tags).
*
* Does nothing when the tags have already been computed.
* Runs the first level analysis beforehand, so that the areas already
* registered (comments) can be skipped.
*/
private void secondLevelAnalysis() {
synchronized (secondLevelLock) {
if (tags != null) {
return;
}
firstLevelAnalysis();
Performance perf = null;
if (traceTime) {
perf = new Performance("PageAnalysis.secondLevelAnalysis", TRACE_THRESHOLD);
}
// Initialize
tags = new ArrayList<PageElementTag>();
// Go through all the text of the page
int maxIndex = (contents != null) ? contents.length() : 0;
int currentIndex = 0;
while (currentIndex < maxIndex) {
// Jump to the next possible tag opening
currentIndex = contents.indexOf('<', currentIndex);
if (currentIndex < 0) {
currentIndex = maxIndex;
} else {
// When the areas report an end after the current index, jump to it
// (presumably the index is inside an already known area -- TODO confirm getEndArea semantics)
int nextIndex = areas.getEndArea(currentIndex);
if (nextIndex > currentIndex) {
currentIndex = nextIndex;
} else {
PageElementTag tag = PageElementTag.analyzeBlock(contents, currentIndex);
if (tag != null) {
// For a closing tag, look backward for the opening tag with the same name
if (tag.isEndTag() && !tag.isFullTag()) {
boolean found = false;
int i = tags.size();
// Number of closing tags with the same name still waiting for their opening tag
int level = 0;
while ((i > 0) && !found) {
i--;
PageElementTag tmpTag = tags.get(i);
if (tag.getNormalizedName().equals(tmpTag.getNormalizedName())) {
// Full tags are ignored when pairing opening and closing tags
if (!tmpTag.isFullTag()) {
if (tmpTag.isEndTag()) {
level++;
} else {
level--;
// A negative level means this opening tag is not consumed by a closing tag in between
if (level < 0) {
found = true;
tmpTag.setMatchingTag(tag);
}
}
}
}
}
}
tags.add(tag);
currentIndex = tag.getEndIndex();
} else {
currentIndex++;
}
}
}
}
// Update areas of non wiki text
areas.addTags(tags);
if (perf != null) {
perf.printEnd();
}
}
}
/**
* Perform a third level analysis of the page (links, templates, ...).
*
* Fills the lists of internal links, images, categories, interwiki links,
* language links, functions, magic words, templates, parameters and titles.
* Does nothing when the internal links have already been computed.
* Runs the second level analysis beforehand.
*/
private void thirdLevelAnalysis() {
synchronized (thirdLevelLock) {
if (internalLinks != null) {
return;
}
secondLevelAnalysis();
Performance perf = null;
if (traceTime) {
perf = new Performance("PageAnalysis.thirdLevelAnalysis", TRACE_THRESHOLD);
perf.startPart();
}
internalLinks = new ArrayList<PageElementInternalLink>();
images = new ArrayList<PageElementImage>();
categories = new ArrayList<PageElementCategory>();
interwikiLinks = new ArrayList<PageElementInterwikiLink>();
languageLinks = new ArrayList<PageElementLanguageLink>();
functions = new ArrayList<PageElementFunction>();
magicWords = new ArrayList<PageElementMagicWord>();
templates = new ArrayList<PageElementTemplate>();
parameters = new ArrayList<PageElementParameter>();
titles = new ArrayList<PageElementTitle>();
if (perf != null) {
perf.stopPart("new");
}
// Go through all the text of the page
int maxIndex = (contents != null) ? contents.length() : 0;
int currentIndex = 0;
// Position in the list of areas: it only moves forward, together with currentIndex
int areaIndex = 0;
List<PageElementAreas.Area> tmpAeras = areas.getAreas();
while (currentIndex < maxIndex) {
// Checking if the current index is in wiki text area.
boolean areaFound = false;
int nextIndex = currentIndex;
while ((areaIndex < tmpAeras.size()) && !areaFound) {
PageElementAreas.Area area = tmpAeras.get(areaIndex);
if (area.beginIndex > currentIndex) {
// Next area begins later: the current index is outside of the areas
areaFound = true;
} else if (area.endIndex > currentIndex) {
// Current index is inside this area: skip to its end
areaFound = true;
nextIndex = area.endIndex;
} else {
// Area is entirely before the current index: try the next one
areaIndex++;
}
}
if (perf != null) {
perf.stopPart("nextIndex");
}
if (nextIndex > currentIndex) {
currentIndex = nextIndex;
} else {
// Dispatch on the characters found at the current index;
// "{{{" has to be tested before "{{" since it shares the same prefix
if (contents.startsWith("[[", currentIndex)) {
currentIndex = analyze2SquareBrackets(currentIndex);
if (perf != null) {
perf.stopPart("analyze2SquareBrackets");
}
} else if (contents.startsWith("{{{", currentIndex)) {
currentIndex = analyze3CurlyBrackets(currentIndex);
if (perf != null) {
perf.stopPart("analyze3CurlyBrackets");
}
} else if (contents.startsWith("{{", currentIndex)) {
currentIndex = analyze2CurlyBrackets(currentIndex);
if (perf != null) {
perf.stopPart("analyze2CurlyBrackets");
}
} else if (contents.startsWith("=", currentIndex)) {
currentIndex = analyze1Equal(currentIndex);
if (perf != null) {
perf.stopPart("analyze1Equal");
}
} else if (contents.startsWith("__", currentIndex)) {
currentIndex = analyze2Undescore(currentIndex);
if (perf != null) {
perf.stopPart("analyze2UnderscoreBrackets");
}
} else {
currentIndex++;
}
}
}
// Update areas of non wiki text
areas.addInternalLinks(internalLinks);
areas.addImages(images);
areas.addCategories(categories);
areas.addInterwikiLinks(interwikiLinks);
areas.addLanguageLinks(languageLinks);
areas.addTemplates(templates);
areas.addFunctions(functions);
areas.addMagicWords(magicWords);
areas.addParameters(parameters);
areas.addTitles(titles);
if (perf != null) {
perf.stopPart("addAreas");
perf.printEnd();
}
}
}
/**
* Perform a fourth level analysis of the page (external links).
*
* Does nothing when the external links have already been computed.
* Runs the third level analysis beforehand, and skips the areas
* registered by the previous levels.
*/
private void fourthLevelAnalysis() {
synchronized (fourthLevelLock) {
if (externalLinks != null) {
return;
}
thirdLevelAnalysis();
Performance perf = null;
if (traceTime) {
perf = new Performance("PageAnalysis.fourthLevelAnalysis", TRACE_THRESHOLD);
}
// Go through all the text of the page
externalLinks = new ArrayList<PageElementExternalLink>();
int maxIndex = (contents != null) ? contents.length() : 0;
int currentIndex = 0;
// Position in the list of areas: it only moves forward, together with currentIndex
int areaIndex = 0;
List<PageElementAreas.Area> tmpAeras = areas.getAreas();
while (currentIndex < maxIndex) {
// Checking if the current index is in wiki text area.
boolean areaFound = false;
int nextIndex = currentIndex;
while ((areaIndex < tmpAeras.size()) && !areaFound) {
PageElementAreas.Area area = tmpAeras.get(areaIndex);
if (area.beginIndex > currentIndex) {
// Next area begins later: the current index is outside of the areas
areaFound = true;
} else if (area.endIndex > currentIndex) {
// Current index is inside this area: skip to its end
areaFound = true;
nextIndex = area.endIndex;
} else {
// Area is entirely before the current index: try the next one
areaIndex++;
}
}
if (nextIndex > currentIndex) {
currentIndex = nextIndex;
} else {
// External links are searched both after "[" and directly in the text
if (contents.startsWith("[", currentIndex)) {
currentIndex = analyze1SquareBracket(currentIndex);
if (perf != null) {
perf.stopPart("analyze1SquareBracket");
}
} else {
currentIndex = analyzeText(currentIndex);
if (perf != null) {
perf.stopPart("analyzeText");
}
}
}
}
areas.addExternalLinks(externalLinks);
if (perf != null) {
perf.printEnd();
}
}
}
/**
 * Perform a fifth level analysis of the page (ISBN, ISSN, PMID, RFC).
 *
 * Does nothing when ISBN, ISSN or PMID have already been computed.
 * Runs the fourth level analysis beforehand.
 */
private void fifthLevelAnalysis() {
  synchronized (fifthLevelLock) {
    boolean alreadyDone = (isbns != null) || (issns != null) || (pmids != null);
    if (alreadyDone) {
      return;
    }
    fourthLevelAnalysis();
    Performance perf = traceTime ?
        new Performance("PageAnalysis.fifthLevelAnalysis", TRACE_THRESHOLD) : null;

    // Each kind of identifier is registered in the areas before analyzing the next kind
    isbns = PageElementISBN.analyzePage(this);
    areas.addISBN(isbns);

    issns = PageElementISSN.analyzePage(this);
    areas.addISSN(issns);

    pmids = PageElementPMID.analyzePage(this);
    areas.addPMID(pmids);

    rfcs = PageElementRFC.analyzePage(this);
    areas.addRFC(rfcs);

    if (perf != null) {
      perf.printEnd();
    }
  }
}
/**
 * Part of the third level of analysis when text is beginning with "[[".
 *
 * Tries in turn: internal link, image, category, interwiki link, language link.
 * The first successful analysis stores the element and decides the next index.
 *
 * @param currentIndex Current index in the text.
 * @return Next index.
 */
private int analyze2SquareBrackets(int currentIndex) {
  EnumWikipedia wiki = getWikipedia();

  // Internal link: when it has a text, analysis resumes inside the link
  PageElementInternalLink internalLink = PageElementInternalLink.analyzeBlock(
      wiki, contents, currentIndex);
  if (internalLink != null) {
    internalLinks.add(internalLink);
    return (internalLink.getText() == null) ?
        internalLink.getEndIndex() :
        internalLink.getBeginIndex() + Math.max(2, internalLink.getTextOffset());
  }

  // Image: analysis resumes after "[[", the namespace and one more character
  PageElementImage img = PageElementImage.analyzeBlock(
      getWikipedia(), contents, currentIndex);
  if (img != null) {
    images.add(img);
    return img.getBeginIndex() + 2 + img.getNamespace().length() + 1;
  }

  // Category: skipped entirely
  PageElementCategory cat = PageElementCategory.analyzeBlock(
      getWikipedia(), contents, currentIndex);
  if (cat != null) {
    categories.add(cat);
    return cat.getEndIndex();
  }

  // Interwiki link: when it has a text, analysis resumes inside the link
  PageElementInterwikiLink iw = PageElementInterwikiLink.analyzeBlock(
      getWikipedia(), contents, currentIndex);
  if (iw != null) {
    interwikiLinks.add(iw);
    return (iw.getText() == null) ?
        iw.getEndIndex() :
        iw.getBeginIndex() + Math.max(2, iw.getTextOffset());
  }

  // Language link: skipped entirely
  PageElementLanguageLink lang = PageElementLanguageLink.analyzeBlock(
      getWikipedia(), contents, currentIndex);
  if (lang != null) {
    languageLinks.add(lang);
    return lang.getEndIndex();
  }

  // Nothing recognized
  return currentIndex + 1;
}
/**
 * Part of the fourth level of analysis when text is beginning with "[".
 *
 * @param currentIndex Current index in the text.
 * @return Next index.
 */
private int analyze1SquareBracket(int currentIndex) {
  // Check if this an external link
  PageElementExternalLink extLink = PageElementExternalLink.analyzeBlock(
      getWikipedia(), contents, currentIndex, this);
  if (extLink == null) {
    return currentIndex + 1;
  }
  externalLinks.add(extLink);

  // When the link has a text, analysis resumes inside the link
  return (extLink.getText() == null) ?
      extLink.getEndIndex() :
      extLink.getBeginIndex() + Math.max(2, extLink.getTextOffset());
}
/**
 * Part of the third level of analysis when text is beginning with "__".
 *
 * @param currentIndex Current index in the text.
 * @return Next index: end of the magic word if one is found, next character otherwise.
 */
private int analyze2Undescore(int currentIndex) {
  // Check if this a magic word
  PageElementMagicWord found = PageElementMagicWord.analyzeBlock(
      getWikipedia(), contents, currentIndex);
  if (found == null) {
    return currentIndex + 1;
  }
  magicWords.add(found);
  return found.getEndIndex();
}
/**
 * Part of the fourth level of analysis for regular text.
 *
 * @param currentIndex Current index in the text.
 * @return Next index: end of the external link if one is found, next character otherwise.
 */
private int analyzeText(int currentIndex) {
  // Nothing to search while still inside the last external link found
  if (!externalLinks.isEmpty()) {
    PageElementExternalLink last = externalLinks.get(externalLinks.size() - 1);
    if (last.getEndIndex() > currentIndex) {
      return currentIndex + 1;
    }
  }

  // Check if this is an external link
  PageElementExternalLink extLink = PageElementExternalLink.analyzeBlock(
      getWikipedia(), contents, currentIndex, this);
  if (extLink == null) {
    return currentIndex + 1;
  }
  externalLinks.add(extLink);
  return extLink.getEndIndex();
}
/**
 * Part of the third level of analysis when text is beginning with "{{{".
 *
 * @param currentIndex Current index in the text.
 * @return Next index: after the "{{{" if a parameter is found, next character otherwise.
 */
private int analyze3CurlyBrackets(int currentIndex) {
  // Check if this is a parameter
  PageElementParameter found = PageElementParameter.analyzeBlock(
      getWikipedia(), contents, currentIndex, comments, tags);
  if (found == null) {
    return currentIndex + 1;
  }
  parameters.add(found);
  return currentIndex + 3;
}
/**
 * Part of the third level of analysis when text is beginning with "{{".
 *
 * A function is searched first, then a template. An element without
 * parameters is skipped entirely; otherwise analysis resumes after the "{{".
 *
 * @param currentIndex Current index in the text.
 * @return Next index.
 */
private int analyze2CurlyBrackets(int currentIndex) {
  // Check if this is a function
  PageElementFunction fct = PageElementFunction.analyzeBlock(
      getWikipedia(), contents, currentIndex, comments, tags);
  if (fct != null) {
    functions.add(fct);
    return (fct.getParameterCount() == 0) ? fct.getEndIndex() : currentIndex + 2;
  }

  // Check if this is a template
  PageElementTemplate tpl = PageElementTemplate.analyzeBlock(
      getWikipedia(), contents, currentIndex, comments, tags);
  if (tpl != null) {
    templates.add(tpl);
    return (tpl.getParameterCount() == 0) ? tpl.getEndIndex() : currentIndex + 2;
  }

  return currentIndex + 1;
}
/**
* Part of the third level of analysis when text is beginning with "=".
*
* A title is only searched when the "=" is at the beginning of a line
* (possibly after comments) and is not the "=" between the name and the
* value of a template parameter.
*
* @param currentIndex Current index in the text.
* @return Next index.
*/
private int analyze1Equal(int currentIndex) {
// Check that it's a beginning of a line
boolean hasNewLine = false;
int tmpIndex = currentIndex;
// Walk backward from the "=": only comments may stand between it and the line start
while ((tmpIndex >= 0) && !hasNewLine) {
tmpIndex--;
if (tmpIndex < 0) {
// Beginning of the text counts as a beginning of line
hasNewLine = true;
} else if (contents.charAt(tmpIndex) == '\n') {
hasNewLine = true;
} else if (contents.charAt(tmpIndex) == '>') {
// A ">" is only accepted as the last character of a comment
PageElementComment comment = null;
for (PageElementComment tmpComment : comments) {
if (tmpComment.getEndIndex() == tmpIndex + 1) {
comment = tmpComment;
}
}
if (comment == null) {
return currentIndex + 1;
}
// Continue the backward walk before the comment
tmpIndex = comment.getBeginIndex();
} else {
// Any other character before the "=": not a beginning of line
return currentIndex + 1;
}
}
// Check that it's not a template value
if (templates != null) {
// Keep the last template found so far that contains the current index
PageElementTemplate template = null;
for (PageElementTemplate tmp : templates) {
if ((tmp.getBeginIndex() <= currentIndex) &&
(tmp.getEndIndex() > currentIndex)) {
template = tmp;
}
}
if (template != null) {
for (int i = 0; i < template.getParameterCount(); i++) {
// An "=" between the pipe and the start of the value belongs to the parameter
int beginParam = template.getParameterPipeIndex(i);
int endParam = template.getParameterValueStartIndex(i);
if ((currentIndex >= beginParam) && (currentIndex < endParam)) {
return currentIndex + 1;
}
}
}
}
// Check if this is a title
PageElementTitle title = PageElementTitle.analyzeBlock(
getWikipedia(), contents, currentIndex, comments, tags);
if (title != null) {
titles.add(title);
// Resume after the beginning of the title plus its first level
// (presumably the number of leading "=" -- TODO confirm getFirstLevel semantics)
return title.getBeginIndex() + title.getFirstLevel();
}
return currentIndex + 1;
}
// ==========================================================================
// Comments management
// ==========================================================================
/**
 * All comments in the page (null until the first level analysis has run).
 */
private List<PageElementComment> comments;
/**
 * Runs the first level analysis if needed.
 *
 * @return All comments in the page.
 */
public List<PageElementComment> getComments() {
  firstLevelAnalysis();
  return this.comments;
}
/**
 * @param currentIndex Current index.
 * @return First comment containing the current index, or null if there is none.
 */
public PageElementComment isInComment(int currentIndex) {
  for (PageElementComment candidate : getComments()) {
    boolean beginsBeforeOrAt = candidate.getBeginIndex() <= currentIndex;
    if (beginsBeforeOrAt && (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
// ==========================================================================
// Titles management
// ==========================================================================
/**
 * All titles in the page (null until the third level analysis has run).
 */
private List<PageElementTitle> titles;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All titles in the page.
 */
public List<PageElementTitle> getTitles() {
  thirdLevelAnalysis();
  return this.titles;
}
/**
 * @param currentIndex Current index.
 * @return First title in the list beginning at or after the current index, or null if there is none.
 */
public PageElementTitle getNextTitle(int currentIndex) {
  for (PageElementTitle candidate : getTitles()) {
    if (currentIndex <= candidate.getBeginIndex()) {
      return candidate;
    }
  }
  return null;
}
/**
 * @param currentIndex Current index.
 * @return First title containing the current index, or null if there is none.
 */
public PageElementTitle isInTitle(int currentIndex) {
  for (PageElementTitle candidate : getTitles()) {
    boolean beginsBeforeOrAt = candidate.getBeginIndex() <= currentIndex;
    if (beginsBeforeOrAt && (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
/**
 * Titles are considered unreliable as soon as one of them is not coherent
 * or begins inside a template.
 *
 * @return True if titles seem to be reliable.
 */
public boolean areTitlesReliable() {
  for (PageElementTitle title : getTitles()) {
    boolean suspicious =
        !title.isCoherent() ||
        (isInTemplate(title.getBeginIndex()) != null);
    if (suspicious) {
      return false;
    }
  }
  return true;
}
// ==========================================================================
// Internal links management
// ==========================================================================
/**
 * All internal links in the page (null until the third level analysis has run).
 */
private List<PageElementInternalLink> internalLinks;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All internal links in the page.
 */
public List<PageElementInternalLink> getInternalLinks() {
  thirdLevelAnalysis();
  return this.internalLinks;
}
/**
 * @param currentIndex Current index.
 * @return First internal link in the list beginning at or after the current index, or null if there is none.
 */
public PageElementInternalLink getNextInternalLink(int currentIndex) {
  for (PageElementInternalLink candidate : getInternalLinks()) {
    if (currentIndex <= candidate.getBeginIndex()) {
      return candidate;
    }
  }
  return null;
}
/**
 * @param currentIndex Current index.
 * @return First internal link containing the current index, or null if there is none.
 */
public PageElementInternalLink isInInternalLink(int currentIndex) {
  for (PageElementInternalLink candidate : getInternalLinks()) {
    boolean beginsBeforeOrAt = candidate.getBeginIndex() <= currentIndex;
    if (beginsBeforeOrAt && (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
/**
* Links count, indexed by page title. Used as a cache by getLinkCount() and countLinks().
* NOTE(review): plain HashMap accessed without any lock, unlike the analysis levels -- confirm single-threaded use.
*/
private Map<String, InternalLinkCount> linksCount = new HashMap<String, InternalLinkCount>();
/**
* Gives the count of links to a page, computing it on first request.
*
* @param link Link (must not be null: its title is used as the key).
* @return Number of links to the page
*         (null if the counter stored nothing for this title -- presumably it always does, TODO confirm).
*/
public InternalLinkCount getLinkCount(Page link) {
// Already counted?
InternalLinkCount result = linksCount.get(link.getTitle());
if (result != null) {
return result;
}
// Count the links to this single page; the counter is given the map to fill
List<Page> links = Collections.singletonList(link);
InternalLinkCounter counter = new InternalLinkCounter(linksCount, links);
PageAnalysisUtils.findInternalLinks(this, links, counter);
return linksCount.get(link.getTitle());
}
/**
 * Count number of links in the page.
 *
 * Only the links whose title has no count yet are searched.
 *
 * @param links Links (nothing is done when null or empty).
 */
public void countLinks(List<Page> links) {
  if ((links == null) || links.isEmpty()) {
    return;
  }

  // Keep only the links that have not been counted yet
  List<Page> notCounted = new ArrayList<Page>();
  for (Page candidate : links) {
    if (linksCount.containsKey(candidate.getTitle())) {
      continue;
    }
    notCounted.add(candidate);
  }
  if (notCounted.isEmpty()) {
    return;
  }

  InternalLinkCounter counter = new InternalLinkCounter(linksCount, notCounted);
  PageAnalysisUtils.findInternalLinks(this, notCounted, counter);
}
// ==========================================================================
// Images management
// ==========================================================================
/**
 * All images in the page (null until the third level analysis has run).
 */
private List<PageElementImage> images;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All images in the page.
 */
public List<PageElementImage> getImages() {
  thirdLevelAnalysis();
  return this.images;
}
/**
 * @param currentIndex Current index.
 * @return First image in the list beginning at or after the current index, or null if there is none.
 */
public PageElementImage getNextImage(int currentIndex) {
  for (PageElementImage candidate : getImages()) {
    if (currentIndex <= candidate.getBeginIndex()) {
      return candidate;
    }
  }
  return null;
}
/**
 * @param currentIndex Current index.
 * @return Image containing the current index, or null if there is none.
 *         When several images contain the index, the one beginning the latest is returned.
 */
public PageElementImage isInImage(int currentIndex) {
  PageElementImage innermost = null;
  for (PageElementImage candidate : getImages()) {
    // Ignore images that do not contain the index
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    // Keep the image beginning the latest
    if ((innermost == null) ||
        (innermost.getBeginIndex() < candidate.getBeginIndex())) {
      innermost = candidate;
    }
  }
  return innermost;
}
// ==========================================================================
// External links management
// ==========================================================================
/**
 * All external links in the page (null until the fourth level analysis has run).
 */
private List<PageElementExternalLink> externalLinks;
/**
 * Runs the analysis up to the fourth level if needed.
 *
 * @return All external links in the page.
 */
public List<PageElementExternalLink> getExternalLinks() {
  fourthLevelAnalysis();
  return this.externalLinks;
}
/**
 * @param currentIndex Current index.
 * @return First external link in the list beginning at or after the current index, or null if there is none.
 */
public PageElementExternalLink getNextExternalLink(int currentIndex) {
  for (PageElementExternalLink candidate : getExternalLinks()) {
    if (currentIndex <= candidate.getBeginIndex()) {
      return candidate;
    }
  }
  return null;
}
/**
 * @param currentIndex Current index.
 * @return First external link containing the current index, or null if there is none.
 */
public PageElementExternalLink isInExternalLink(int currentIndex) {
  for (PageElementExternalLink candidate : getExternalLinks()) {
    boolean beginsBeforeOrAt = candidate.getBeginIndex() <= currentIndex;
    if (beginsBeforeOrAt && (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
// ==========================================================================
// Templates management
// ==========================================================================
/**
 * All templates in the page (null until the third level analysis has run).
 */
private List<PageElementTemplate> templates;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All templates in the page.
 */
public List<PageElementTemplate> getTemplates() {
  thirdLevelAnalysis();
  return this.templates;
}
/**
 * Filters the templates of the page by name.
 *
 * @param name Template name.
 * @return All templates with this name in the page analysis (new list, possibly empty);
 *         null when the name is null.
 */
public List<PageElementTemplate> getTemplates(String name) {
  if (name == null) {
    return null;
  }
  List<PageElementTemplate> allTemplates = getTemplates();
  List<PageElementTemplate> matching = new ArrayList<PageElementTemplate>();
  if (allTemplates == null) {
    return matching;
  }
  for (PageElementTemplate candidate : allTemplates) {
    // Names are compared as page titles
    if (Page.areSameTitle(name, candidate.getTemplateName())) {
      matching.add(candidate);
    }
  }
  return matching;
}
/**
 * @param currentIndex Current index.
 * @return First template in the list beginning at or after the current index, or null if there is none.
 */
public PageElementTemplate getNextTemplate(int currentIndex) {
  for (PageElementTemplate candidate : getTemplates()) {
    if (currentIndex <= candidate.getBeginIndex()) {
      return candidate;
    }
  }
  return null;
}
/**
 * @param currentIndex Current index.
 * @return Template containing the current index, or null if there is none.
 *         When several templates contain the index, the last one in the list is returned.
 */
public PageElementTemplate isInTemplate(int currentIndex) {
  List<PageElementTemplate> allTemplates = getTemplates();
  // Scanning from the end: the first match is the last one in list order
  for (int i = allTemplates.size() - 1; i >= 0; i--) {
    PageElementTemplate candidate = allTemplates.get(i);
    if ((candidate.getBeginIndex() <= currentIndex) &&
        (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
// ==========================================================================
// Parameters management
// ==========================================================================
/**
 * All parameters in the page (null until the third level analysis has run).
 */
private List<PageElementParameter> parameters;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All parameters in the page.
 */
public List<PageElementParameter> getParameters() {
  thirdLevelAnalysis();
  return this.parameters;
}
/**
 * @param currentIndex Current index.
 * @return Parameter containing the current index, or null if there is none.
 *         When several parameters contain the index, the last one in the list is returned.
 */
public PageElementParameter isInParameter(int currentIndex) {
  List<PageElementParameter> allParameters = getParameters();
  // Scanning from the end: the first match is the last one in list order
  for (int i = allParameters.size() - 1; i >= 0; i--) {
    PageElementParameter candidate = allParameters.get(i);
    if ((candidate.getBeginIndex() <= currentIndex) &&
        (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
// ==========================================================================
// Functions management
// ==========================================================================
/**
 * All functions in the page (null until the third level analysis has run).
 */
private List<PageElementFunction> functions;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All functions in the page.
 */
public List<PageElementFunction> getFunctions() {
  thirdLevelAnalysis();
  return this.functions;
}
/**
 * @param currentIndex Current index.
 * @return Function containing the current index, or null if there is none.
 *         When several functions contain the index, the last one in the list is returned.
 */
public PageElementFunction isInFunction(int currentIndex) {
  List<PageElementFunction> allFunctions = getFunctions();
  // Scanning from the end: the first match is the last one in list order
  for (int i = allFunctions.size() - 1; i >= 0; i--) {
    PageElementFunction candidate = allFunctions.get(i);
    if ((candidate.getBeginIndex() <= currentIndex) &&
        (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
// ==========================================================================
// Magic words management
// ==========================================================================
/**
 * All magic words in the page (null until the third level analysis has run).
 */
private List<PageElementMagicWord> magicWords;
/**
 * Runs the analysis up to the third level if needed.
 *
 * @return All magic words in the page.
 */
public List<PageElementMagicWord> getMagicWords() {
  thirdLevelAnalysis();
  return this.magicWords;
}
/**
 * @param currentIndex Current index.
 * @return Magic word containing the current index, or null if there is none.
 *         When several magic words contain the index, the last one in the list is returned.
 */
public PageElementMagicWord isInMagicWord(int currentIndex) {
  List<PageElementMagicWord> allMagicWords = getMagicWords();
  // Scanning from the end: the first match is the last one in list order
  for (int i = allMagicWords.size() - 1; i >= 0; i--) {
    PageElementMagicWord candidate = allMagicWords.get(i);
    if ((candidate.getBeginIndex() <= currentIndex) &&
        (currentIndex < candidate.getEndIndex())) {
      return candidate;
    }
  }
  return null;
}
// ==========================================================================
// Tags management
// ==========================================================================
/**
 * All tags in the page.
 * Not assigned in this part of the file; read through {@link #getTags()}.
 */
private List<PageElementTag> tags;
/**
 * Lock for updating the tags categorized by name.
 * Both {@link #getTags(String)} and {@link #getCompleteTags(String)}
 * synchronize on this object while reading and filling their map.
 */
private final Object lockTagsByName = new Object();
/**
 * All tags in the page categorized by name.
 * Created on first use by {@link #getTags(String)}; keys are the lower-cased
 * names that were requested, and each entry is filled on its first request.
 */
private Map<String, List<PageElementTag>> tagsByName;
/**
 * All complete tags in the page categorized by name.
 * Complete tags are either full tags or opening tags associated with a closing tag.
 * Created on first use by {@link #getCompleteTags(String)}; keys are the
 * lower-cased names that were requested.
 */
private Map<String, List<PageElementTag>> completeTagsByName;
/**
 * Gives access to the tags of the page.
 * <p>
 * <code>secondLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All tags in the page.
 */
public List<PageElementTag> getTags() {
secondLevelAnalysis();
return tags;
}
/**
 * Retrieves the tags having a given name, using a per-name cache.
 * <p>
 * The name is lower-cased and compared with the normalized name of each tag.
 * The list built for a name is memorized, so later calls with the same name
 * return the very same list instance.
 *
 * @param name Tag name.
 * @return All tags with this name in the page (possibly empty),
 *         or <code>null</code> if <code>name</code> is <code>null</code>.
 */
public List<PageElementTag> getTags(String name) {
  if (name == null) {
    return null;
  }
  // NOTE(review): toLowerCase() relies on the default locale; confirm this
  // matches the way PageElementTag.getNormalizedName() is computed.
  final String key = name.toLowerCase();
  synchronized (lockTagsByName) {
    if (tagsByName == null) {
      tagsByName = new HashMap<String, List<PageElementTag>>();
    }

    // Cache hit: nothing to compute
    List<PageElementTag> cached = tagsByName.get(key);
    if (cached != null) {
      return cached;
    }

    // Cache miss: register the list, then fill it from all the tags
    List<PageElementTag> allTags = getTags();
    List<PageElementTag> matching = new ArrayList<PageElementTag>();
    tagsByName.put(key, matching);
    for (PageElementTag candidate : allTags) {
      if (key.equals(candidate.getNormalizedName())) {
        matching.add(candidate);
      }
    }
    return matching;
  }
}
/**
 * Retrieves the complete tags having a given name, using a per-name cache.
 * <p>
 * A tag is kept when it is a full tag, or when it is not an end tag and is
 * reported as complete. The candidates come from {@link #getTags(String)},
 * which is called while the shared lock is already held.
 *
 * @param name Tag name.
 * @return All complete tags with this name in the page (possibly empty),
 *         or <code>null</code> if <code>name</code> is <code>null</code>.
 */
public List<PageElementTag> getCompleteTags(String name) {
  if (name == null) {
    return null;
  }
  final String key = name.toLowerCase();
  synchronized (lockTagsByName) {
    if (completeTagsByName == null) {
      completeTagsByName = new HashMap<String, List<PageElementTag>>();
    }

    // Cache hit: nothing to compute
    List<PageElementTag> cached = completeTagsByName.get(key);
    if (cached != null) {
      return cached;
    }

    // Cache miss: register the list, then filter the tags with this name
    List<PageElementTag> namedTags = getTags(key);
    List<PageElementTag> complete = new ArrayList<PageElementTag>();
    completeTagsByName.put(key, complete);
    for (PageElementTag candidate : namedTags) {
      if (candidate.isFullTag() ||
          (!candidate.isEndTag() && candidate.isComplete())) {
        complete.add(candidate);
      }
    }
    return complete;
  }
}
/**
 * Finds the tag with a given name whose value area surrounds a position.
 * <p>
 * Only complete tags that are not full tags are considered. When several
 * tags qualify, the one with the greatest begin index is returned.
 *
 * @param name Tag name.
 * @param currentIndex Current index.
 * @return Surrounding tag, or <code>null</code> if there is none
 *         (including when <code>name</code> is <code>null</code>).
 */
public PageElementTag getSurroundingTag(String name, int currentIndex) {
  List<PageElementTag> completeTags = getCompleteTags(name);
  if (completeTags == null) {
    return null;
  }
  PageElementTag innermost = null;
  for (PageElementTag candidate : completeTags) {
    if (candidate.isFullTag()) {
      continue;
    }
    if (candidate.getValueBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getValueEndIndex() <= currentIndex) {
      continue;
    }
    // Prefer the tag that starts the latest
    if ((innermost == null) ||
        (candidate.getBeginIndex() > innermost.getBeginIndex())) {
      innermost = candidate;
    }
  }
  return innermost;
}
/**
 * Finds the first tag, in list order, beginning at or after a position.
 *
 * @param currentIndex Current index.
 * @return Next tag, or <code>null</code> if no tag begins at or after
 *         the index.
 */
public PageElementTag getNextTag(int currentIndex) {
  for (PageElementTag candidate : getTags()) {
    if (candidate.getBeginIndex() < currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * Looks for the first tag, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return Tag if the current index is inside a tag
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementTag isInTag(int currentIndex) {
  for (PageElementTag candidate : getTags()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * Looks for the first tag with a given name, in list order, containing a
 * given position.
 *
 * @param currentIndex Current index.
 * @param tagName Tag name.
 * @return Tag if the current index is inside a tag with this name
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise, including when <code>tagName</code>
 *         is <code>null</code>.
 */
public PageElementTag isInTag(int currentIndex, String tagName) {
  List<PageElementTag> tmpTags = getTags(tagName);
  if (tmpTags == null) {
    // getTags(String) returns null for a null name: nothing can match
    return null;
  }
  for (PageElementTag tag : tmpTags) {
    if ((tag.getBeginIndex() <= currentIndex) &&
        (tag.getEndIndex() > currentIndex)) {
      return tag;
    }
  }
  return null;
}
// ==========================================================================
// DEFAULTSORT management
// ==========================================================================
/**
 * Builds the list of functions whose magic word is DEFAULTSORT.
 * <p>
 * A new list is created on every call.
 *
 * @return All DEFAULTSORT in the page (possibly empty),
 *         or <code>null</code> if the list of functions is <code>null</code>.
 */
public List<PageElementFunction> getDefaultSorts() {
  List<PageElementFunction> allFunctions = getFunctions();
  if (allFunctions == null) {
    return null;
  }
  List<PageElementFunction> found = new ArrayList<PageElementFunction>();
  for (PageElementFunction candidate : allFunctions) {
    if (!MagicWord.DEFAULT_SORT.equals(candidate.getMagicWord().getName())) {
      continue;
    }
    found.add(candidate);
  }
  return found;
}
/**
 * Looks for the first DEFAULTSORT, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return DefaultSort if the current index is inside a DEFAULTSORT
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise, including when no list of
 *         DEFAULTSORT is available.
 */
public PageElementFunction isInDefaultSort(int currentIndex) {
  List<PageElementFunction> tmpDefaultSorts = getDefaultSorts();
  if (tmpDefaultSorts == null) {
    // getDefaultSorts() returns null when the list of functions is null
    return null;
  }
  for (PageElementFunction defaultSort : tmpDefaultSorts) {
    if ((defaultSort.getBeginIndex() <= currentIndex) &&
        (defaultSort.getEndIndex() > currentIndex)) {
      return defaultSort;
    }
  }
  return null;
}
// ==========================================================================
// Categories management
// ==========================================================================
/**
 * All categories in the page.
 * Not assigned in this part of the file; read through {@link #getCategories()}.
 */
private List<PageElementCategory> categories;
/**
 * Gives access to the categories of the page.
 * <p>
 * <code>thirdLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All categories in the page.
 */
public List<PageElementCategory> getCategories() {
thirdLevelAnalysis();
return categories;
}
/**
 * Finds the first category, in list order, beginning at or after a position.
 *
 * @param currentIndex Current index.
 * @return Next category, or <code>null</code> if no category begins at or
 *         after the index.
 */
public PageElementCategory getNextCategory(int currentIndex) {
  for (PageElementCategory candidate : getCategories()) {
    if (candidate.getBeginIndex() < currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * Looks for the first category, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return Category if the current index is inside a category
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementCategory isInCategory(int currentIndex) {
  for (PageElementCategory candidate : getCategories()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
// ==========================================================================
// Interwiki links management
// ==========================================================================
/**
 * All interwiki links in the page.
 * Not assigned in this part of the file; read through {@link #getInterwikiLinks()}.
 */
private List<PageElementInterwikiLink> interwikiLinks;
/**
 * Gives access to the interwiki links of the page.
 * <p>
 * <code>thirdLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All interwiki links in the page.
 */
public List<PageElementInterwikiLink> getInterwikiLinks() {
thirdLevelAnalysis();
return interwikiLinks;
}
/**
 * Finds the first interwiki link, in list order, beginning at or after a
 * position.
 *
 * @param currentIndex Current index.
 * @return Next interwiki link, or <code>null</code> if no interwiki link
 *         begins at or after the index.
 */
public PageElementInterwikiLink getNextInterwikiLink(int currentIndex) {
  for (PageElementInterwikiLink candidate : getInterwikiLinks()) {
    if (candidate.getBeginIndex() < currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * Looks for the first interwiki link, in list order, containing a given
 * position.
 *
 * @param currentIndex Current index.
 * @return Interwiki link if the current index is inside an interwiki link
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementInterwikiLink isInInterwikiLink(int currentIndex) {
  for (PageElementInterwikiLink candidate : getInterwikiLinks()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
// ==========================================================================
// Language links management
// ==========================================================================
/**
 * All language links in the page.
 * Not assigned in this part of the file; read through {@link #getLanguageLinks()}.
 */
private List<PageElementLanguageLink> languageLinks;
/**
 * Gives access to the language links of the page.
 * <p>
 * <code>thirdLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All language links in the page.
 */
public List<PageElementLanguageLink> getLanguageLinks() {
thirdLevelAnalysis();
return languageLinks;
}
/**
 * Finds the first language link, in list order, beginning at or after a
 * position.
 *
 * @param currentIndex Current index.
 * @return Next language link, or <code>null</code> if no language link
 *         begins at or after the index.
 */
public PageElementLanguageLink getNextLanguageLink(int currentIndex) {
  for (PageElementLanguageLink candidate : getLanguageLinks()) {
    if (candidate.getBeginIndex() < currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * Looks for the first language link, in list order, containing a given
 * position.
 *
 * @param currentIndex Current index.
 * @return Language link if the current index is inside a language link
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementLanguageLink isInLanguageLink(int currentIndex) {
  for (PageElementLanguageLink candidate : getLanguageLinks()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
// ==========================================================================
// ISBN, ISSN, PMID and RFC management
// ==========================================================================
/**
 * All ISBNs in the page.
 * Not assigned in this part of the file; read through {@link #getISBNs()}.
 */
private List<PageElementISBN> isbns;
/**
 * Gives access to the ISBNs of the page.
 * <p>
 * <code>fifthLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All ISBNs in the page.
 */
public List<PageElementISBN> getISBNs() {
fifthLevelAnalysis();
return isbns;
}
/**
 * Looks for the first ISBN, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return ISBN if the current index is inside an ISBN
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementISBN isInISBN(int currentIndex) {
  for (PageElementISBN candidate : getISBNs()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * All ISSNs in the page.
 * Not assigned in this part of the file; read through {@link #getISSNs()}.
 */
private List<PageElementISSN> issns;
/**
 * Gives access to the ISSNs of the page.
 * <p>
 * <code>fifthLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All ISSNs in the page.
 */
public List<PageElementISSN> getISSNs() {
fifthLevelAnalysis();
return issns;
}
/**
 * Looks for the first ISSN, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return ISSN if the current index is inside an ISSN
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementISSN isInISSN(int currentIndex) {
  for (PageElementISSN candidate : getISSNs()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * All PMIDs in the page.
 * Not assigned in this part of the file; read through {@link #getPMIDs()}.
 */
private List<PageElementPMID> pmids;
/**
 * Gives access to the PMIDs of the page.
 * <p>
 * <code>fifthLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All PMIDs in the page.
 */
public List<PageElementPMID> getPMIDs() {
fifthLevelAnalysis();
return pmids;
}
/**
 * Looks for the first PMID, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return PMID if the current index is inside a PMID
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementPMID isInPMID(int currentIndex) {
  for (PageElementPMID candidate : getPMIDs()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
/**
 * All RFCs in the page.
 * Not assigned in this part of the file; read through {@link #getRFCs()}.
 */
private List<PageElementRFC> rfcs;
/**
 * Gives access to the RFCs of the page.
 * <p>
 * <code>fifthLevelAnalysis()</code> is called before the field is read; the
 * list is presumably filled by that call (not visible in this part of the
 * file). The internal list itself is returned, not a copy.
 *
 * @return All RFCs in the page.
 */
public List<PageElementRFC> getRFCs() {
fifthLevelAnalysis();
return rfcs;
}
/**
 * Looks for the first RFC, in list order, containing a given position.
 *
 * @param currentIndex Current index.
 * @return RFC if the current index is inside a RFC
 *         (begin index inclusive, end index exclusive),
 *         <code>null</code> otherwise.
 */
public PageElementRFC isInRFC(int currentIndex) {
  for (PageElementRFC candidate : getRFCs()) {
    if (candidate.getBeginIndex() > currentIndex) {
      continue;
    }
    if (candidate.getEndIndex() <= currentIndex) {
      continue;
    }
    return candidate;
  }
  return null;
}
// ==========================================================================
// Errors management
// ==========================================================================
/**
 * Bean for holding results about error detection.
 * <p>
 * Instances are immutable: the list given at construction is copied.
 */
public static class Result {

  /**
   * True if errors of this kind have been found.
   */
  private final boolean found;

  /**
   * List of errors found (private copy), or <code>null</code> if none was given.
   */
  private final List<CheckErrorResult> errors;

  /**
   * @param found True if errors of this kind have been found.
   * @param errors List of errors found (copied; may be <code>null</code>).
   */
  Result(boolean found, List<CheckErrorResult> errors) {
    this.found = found;
    if (errors == null) {
      this.errors = null;
    } else {
      this.errors = new ArrayList<CheckErrorResult>(errors);
    }
  }

  /**
   * Appends the memorized errors to a caller-supplied list.
   * Nothing is appended if either list is <code>null</code>.
   *
   * @param results (Out) List of errors found.
   * @return True if errors of this kind have been found.
   */
  public boolean getErrors(List<CheckErrorResult> results) {
    if ((results == null) || (errors == null)) {
      return found;
    }
    results.addAll(errors);
    return found;
  }
}
/**
 * Memorizing Check Wiki errors.
 * Keys are error numbers; the map is created on the first call to
 * {@link #setCheckWikiErrors(int, boolean, List)}.
 */
private Map<Integer, Result> checkWikiErrors;

/**
 * Memorize Check Wiki errors.
 * <p>
 * A previous result memorized for the same error number is replaced.
 *
 * @param errorNumber Error number.
 * @param found True if errors of this kind have been found.
 * @param errors List of errors found.
 */
public void setCheckWikiErrors(int errorNumber, boolean found, List<CheckErrorResult> errors) {
  if (checkWikiErrors == null) {
    checkWikiErrors = new HashMap<Integer, PageAnalysis.Result>();
  }
  Integer key = Integer.valueOf(errorNumber);
  Result memorized = new Result(found, errors);
  checkWikiErrors.put(key, memorized);
}
/**
 * Retrieves the result memorized for an error number.
 *
 * @param errorNumber Error number.
 * @return Errors for this error number, or <code>null</code> if nothing
 *         has been memorized for it.
 */
public Result getCheckWikiErrors(int errorNumber) {
  return (checkWikiErrors != null) ?
      checkWikiErrors.get(Integer.valueOf(errorNumber)) :
      null;
}
}