/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.data;
import java.util.ArrayList;
import java.util.List;
import org.wikipediacleaner.api.constants.EnumWikipedia;
/**
 * Class containing information about a complete external link ([http://... text]).
 * <p>
 * Elements built by {@link #analyzeBlock(EnumWikipedia, String, int, PageAnalysis)}
 * have one of the following shapes:
 * <ul>
 * <li>URL only (no square brackets kept): no text, {@link #hasSquare()} and
 *     {@link #hasSecondSquare()} are both false, the element covers only the URL;</li>
 * <li>complete link <code>[url]</code> or <code>[url text]</code>:
 *     {@link #hasSquare()} and {@link #hasSecondSquare()} are both true;</li>
 * <li>link whose text contains an internal link (<code>[url text [[...]] ...]</code>):
 *     the element stops just before the first <code>[[</code>,
 *     {@link #hasSquare()} is true and {@link #hasSecondSquare()} is false.</li>
 * </ul>
 */
public class PageElementExternalLink extends PageElement {

  /** Link exactly as it was extracted from the contents. */
  private final String linkNotTrimmed;

  /** Trimmed link; a leading "//" is completed with "http:". */
  private final String link;

  /** Text exactly as it was extracted, or null if there is no text or it is blank. */
  private final String textNotTrimmed;

  /** Trimmed text, or null if there is no text or it is blank. */
  private final String text;

  /** Offset of the text relative to the analyzed index, or -1 if there is no text. */
  private final int textOffset;

  /** True if the element begins with an opening square bracket. */
  private final boolean hasSquare;

  /** True if the element ends with its closing square bracket. */
  private final boolean hasSecondSquare;

  /** Characters ending the URL that are skipped before the beginning of the text. */
  private static final String SEPARATORS_EXCLUDED = " \t\"";

  /** Characters ending the URL that are kept as part of the text. */
  private static final String SEPARATORS_INCLUDED = "<>|";

  /** Characters that can't be part of an external link with square brackets. */
  private static final String UNACCEPTABLE = "\n";

  /** All characters ending the URL. */
  private static final String ALL_SEPARATORS =
      SEPARATORS_EXCLUDED + SEPARATORS_INCLUDED + UNACCEPTABLE + "[]";

  /** Extra characters ending a URL without square brackets located inside a template. */
  private static final String IN_TEMPLATES_SEPARATORS = "|}";

  /** Protocols (in lower case) recognized at the beginning of an external link. */
  private static final List<String> privateProtocols = new ArrayList<String>();
  static {
    privateProtocols.add("http://");
    privateProtocols.add("https://");
    privateProtocols.add("ftp://");
  }

  /**
   * Analyze contents to check if it matches an external link.
   *
   * @param wikipedia Wikipedia (not used by the analysis).
   * @param contents Contents.
   * @param index Block start index.
   * @param analysis Page analysis (can be null: templates, ref tags, comments
   *        and nowiki tags are then not taken into account).
   * @return Block details if there's a block, null otherwise
   *         (including when contents is null or index is outside contents).
   */
  public static PageElementExternalLink analyzeBlock(
      EnumWikipedia wikipedia, String contents, int index,
      PageAnalysis analysis) {

    // Verify arguments: a negative index can't be the beginning of a link
    if ((contents == null) || (index < 0)) {
      return null;
    }
    int maxLength = contents.length();
    if (index >= maxLength) {
      return null;
    }

    // Look for '[' and the possible white spaces characters following it
    int tmpIndex = index;
    boolean hasSquare = contents.startsWith("[", tmpIndex);
    if (hasSquare) {
      tmpIndex++;
      while ((tmpIndex < maxLength) &&
             (contents.charAt(tmpIndex) == ' ')) {
        tmpIndex++;
      }
    }

    // Check for protocol
    if (tmpIndex >= maxLength) {
      return null;
    }
    if (!isPossibleProtocol(contents, tmpIndex)) {
      // Without an explicit protocol, only "//..." inside square brackets is accepted
      if (!hasSquare || !contents.startsWith("//", tmpIndex)) {
        return null;
      }
    }
    int beginUrlIndex = tmpIndex;

    // Find if the external link is inside a template
    String fullSeparators = ALL_SEPARATORS;
    if (analysis != null) {
      PageElementTemplate template = analysis.isInTemplate(index);
      if (!hasSquare && (template != null)) {
        fullSeparators += IN_TEMPLATES_SEPARATORS;
      }
    }

    // Find destination of external link
    int endUrlIndex = beginUrlIndex;
    while ((endUrlIndex < maxLength) &&
           (fullSeparators.indexOf(contents.charAt(endUrlIndex)) < 0)) {
      endUrlIndex++;
    }
    String url = contents.substring(beginUrlIndex, endUrlIndex);

    // Situations where the external link consists only of the URL
    if (!hasSquare ||
        (endUrlIndex >= maxLength) ||
        (UNACCEPTABLE.indexOf(contents.charAt(endUrlIndex)) >= 0)) {
      return createUrlOnly(beginUrlIndex, endUrlIndex, url);
    }

    // Link without text: [url]
    if (contents.charAt(endUrlIndex) == ']') {
      return new PageElementExternalLink(
          index, endUrlIndex + 1, url,
          null, -1, true, true);
    }

    // Compute maximum index for end of external link
    int maxEndIndex = maxLength;
    if (analysis != null) {
      PageElementTag refTag = analysis.getSurroundingTag(PageElementTag.TAG_WIKI_REF, index);
      if ((refTag != null) && !refTag.isFullTag() && refTag.isComplete()) {
        maxEndIndex = refTag.getValueEndIndex();
      }
    }

    // Find beginning of text
    int beginTextIndex = endUrlIndex;
    while ((beginTextIndex < maxEndIndex) &&
           (SEPARATORS_EXCLUDED.indexOf(contents.charAt(beginTextIndex)) >= 0)) {
      beginTextIndex++;
    }
    int textOffset = beginTextIndex - index;

    // Find end of text
    int endTextIndex = beginTextIndex;
    int prematureEndIndex = -1; // Index of the first "[[" found in the text
    int doubleSquareCount = 0;  // Number of "[[" still waiting for their "]]"
    while (endTextIndex < maxEndIndex) {
      if (contents.startsWith("[[", endTextIndex)) {
        if (prematureEndIndex < 0) {
          prematureEndIndex = endTextIndex;
        }
        doubleSquareCount++;
        endTextIndex += 2;
      } else if ((doubleSquareCount > 0) && contents.startsWith("]]", endTextIndex)) {
        doubleSquareCount--;
        endTextIndex += 2;
      } else if (contents.charAt(endTextIndex) == ']') {
        if (prematureEndIndex < 0) {
          // Regular link: [url text]
          return new PageElementExternalLink(
              index, endTextIndex + 1, url,
              contents.substring(beginTextIndex, endTextIndex),
              textOffset, true, true);
        }
        // The text contains "[[": the element is cut just before it
        return new PageElementExternalLink(
            index, prematureEndIndex, url,
            contents.substring(beginTextIndex, prematureEndIndex),
            textOffset, true, false);
      } else if (UNACCEPTABLE.indexOf(contents.charAt(endTextIndex)) >= 0) {
        // The link is not closed before an unacceptable character: keep only the URL
        return createUrlOnly(beginUrlIndex, endUrlIndex, url);
      } else {
        // Jump over a comment or a nowiki tag, otherwise move to the next character
        PageElementComment comment = null;
        PageElementTag tagNowiki = null;
        if ((contents.charAt(endTextIndex) == '<') && (analysis != null)) {
          comment = analysis.isInComment(endTextIndex);
          tagNowiki = analysis.isInTag(endTextIndex, PageElementTag.TAG_WIKI_NOWIKI);
        }
        if (comment != null) {
          endTextIndex = comment.getEndIndex();
        } else if (tagNowiki != null) {
          endTextIndex = tagNowiki.getCompleteEndIndex();
        } else {
          endTextIndex++;
        }
      }
    }

    // No end found
    return createUrlOnly(beginUrlIndex, endUrlIndex, url);
  }

  /**
   * Create an element restricted to the URL (no square brackets, no text).
   *
   * @param beginUrlIndex Index of the beginning of the URL.
   * @param endUrlIndex Index of the end of the URL.
   * @param url URL.
   * @return Element covering only the URL.
   */
  private static PageElementExternalLink createUrlOnly(
      int beginUrlIndex, int endUrlIndex, String url) {
    return new PageElementExternalLink(
        beginUrlIndex, endUrlIndex, url,
        null, -1, false, false);
  }

  /**
   * @param text Text.
   * @param offset Offset in the text.
   * @return True if the offset in the text is a possible protocol
   *         (comparison ignores the case of the text).
   *         False if the text is null or the offset is outside the text.
   */
  public static boolean isPossibleProtocol(String text, int offset) {
    if ((text == null) || (offset < 0)) {
      return false;
    }
    int available = text.length() - offset;
    for (String protocol : privateProtocols) {
      int length = protocol.length();
      if (available < length) {
        // Not enough characters left for this protocol
        continue;
      }
      int pos = 0;
      while ((pos < length) &&
             (protocol.charAt(pos) == Character.toLowerCase(text.charAt(offset + pos)))) {
        pos++;
      }
      if (pos == length) {
        return true;
      }
    }
    return false;
  }

  /**
   * @return External link (trimmed, with "http:" added in front of a leading "//").
   */
  public String getLink() {
    return link;
  }

  /**
   * @return Text (trimmed), or null if there is no text.
   */
  public String getText() {
    return text;
  }

  /**
   * @return Text offset (relative to the analyzed index), or -1 if there is no text.
   */
  public int getTextOffset() {
    return textOffset;
  }

  /**
   * @return Text as it was extracted (not trimmed), or null if there is no text.
   */
  public String getTextNotTrimmed() {
    return textNotTrimmed;
  }

  /**
   * @return Displayed text: the text if there is one, otherwise the link as it was extracted.
   */
  public String getDisplayedText() {
    if (text != null) {
      return text;
    }
    return linkNotTrimmed;
  }

  /**
   * @return True if the link is in [...]
   */
  public boolean hasSquare() {
    return hasSquare;
  }

  /**
   * @return True if the link is in [...] and not in ([...[[...]]...])
   */
  public boolean hasSecondSquare() {
    return hasSecondSquare;
  }

  /**
   * @param beginIndex Begin index of the element.
   * @param endIndex End index of the element.
   * @param link Link as extracted (can be null).
   * @param text Text as extracted (null or blank means no text).
   * @param textOffset Offset of the text.
   * @param hasSquare True if the element begins with an opening square bracket.
   * @param hasSecondSquare True if the element ends with its closing square bracket.
   */
  private PageElementExternalLink(
      int beginIndex, int endIndex,
      String link, String text, int textOffset,
      boolean hasSquare, boolean hasSecondSquare) {
    super(beginIndex, endIndex);
    this.linkNotTrimmed = link;
    String tmpLink = (link != null) ? link.trim() : null;
    if ((tmpLink != null) && (tmpLink.startsWith("//"))) {
      // Protocol-relative link: complete it with a default protocol
      tmpLink = "http:" + tmpLink;
    }
    this.link = tmpLink;
    String trimmedText = (text != null) ? text.trim() : null;
    if ((trimmedText != null) && (trimmedText.length() > 0)) {
      this.textNotTrimmed = text;
      this.text = trimmedText;
    } else {
      this.textNotTrimmed = null;
      this.text = null;
    }
    this.textOffset = textOffset;
    this.hasSquare = hasSquare;
    this.hasSecondSquare = hasSecondSquare;
  }

  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("[");
    sb.append(linkNotTrimmed);
    if (textNotTrimmed != null) {
      sb.append(' ');
      sb.append(textNotTrimmed);
    }
    sb.append("]");
    return sb.toString();
  }

  /**
   * Create an external link.
   *
   * @param link Link (nothing is written for the link if null).
   * @param text Displayed text (no text and no separating space if null).
   * @return External link.
   */
  public static String createExternalLink(String link, String text) {
    StringBuilder sb = new StringBuilder();
    sb.append("[");
    if (link != null) {
      sb.append(link);
    }
    if (text != null) {
      sb.append(" ");
      sb.append(text);
    }
    sb.append("]");
    return sb.toString();
  }
}