/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.data;
import java.util.ArrayList;
import java.util.List;
import org.wikipediacleaner.api.constants.WPCConfiguration;
import org.wikipediacleaner.api.constants.WPCConfigurationStringList;
import org.wikipediacleaner.api.data.PageElementTemplate.Parameter;
/**
* Class containing information about an ISSN.
*/
public class PageElementISSN extends PageElement {
/** Prefix announcing an ISSN in plain text. */
private static final String ISSN_PREFIX = "ISSN";

/** Characters that can be part of the ISSN number itself (digits and check character X). */
private static final String POSSIBLE_CHARACTERS = "0123456789Xx";

/** Characters accepted inside an ISSN without being part of the number (separator). */
private static final String EXTRA_CHARACTERS = "-";

/** Characters that are skipped inside an ISSN but make its syntax incorrect. */
private static final String INCORRECT_CHARACTERS = ":‐\t—=–# ";

/** Characters that are skipped between the prefix and the number but make the syntax incorrect. */
private static final String INCORRECT_BEGIN_CHARACTERS = ":;‐\t—=–#";
/**
* Find all ISSN in a page.
* <p>
* The search is done in four passes:
* templates configured as ISSN templates,
* templates configured as requesting help for an ISSN,
* any template parameter named "ISSN" (case insensitive, optionally followed by digits),
* and finally occurrences of the "ISSN" prefix in the text of the page.
*
* @param analysis Page analysis.
* @return List of ISSN.
*/
public static List<PageElementISSN> analyzePage(
PageAnalysis analysis) {
List<PageElementISSN> issns = new ArrayList<PageElementISSN>();
// Configuration
WPCConfiguration config = analysis.getWPCConfiguration();
List<String[]> issnIgnoreTemplates = config.getStringArrayList(WPCConfigurationStringList.ISSN_IGNORE_TEMPLATES);
// Search for ISSN templates
List<String[]> issnTemplates = config.getStringArrayList(WPCConfigurationStringList.ISSN_TEMPLATES);
if (issnTemplates != null) {
for (String[] issnTemplate : issnTemplates) {
if (issnTemplate.length > 0) {
// First configuration element is the template name
List<PageElementTemplate> templates = analysis.getTemplates(issnTemplate[0]);
if (templates != null) {
for (PageElementTemplate template : templates) {
// Second configuration element is a comma separated list of parameter names,
// when absent the parameter named "1" is used
String[] params = null;
if (issnTemplate.length > 1) {
params = issnTemplate[1].split(",");
} else {
params = new String[]{ "1" };
}
for (String param : params) {
if ((param != null) && (param.length() > 0)) {
// Exact parameter name, every non empty value is reported
analyzeTemplateParams(
analysis, issns, issnIgnoreTemplates,
template, param,
false, false, true, false);
}
}
}
}
}
}
}
// Search for ISSN templates where help is requested
issnTemplates = config.getStringArrayList(WPCConfigurationStringList.ISSN_HELP_NEEDED_TEMPLATES);
if (issnTemplates != null) {
for (String[] issnTemplate : issnTemplates) {
if (issnTemplate.length > 0) {
List<PageElementTemplate> templates = analysis.getTemplates(issnTemplate[0]);
if (templates != null) {
for (PageElementTemplate template : templates) {
// Second configuration element is used as a single parameter name (no split),
// only values looking like an ISSN are reported and they are flagged as help requested
analyzeTemplateParams(
analysis, issns, issnIgnoreTemplates,
template,
((issnTemplate.length > 1) && (issnTemplate[1].length() > 0)) ? issnTemplate[1] : "1",
false, false, false, true);
}
}
}
}
}
// Search for ISSN in template parameters
List<PageElementTemplate> templates = analysis.getTemplates();
for (PageElementTemplate template : templates) {
// Parameter name compared ignoring case, digits accepted after the name
analyzeTemplateParams(
analysis, issns, issnIgnoreTemplates,
template, "ISSN", true, true, true, false);
}
// Search for ISSN in plain texts
String contents = analysis.getContents();
if (contents == null) {
return issns;
}
int index = 0;
// Positions too close to the end of the text to hold the prefix followed by something are not tested
int maxIndex = contents.length() - ISSN_PREFIX.length();
while (index < maxIndex) {
// Check if it's a potential ISSN
// isISSN tells if the occurrence should be considered at all,
// isValid tells if the ISSN is at a valid location
boolean isValid = true;
boolean isISSN = ISSN_PREFIX.equalsIgnoreCase(
contents.substring(index, index + ISSN_PREFIX.length()));
// Ignore occurrences inside comments
if (isISSN && (analysis.isInComment(index) != null)) {
isISSN = false;
}
if (isISSN &&
(index > 0) &&
(contents.charAt(index - 1) == '/') &&
(index + ISSN_PREFIX.length() < contents.length()) &&
(Character.isDigit(contents.charAt(index + ISSN_PREFIX.length())))) {
isISSN = false; // to avoid DOI like doi:10.5547/issn0195-6574-ej-vol10-no1-14
}
// Ignore occurrences inside tags
if (isISSN && (analysis.isInTag(index) != null)) {
isISSN = false;
}
// Ignore occurrences surrounded by nowiki tags
if (isISSN && (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_NOWIKI, index) != null)) {
isISSN = false;
}
if (isISSN && isInISSN(index, issns)) {
isISSN = false; // to avoid issn=ISSN xxxx-xxxx being detected twice
}
if (isISSN) {
// Inside an external link, the location is flagged as not valid when the link
// has no square brackets, has no text, or when the occurrence is before the text offset.
// The ISSN is still reported in that case.
PageElementExternalLink link = analysis.isInExternalLink(index);
if (link != null) {
if (!link.hasSquare() ||
(index < link.getBeginIndex() + link.getTextOffset()) ||
(link.getText() == null)) {
isValid = false;
}
}
}
if (isISSN) {
// Inside a template, ignore occurrences before the first parameter pipe
// or in a template without parameters
PageElementTemplate template = analysis.isInTemplate(index);
if (template != null) {
if ((template.getParameterCount() == 0) ||
(index < template.getParameterPipeIndex(0))) {
isISSN = false;
}
}
}
if (isISSN) {
// Inside an image, ignore occurrences before the first pipe
PageElementImage image = analysis.isInImage(index);
if (image != null) {
if (index < image.getBeginIndex() + image.getFirstPipeOffset()) {
isISSN = false;
}
}
}
if (isISSN) {
// Check if it's a template parameter
// (occurrence located between a parameter pipe and the start of the parameter value)
boolean parameter = false;
PageElementTemplate template = analysis.isInTemplate(index);
if (template != null) {
for (int paramNum = 0; paramNum < template.getParameterCount(); paramNum++) {
if ((template.getParameterPipeIndex(paramNum) < index) &&
(template.getParameterValueStartIndex(paramNum) > index)) {
parameter = true;
}
}
}
int beginIndex = index;
// The prefix is consumed in all cases, so the main loop always moves forward
index += ISSN_PREFIX.length();
if (!parameter) {
boolean correct = true;
// Prefix written between double square brackets ([[ISSN]]):
// the syntax is flagged as incorrect and the brackets are included in the element
if ((beginIndex >= 2) && (index + 2 < contents.length())) {
if (contents.startsWith("[[", beginIndex - 2) &&
contents.startsWith("]]", index)) {
correct = false;
beginIndex -= 2;
index += 2;
}
}
boolean spaceFound = false;
if (analysis.isInComment(index) == null) {
// Skip spaces after the prefix
while ((index < contents.length()) && (contents.charAt(index) == ' ')) {
index++;
spaceFound = true;
}
// Skip unexpected characters before the number, they make the syntax incorrect
while ((index < contents.length()) &&
(INCORRECT_BEGIN_CHARACTERS.indexOf(contents.charAt(index)) >= 0)) {
index++;
correct = false;
}
}
int beginNumber = -1;
int endNumber = beginNumber;
boolean finished = false;
// The syntax is incorrect if no space was found after the prefix
correct &= spaceFound;
// nextCorrect remembers an anomaly that only matters if a number character follows it
boolean nextCorrect = correct;
int digitCount = 0;
boolean hasSeparator = false;
boolean hasExtraSeparator = false;
while (!finished && (index < contents.length())) {
char currentChar = contents.charAt(index);
if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
// Character of the number: extend the number up to this character
if (beginNumber < 0) {
beginNumber = index;
}
endNumber = index + 1;
index++;
digitCount++;
// A non digit character (X or x) is only accepted in 8th position
if (!Character.isDigit(currentChar) && (digitCount != 8)) {
correct = false;
}
correct &= nextCorrect;
} else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
// Separator: expected only once, after the 4th character of the number
if (beginNumber < 0) {
nextCorrect = false;
} else if ((digitCount == 4) && !hasSeparator) {
hasSeparator = true;
} else {
hasExtraSeparator = true;
}
index++;
} else if (INCORRECT_CHARACTERS.indexOf(currentChar) >= 0) {
// Skipped, but the syntax becomes incorrect if the number continues after it
index++;
nextCorrect = false;
} else {
// Any other character ends the number,
// a letter immediately after the number makes the syntax incorrect
if ((endNumber == index) && (Character.isLetter(currentChar))) {
correct = false;
}
finished = true;
}
}
// With 8 characters, exactly one separator after the 4th character is required
if (digitCount == 8) {
if (!hasSeparator || hasExtraSeparator) {
correct = false;
}
}
// Report the ISSN only if a number was found, and resume the search just after the number
if (endNumber > beginNumber) {
String number = contents.substring(beginNumber, endNumber);
issns.add(new PageElementISSN(
beginIndex, endNumber, analysis, number,
isValid, correct, false, null));
index = endNumber;
}
}
} else {
index++;
}
}
return issns;
}
/**
 * Tell if a position is covered by one of the ISSN already found.
 *
 * @param index Current index.
 * @param issns List of ISSN (can be null).
 * @return True if the current index is already in a ISSN.
 */
private static boolean isInISSN(int index, List<PageElementISSN> issns) {
    if (issns == null) {
        return false;
    }
    for (PageElementISSN known : issns) {
        // Begin index is inclusive, end index is exclusive
        boolean startsBeforeOrAt = (known.getBeginIndex() <= index);
        if (startsBeforeOrAt && (index < known.getEndIndex())) {
            return true;
        }
    }
    return false;
}
/**
 * Check if template parameter is an ISSN, and add it to the list of ISSN if it is.
 * <p>
 * Unnamed parameters are given the names "1", "2", ... in their order of appearance.
 * Empty values and values filled by WikiData ({{#property:p236}}) are skipped.
 *
 * @param analysis Page analysis.
 * @param issns Current list of ISSN.
 * @param ignoreTemplates List of templates (with parameter and value) to ignore.
 * @param template Template.
 * @param argumentName Template parameter name.
 * @param ignoreCase True if parameter name should compared ignoring case.
 * @param acceptNumbers True if numbers are accepted after parameter name.
 * @param acceptAllValues True if all values are accepted, even if not compatible with ISSN.
 * @param helpRequested True if help has been requested for this ISSN.
 */
private static void analyzeTemplateParams(
    PageAnalysis analysis, List<PageElementISSN> issns,
    List<String[]> ignoreTemplates,
    PageElementTemplate template,
    String argumentName,
    boolean ignoreCase, boolean acceptNumbers,
    boolean acceptAllValues, boolean helpRequested) {

    // Check if template should be ignored
    if (isIgnoredTemplate(ignoreTemplates, template)) {
        return;
    }

    int paramDefaultName = 1;
    for (int paramNum = 0; paramNum < template.getParameterCount(); paramNum++) {

        // Check parameter name
        Parameter param = template.getParameter(paramNum);
        String paramName = param.getName();
        if ((paramName == null) || (paramName.trim().length() == 0)) {
            paramName = Integer.toString(paramDefaultName);
            paramDefaultName++;
        }
        if (!isISSNParameterName(argumentName, paramName, ignoreCase, acceptNumbers)) {
            continue;
        }

        // Parameter is for an ISSN, analyze that it's not filled by WikiData
        String paramValue = param.getValue();
        if ((paramValue == null) ||
            (paramValue.trim().length() == 0) ||
            "{{#property:p236}}".equalsIgnoreCase(paramValue.trim())) {
            continue;
        }

        // Parameter is for an ISSN, analyze its value
        int delta = param.getValueStartIndex();
        int i = 0;
        int beginIndex = -1;
        int endIndex = -1;
        int digitCount = 0;
        boolean hasSeparator = false;
        boolean hasExtraSeparator = false;
        boolean hasExtraCharacters = false;
        boolean ok = true;
        boolean correct = true;
        boolean isEmpty = true;
        while (ok && (i < paramValue.length())) {
            char currentChar = paramValue.charAt(i);
            if (currentChar == '<') {
                // Comments are skipped, anything else starting with '<' ends the analysis
                PageElementComment comment = analysis.isInComment(delta + i + 1);
                if ((comment != null) && (comment.getBeginIndex() == delta + i)) {
                    i += comment.getEndIndex() - comment.getBeginIndex();
                } else {
                    ok = false;
                    isEmpty = false;
                }
            } else if (" \n".indexOf(currentChar) >= 0) {
                // Whitespace after the beginning of the number is remembered:
                // it makes the syntax incorrect if the number continues after it
                isEmpty = false;
                i++;
                if (beginIndex >= 0) {
                    hasExtraCharacters = true;
                }
            } else if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
                isEmpty = false;
                if (hasExtraCharacters) {
                    correct = false;
                }
                if (Character.isDigit(currentChar)) {
                    digitCount++;
                    if (beginIndex < 0) {
                        beginIndex = i;
                    }
                    endIndex = i + 1;
                } else if (Character.toUpperCase(currentChar) == 'X') {
                    // X is only accepted as the 8th character
                    endIndex = i + 1;
                    digitCount++;
                    if (digitCount != 8) {
                        correct = false;
                    }
                }
                i++;
            } else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
                isEmpty = false;
                i++;
                // Only one separation character after 4th digit
                if ((digitCount == 4) && !hasSeparator) {
                    hasSeparator = true;
                } else {
                    hasExtraSeparator = true;
                }
            } else if (INCORRECT_CHARACTERS.indexOf(currentChar) >= 0) {
                isEmpty = false;
                i++;
                correct = false;
            } else {
                isEmpty = false;
                ok = false;
            }
        }
        if ((beginIndex < 0) || (endIndex < 0)) {
            ok = false;
        }
        if (digitCount == 8) {
            if (!hasSeparator || hasExtraSeparator) {
                correct = false;
            }
        }
        // Convert positions in the value into positions in the page
        beginIndex += delta;
        endIndex += delta;
        if (ok) {
            String contents = analysis.getContents();
            String value = contents.substring(beginIndex, endIndex);
            if (paramValue.length() > 0) {
                issns.add(new PageElementISSN(
                    beginIndex, endIndex, analysis, value,
                    true, correct, helpRequested, template));
            }
        } else if (acceptAllValues) {
            // Value is not compatible with an ISSN: report the whole value as incorrect
            if (!isEmpty) {
                issns.add(new PageElementISSN(
                    template.getParameterValueStartIndex(paramNum),
                    template.getParameterValueStartIndex(paramNum) + paramValue.length(),
                    analysis, paramValue, true, false, false, template));
            }
        }
    }
}

/**
 * Tell if a template matches one of the configured templates to ignore.
 * <p>
 * Each configuration is made of a template name, optionally followed by
 * a parameter name, optionally followed by a value for this parameter.
 *
 * @param ignoreTemplates List of templates (with parameter and value) to ignore.
 * @param template Template.
 * @return True if the template should be ignored.
 */
private static boolean isIgnoredTemplate(
    List<String[]> ignoreTemplates,
    PageElementTemplate template) {
    if (ignoreTemplates == null) {
        return false;
    }
    for (String[] ignoreTemplate : ignoreTemplates) {
        if ((ignoreTemplate == null) ||
            (ignoreTemplate.length == 0) ||
            !Page.areSameTitle(ignoreTemplate[0], template.getTemplateName())) {
            continue;
        }
        if (ignoreTemplate.length == 1) {
            return true; // Ignore all templates with this name
        }
        String paramValue = template.getParameterValue(ignoreTemplate[1]);
        if (paramValue == null) {
            continue;
        }
        if (ignoreTemplate.length == 2) {
            return true; // Ignore all templates with this name and parameter present
        }
        if (paramValue.trim().equals(ignoreTemplate[2].trim())) {
            return true; // Ignore all templates with this name and parameter set to a given value
        }
    }
    return false;
}

/**
 * Tell if a parameter name matches the expected name of an ISSN parameter.
 *
 * @param argumentName Expected parameter name.
 * @param paramName Actual parameter name.
 * @param ignoreCase True if parameter name should compared ignoring case.
 * @param acceptNumbers True if numbers are accepted after parameter name.
 * @return True if the parameter name matches.
 */
private static boolean isISSNParameterName(
    String argumentName, String paramName,
    boolean ignoreCase, boolean acceptNumbers) {

    // Exact name
    boolean sameName = ignoreCase ?
        argumentName.equalsIgnoreCase(paramName) :
        argumentName.equals(paramName);
    if (sameName) {
        return true;
    }

    // Name followed by digits
    if (!acceptNumbers || (paramName.length() <= argumentName.length())) {
        return false;
    }
    // The comparison has to be done on the beginning of the name:
    // the full name is longer than the expected name and can never be equal to it
    String shortParamName = paramName.substring(0, argumentName.length());
    boolean samePrefix = ignoreCase ?
        argumentName.equalsIgnoreCase(shortParamName) :
        argumentName.equals(shortParamName);
    if (!samePrefix) {
        return false;
    }
    for (int i = argumentName.length(); i < paramName.length(); i++) {
        if (!Character.isDigit(paramName.charAt(i))) {
            return false;
        }
    }
    return true;
}
/** WPCleaner configuration, read when building replacement suggestions. */
private final WPCConfiguration wpcConfiguration;

/** Text of the page between the begin index and the end index of the element. */
private final String fullText;

/** ISSN as found in the page (not cleaned up). */
private final String issnNotTrimmed;

/** ISSN after clean up. */
private final String issn;

/** True if ISSN is in a valid location. */
private final boolean isValid;

/** True if ISSN syntax is correct. */
private final boolean isCorrect;

/** Template if ISSN is a template parameter (ISSN=...), null otherwise. */
private final PageElementTemplate template;

/** True if help has been requested for this ISSN. */
private final boolean helpRequested;

/**
 * Create an ISSN element.
 *
 * @param beginIndex Begin index.
 * @param endIndex End index.
 * @param analysis Page analysis, providing the configuration and the text of the page.
 * @param issn ISSN.
 * @param isValid True if ISSN is in a valid location.
 * @param isCorrect True if ISSN syntax is correct.
 * @param helpRequested True if help has been requested for this ISSN.
 * @param template Template if ISSN is a template parameter.
 */
private PageElementISSN(
    int beginIndex, int endIndex, PageAnalysis analysis,
    String issn, boolean isValid,
    boolean isCorrect, boolean helpRequested,
    PageElementTemplate template) {
    super(beginIndex, endIndex);

    // Information coming from the page analysis
    this.wpcConfiguration = analysis.getWPCConfiguration();
    String contents = analysis.getContents();
    this.fullText = contents.substring(beginIndex, endIndex);

    // ISSN value, raw and cleaned up
    this.issnNotTrimmed = issn;
    this.issn = cleanISSN(issn);

    // Flags and context
    this.template = template;
    this.helpRequested = helpRequested;
    this.isCorrect = isCorrect;
    this.isValid = isValid;
}
/**
 * @return ISSN as found in the page (not trimmed).
 */
public String getISSNNotTrimmed() {
    return this.issnNotTrimmed;
}

/**
 * @return ISSN after clean up (trimmed).
 */
public String getISSN() {
    return this.issn;
}

/**
 * @return True if ISSN is in a valid location.
 */
public boolean isValid() {
    return this.isValid;
}

/**
 * @return True if ISSN syntax is correct.
 */
public boolean isCorrect() {
    return this.isCorrect;
}

/**
 * @return True if help has been requested for this ISSN.
 */
public boolean helpRequested() {
    return this.helpRequested;
}

/**
 * @return True if ISSN is a template parameter.
 */
public boolean isTemplateParameter() {
    return this.template != null;
}
/**
 * Build the list of suggestions for replacing this ISSN.
 * <p>
 * The ISSN prefix is removed from the value whatever its case,
 * as the detection of the prefix in the page is also case insensitive.
 *
 * @return List of possible ISSN.
 */
public List<String> getCorrectISSN() {
    List<String> result = new ArrayList<String>();

    // Prefix outside the template
    if ((template != null) &&
        (getBeginIndex() < template.getBeginIndex())) {
        if (fullText != null) {
            result.add(fullText.substring(template.getBeginIndex() - getBeginIndex()));
        }
        return result;
    }
    String prefix = isTemplateParameter() ? "" : "ISSN ";

    // Construct a basic ISSN number
    String tmpISSN = issnNotTrimmed.trim();
    if (tmpISSN.regionMatches(true, 0, ISSN_PREFIX, 0, ISSN_PREFIX.length())) {
        tmpISSN = tmpISSN.substring(ISSN_PREFIX.length()).trim();
    }
    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < tmpISSN.length(); i++) {
        char currentChar = tmpISSN.charAt(i);
        if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
            buffer.append(currentChar);
        } else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
            // Nothing to add
        } else if ((currentChar == '‐') ||
                   (currentChar == '–') ||
                   (currentChar == '.')) {
            // Nothing to add
        } else if ((currentChar == '\t') ||
                   (currentChar == ' ')) {
            // Nothing to add
        } else {
            // Kept: it may be a mistyped character fixed below, otherwise the checksum will reject it
            buffer.append(currentChar);
        }
    }
    if (buffer.length() == 8) {
        buffer.insert(4, '-');
    }
    String cleanedISSN = buffer.toString().trim();

    // Basic replacement
    addCorrectISSN(result, prefix, cleanedISSN);

    // Common mistyped characters
    cleanedISSN = cleanedISSN
        .replace('x', 'X')
        .replace('O', '0')
        .replace('I', '1')
        .replace('B', '8');
    addCorrectISSN(result, prefix, cleanedISSN);

    return result;
}
/**
 * Add suggestions for a cleaned up ISSN, if its last character matches the computed checksum.
 * <p>
 * When the ISSN is not a template parameter, a suggestion is also added for each
 * configured ISSN template flagged as suggested.
 *
 * @param result List of possible replacements.
 * @param prefix ISSN prefix.
 * @param cleanedISSN Cleaned up ISSN.
 */
private void addCorrectISSN(List<String> result, String prefix, String cleanedISSN) {
    // Nothing can be suggested for an empty value (and there's no last character to check)
    if ((cleanedISSN == null) || (cleanedISSN.length() == 0)) {
        return;
    }
    char lastChar = cleanedISSN.charAt(cleanedISSN.length() - 1);
    if (computeChecksum(cleanedISSN) != lastChar) {
        return;
    }
    addCorrectISSN(result, prefix + cleanedISSN);
    if (isTemplateParameter()) {
        return;
    }

    // Suggestions based on templates
    List<String[]> issnTemplates = wpcConfiguration.getStringArrayList(
        WPCConfigurationStringList.ISSN_TEMPLATES);
    if (issnTemplates == null) {
        return;
    }
    for (String[] issnTemplate : issnTemplates) {
        // Configuration: template name, parameter names, flag telling if template is suggested
        if (issnTemplate.length <= 2) {
            continue;
        }
        String[] params = issnTemplate[1].split(",");
        boolean suggested = Boolean.parseBoolean(issnTemplate[2]);
        if ((params.length == 0) || !suggested) {
            continue;
        }
        StringBuilder buffer = new StringBuilder();
        buffer.append("{{");
        buffer.append(issnTemplate[0]);
        buffer.append("|");
        if (!"1".equals(params[0])) {
            buffer.append(params[0]);
            buffer.append("=");
        }
        buffer.append(cleanedISSN);
        buffer.append("}}");
        addCorrectISSN(result, buffer.toString());
    }
}
/**
 * Add a replacement to the list, unless it's already in the list.
 *
 * @param result List of possible replacements.
 * @param correctISSN Possible replacement.
 */
private void addCorrectISSN(List<String> result, String correctISSN) {
    boolean usable = (result != null) && (correctISSN != null);
    if (usable && !result.contains(correctISSN)) {
        result.add(correctISSN);
    }
}
/**
 * Build a template call asking for help about this ISSN.
 * <p>
 * The template description is made of: the template name, the name of the parameter
 * for the ISSN, the name of the parameter for the reason, and then extra parameters
 * added as they are.
 *
 * @param helpNeededTemplate Name of template for asking for help.
 * @param reason Reason of the request.
 * @return Text for requesting for help,
 *         or null if there's no template or if the ISSN is a template parameter.
 */
public String askForHelp(
    String[] helpNeededTemplate, String reason) {
    boolean noTemplate =
        (helpNeededTemplate == null) ||
        (helpNeededTemplate.length == 0);
    if (noTemplate || isTemplateParameter()) {
        return null;
    }
    int count = helpNeededTemplate.length;

    // Template name
    StringBuilder text = new StringBuilder("{{");
    text.append(helpNeededTemplate[0]);

    // ISSN
    text.append("|");
    if ((count > 1) && (helpNeededTemplate[1].length() > 0)) {
        text.append(helpNeededTemplate[1]).append("=");
    }
    text.append(getISSNNotTrimmed());

    // Reason
    if ((reason != null) &&
        (count > 2) &&
        (helpNeededTemplate[2].length() > 0)) {
        text.append("|").append(helpNeededTemplate[2]).append("=").append(reason);
    }

    // Extra parameters
    for (int extra = 3; extra < count; extra++) {
        text.append("|").append(helpNeededTemplate[extra]);
    }

    return text.append("}}").toString();
}
/**
 * Build a comment asking for help.
 *
 * @param comment Comment for asking for help.
 * @param reason Reason of the request.
 * @return Text for requesting for help, or null if the comment is missing or blank.
 */
public String askForHelp(
    String comment, String reason) {
    if ((comment == null) || (comment.trim().length() == 0)) {
        return null;
    }
    // The reason is only added when it contains something
    boolean hasReason = (reason != null) && (reason.trim().length() > 0);
    String suffix = hasReason ? (" - " + reason) : "";
    return "<!-- " + comment + suffix + " -->";
}
/**
 * Clean up an ISSN number: only digits and X (converted to upper case) are kept,
 * comments and ref tags starting inside the value are skipped entirely.
 *
 * @param issn ISSN number.
 * @return Cleaned up ISSN number (null if the ISSN number is null).
 */
public static String cleanISSN(String issn) {
    if (issn == null) {
        return null;
    }
    String trimmed = issn.trim();
    if (trimmed.length() == 0) {
        return trimmed;
    }
    PageAnalysis analysis = new PageAnalysis(null, trimmed);
    StringBuilder cleaned = new StringBuilder();
    int pos = 0;
    while (pos < trimmed.length()) {
        char current = Character.toUpperCase(trimmed.charAt(pos));

        // Normal character: kept only if it can be part of an ISSN
        if (current != '<') {
            if (POSSIBLE_CHARACTERS.indexOf(current) >= 0) {
                cleaned.append(current);
            }
            pos++;
            continue;
        }

        // Comment starting here: resume after the comment
        PageElementComment comment = analysis.isInComment(pos);
        if ((comment != null) && (comment.getBeginIndex() == pos)) {
            pos = comment.getEndIndex();
            continue;
        }

        // Ref tag starting here: resume after the complete tag
        PageElementTag refTag = analysis.isInTag(pos, PageElementTag.TAG_WIKI_REF);
        if ((refTag != null) && (refTag.getBeginIndex() == pos)) {
            pos = refTag.getCompleteEndIndex();
            continue;
        }

        // Anything else: only the '<' is skipped
        pos++;
    }
    return cleaned.toString();
}
/**
 * Compute the check character of an ISSN from its first 7 digits.
 *
 * @param issnValue ISSN value.
 * @return Computed checksum ('0' to '9' or 'X'),
 *         or 0 if the value is null, doesn't have 8 characters once cleaned up,
 *         or has a non digit character among the first 7.
 */
public static char computeChecksum(String issnValue) {
    if (issnValue == null) {
        return 0;
    }
    String cleaned = cleanISSN(issnValue);

    // Check for ISSN-8
    if (cleaned.length() != 8) {
        return 0;
    }

    // Weighted sum of the first 7 digits, with weights going from 8 down to 2
    int sum = 0;
    int weight = 8;
    for (char digit : cleaned.substring(0, 7).toCharArray()) {
        if (!Character.isDigit(digit)) {
            return 0;
        }
        sum += weight * (digit - '0');
        weight--;
    }

    // Modulus 11, inverted, with 11 converted to 0
    int check = (11 - (sum % 11)) % 11;
    if (check < 10) {
        return (char) ('0' + check);
    }
    return 'X';
}
/**
 * Tell if an ISSN value has 8 characters once cleaned up,
 * the last one being the expected checksum.
 *
 * @param issnValue ISSN value.
 * @return True if ISSN value is valid.
 */
public static boolean isValid(String issnValue) {
    if (issnValue == null) {
        return false;
    }
    String cleaned = cleanISSN(issnValue);
    return (cleaned.length() == 8) &&
           (cleaned.charAt(7) == computeChecksum(cleaned));
}
}