/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.data;
import java.util.ArrayList;
import java.util.List;
import org.wikipediacleaner.api.constants.WPCConfiguration;
import org.wikipediacleaner.api.constants.WPCConfigurationStringList;
import org.wikipediacleaner.api.data.PageElementTemplate.Parameter;
/**
* Class containing information about a PMID (PubMed identifier).
*/
public class PageElementPMID extends PageElement {
/** Canonical prefix that introduces a PMID in plain text. */
public final static String PMID_PREFIX = "PMID";
/** Prefixes that also introduce a PMID in plain text but are reported as incorrect syntax. */
private final static String[] PMID_INCORRECT_PREFIX = {
"pmid"
};
/** Characters that are a meaningful part of a PMID number (decimal digits only). */
public final static String POSSIBLE_CHARACTERS = "0123456789";
/** Characters tolerated inside a PMID without being meaningful (currently none). */
public final static String EXTRA_CHARACTERS = "";
/** Characters that are skipped inside a PMID but mark its syntax as incorrect. */
private final static String INCORRECT_CHARACTERS = "- :‐\t—=–\n";
/** Characters that are skipped between the prefix and the number but mark the syntax as incorrect. */
private final static String INCORRECT_BEGIN_CHARACTERS = "- :‐\t—=–\n";
/**
 * Find all the PMID of a page.
 * <p>
 * The search is done in this order: templates configured as PMID templates,
 * templates configured for requesting help on a PMID, parameters named PMID
 * in any template, then plain text.
 *
 * @param analysis Page analysis.
 * @return List of PMID.
 */
public static List<PageElementPMID> analyzePage(
    PageAnalysis analysis) {
  List<PageElementPMID> pmids = new ArrayList<PageElementPMID>();

  // Search for PMID templates
  WPCConfiguration config = analysis.getWPCConfiguration();
  List<String[]> pmidTemplates = config.getStringArrayList(WPCConfigurationStringList.PMID_TEMPLATES);
  if (pmidTemplates != null) {
    for (String[] pmidTemplate : pmidTemplates) {
      if (pmidTemplate.length > 0) {
        List<PageElementTemplate> templates = analysis.getTemplates(pmidTemplate[0]);
        if (templates != null) {
          // The second element is a comma separated list of parameter names
          // (it is split the same way when building replacement suggestions),
          // so each name has to be checked separately.
          String[] params = (pmidTemplate.length > 1) ?
              pmidTemplate[1].split(",") : new String[] { "1" };
          for (PageElementTemplate template : templates) {
            for (String param : params) {
              if (param.length() > 0) {
                analyzeTemplateParams(
                    analysis, pmids, template, param,
                    false, false, false, false);
              }
            }
          }
        }
      }
    }
  }

  // Search for PMID templates where help is requested
  pmidTemplates = config.getStringArrayList(WPCConfigurationStringList.PMID_HELP_NEEDED_TEMPLATES);
  if (pmidTemplates != null) {
    for (String[] pmidTemplate : pmidTemplates) {
      if (pmidTemplate.length > 0) {
        List<PageElementTemplate> templates = analysis.getTemplates(pmidTemplate[0]);
        if (templates != null) {
          // A missing or empty parameter name means the first unnamed parameter
          String param = ((pmidTemplate.length > 1) && (pmidTemplate[1].length() > 0)) ?
              pmidTemplate[1] : "1";
          for (PageElementTemplate template : templates) {
            analyzeTemplateParams(
                analysis, pmids, template, param,
                false, false, false, true);
          }
        }
      }
    }
  }

  // Search for PMID in template parameters
  List<PageElementTemplate> templates = analysis.getTemplates();
  if (templates != null) {
    for (PageElementTemplate template : templates) {
      analyzeTemplateParams(analysis, pmids, template, "PMID", true, true, true, false);
    }
  }

  // Search for PMID in plain texts: the regular prefix first, then the incorrect ones
  analyzePlainText(analysis, pmids, PMID_PREFIX, true, true);
  for (String prefix : PMID_INCORRECT_PREFIX) {
    analyzePlainText(analysis, pmids, prefix, false, false);
  }

  return pmids;
}
/**
 * Tell if a position is covered by one of the PMID already found.
 *
 * @param index Current index.
 * @param pmids List of PMID (can be null).
 * @return True if the current index is already in a PMID
 *         (begin index included, end index excluded).
 */
private static boolean isInPMID(int index, List<PageElementPMID> pmids) {
  if (pmids == null) {
    return false;
  }
  for (PageElementPMID candidate : pmids) {
    if ((index >= candidate.getBeginIndex()) &&
        (index < candidate.getEndIndex())) {
      return true;
    }
  }
  return false;
}
/**
 * Analyze plain text for PMID.
 * <p>
 * Each occurrence of the prefix is examined. Occurrences located in comments,
 * tags, nowiki/pre/source/syntaxhighlight sections, already detected PMID,
 * template names or image names are ignored. For the remaining ones, the digits
 * following the prefix are collected and a PMID is added to the list.
 *
 * @param analysis Page analysis.
 * @param pmids Current list of PMID (new PMID are added to it).
 * @param prefix PMID prefix.
 * @param correct True if PMID should be considered correct by default.
 * @param caseSensitive True if PMID prefix is case sensitive.
 */
private static void analyzePlainText(
    PageAnalysis analysis, List<PageElementPMID> pmids,
    String prefix, boolean correct, boolean caseSensitive) {
  String contents = analysis.getContents();
  // An empty prefix would match at every position without consuming any
  // character, which could prevent the loop below from ever finishing.
  if ((contents == null) || (prefix == null) || (prefix.length() == 0)) {
    return;
  }
  final int prefixLength = prefix.length();

  int index = 0;
  int maxIndex = contents.length() - prefixLength;
  while (index < maxIndex) {

    // Check if it's a potential PMID (compared in place, without creating a substring)
    boolean isValid = true;
    boolean isPMID = contents.regionMatches(!caseSensitive, index, prefix, 0, prefixLength);
    if (isPMID && (analysis.isInComment(index) != null)) {
      isPMID = false;
    }
    if (isPMID && (analysis.isInTag(index) != null)) {
      isPMID = false;
    }
    if (isPMID && (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_NOWIKI, index) != null)) {
      isPMID = false;
    }
    if (isPMID) {
      if ((analysis.getSurroundingTag(PageElementTag.TAG_WIKI_PRE, index) != null) ||
          (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_SOURCE, index) != null) ||
          (analysis.getSurroundingTag(PageElementTag.TAG_WIKI_SYNTAXHIGHLIGHT, index) != null)) {
        isPMID = false;
      }
    }
    if (isPMID && isInPMID(index, pmids)) {
      isPMID = false;
    }

    // In an external link, the PMID is kept but flagged as being in an invalid
    // location unless it is in the text part of a link with square brackets
    if (isPMID) {
      PageElementExternalLink link = analysis.isInExternalLink(index);
      if (link != null) {
        if (!link.hasSquare() ||
            (index < link.getBeginIndex() + link.getTextOffset()) ||
            (link.getText() == null)) {
          isValid = false;
        }
      }
    }

    // Ignore what is before the first pipe of a template
    PageElementTemplate template = null;
    if (isPMID) {
      template = analysis.isInTemplate(index);
      if (template != null) {
        if ((template.getParameterCount() == 0) ||
            (index < template.getParameterPipeIndex(0))) {
          isPMID = false;
        }
      }
    }

    // Ignore what is before the first pipe of an image
    if (isPMID) {
      PageElementImage image = analysis.isInImage(index);
      if (image != null) {
        if (index < image.getBeginIndex() + image.getFirstPipeOffset()) {
          isPMID = false;
        }
      }
    }

    if (isPMID) {

      // Check if it's a template parameter (prefix between the pipe and the value)
      boolean parameter = false;
      if (template != null) {
        for (int paramNum = 0; paramNum < template.getParameterCount(); paramNum++) {
          if ((template.getParameterPipeIndex(paramNum) < index) &&
              (template.getParameterValueStartIndex(paramNum) > index)) {
            parameter = true;
          }
        }
      }

      int beginIndex = index;
      index += prefixLength;
      boolean isCorrect = correct;
      if (!parameter) {

        // Prefix written as an internal link: [[PMID]]
        if ((beginIndex >= 2) && (index + 2 < contents.length())) {
          if (contents.startsWith("[[", beginIndex - 2) &&
              contents.startsWith("]]", index)) {
            isCorrect = false;
            beginIndex -= 2;
            index += 2;
          }
        }

        // Skip separators between the prefix and the number
        boolean spaceFound = false;
        if (analysis.isInComment(index) == null) {
          while ((index < contents.length()) &&
                 (" \u00A0".indexOf(contents.charAt(index)) >= 0)) {
            index++;
            spaceFound = true;
          }
          while ((index < contents.length()) &&
                 (INCORRECT_BEGIN_CHARACTERS.indexOf(contents.charAt(index)) >= 0)) {
            index++;
            isCorrect = false;
          }
        }

        // Collect the number
        int beginNumber = -1;
        int endNumber = beginNumber;
        boolean finished = false;
        isCorrect &= spaceFound; // A space is required between the prefix and the number
        boolean nextCorrect = isCorrect;
        while (!finished && (index < contents.length())) {
          char currentChar = contents.charAt(index);
          if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
            if (beginNumber < 0) {
              beginNumber = index;
            }
            endNumber = index + 1;
            index++;
            // Incorrect characters only matter if they are followed by a digit
            isCorrect = nextCorrect;
          } else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
            if (beginNumber < 0) {
              nextCorrect = false;
            }
            index++;
          } else if (INCORRECT_CHARACTERS.indexOf(currentChar) >= 0) {
            index++;
            nextCorrect = false;
          } else {
            // A letter directly after the last digit makes the PMID incorrect
            if ((endNumber == index) && (Character.isLetter(currentChar))) {
              isCorrect = false;
            }
            finished = true;
          }
        }
        if (endNumber > beginNumber) {
          String number = contents.substring(beginNumber, endNumber);
          pmids.add(new PageElementPMID(
              beginIndex, endNumber, analysis, number,
              isValid, isCorrect, false, false));
          index = endNumber;
        }
      }
    } else {
      index++;
    }
  }
}
/**
 * Check if template parameter is a PMID.
 * <p>
 * Every parameter of the template whose name matches the requested name is
 * examined, and a PMID is added to the list when its value is acceptable.
 * Unnamed parameters are identified by their position ("1", "2", ...).
 *
 * @param analysis Page analysis.
 * @param pmids Current list of PMID (new PMID are added to it).
 * @param template Template.
 * @param argumentName Template parameter name.
 * @param ignoreCase True if parameter name should compared ignoring case.
 * @param acceptNumbers True if numbers are accepted after parameter name.
 * @param acceptAllValues True if all values are accepted, even if not compatible with PMID.
 * @param helpRequested True if help has been requested for this PMID.
 */
private static void analyzeTemplateParams(
    PageAnalysis analysis, List<PageElementPMID> pmids,
    PageElementTemplate template,
    String argumentName,
    boolean ignoreCase, boolean acceptNumbers,
    boolean acceptAllValues, boolean helpRequested) {

  int paramDefaultName = 1;
  for (int paramNum = 0; paramNum < template.getParameterCount(); paramNum++) {

    // Check parameter name
    Parameter param = template.getParameter(paramNum);
    String paramName = param.getComputedName();
    if ((paramName == null) || (paramName.trim().length() == 0)) {
      paramName = Integer.toString(paramDefaultName);
      paramDefaultName++;
    }
    boolean nameOk = false;
    if ((ignoreCase && argumentName.equalsIgnoreCase(paramName)) ||
        (argumentName.equals(paramName))) {
      nameOk = true;
    } else if (acceptNumbers && (paramName.length() > argumentName.length())) {
      // Accept the requested name followed only by digits (PMID1, PMID2, ...).
      // The comparison has to be done on the beginning of the name: the full
      // name is longer than the requested one and can never be equal to it.
      String shortParamName = paramName.substring(0, argumentName.length());
      if ((ignoreCase && argumentName.equalsIgnoreCase(shortParamName)) ||
          (argumentName.equals(shortParamName))) {
        nameOk = true;
        for (int i = argumentName.length(); i < paramName.length(); i++) {
          if (!Character.isDigit(paramName.charAt(i))) {
            nameOk = false;
            break;
          }
        }
      }
    }

    // Check parameter value
    if (nameOk) {
      String paramValue = param.getStrippedValue();
      boolean ok = true;
      boolean hasDigit = false;
      int i = 0;
      int beginIndex = -1;
      int endIndex = -1;
      boolean correct = true;
      while (ok && (i < paramValue.length())) {
        char currentChar = paramValue.charAt(i);
        if (POSSIBLE_CHARACTERS.indexOf(currentChar) >= 0) {
          // Meaningful characters are digits only: track first and last position
          if (beginIndex < 0) {
            beginIndex = i;
          }
          endIndex = i + 1;
          hasDigit = true;
          i++;
        } else if (EXTRA_CHARACTERS.indexOf(currentChar) >= 0) {
          i++;
        } else if (INCORRECT_CHARACTERS.indexOf(currentChar) >= 0) {
          i++;
          correct = false;
        } else {
          ok = false;
        }
      }

      // Convert positions in the value into positions in the page
      int delta = template.getParameterValueStartIndex(paramNum);
      if (beginIndex < 0) {
        beginIndex = 0;
      }
      beginIndex += delta;
      if (endIndex < 0) {
        endIndex = 0;
      }
      endIndex += delta;
      if (beginIndex < 0) {
        ok = false;
      } else {
        // A value made of a number followed only by comments,
        // spaces and line breaks is still accepted
        if (!ok && hasDigit && (paramValue.charAt(i) == '<')) {
          // NOTE(review): beginIndex already includes the offset of the first
          // digit in the value, so beginIndex + i looks shifted when the value
          // doesn't start with a digit (delta + i seems intended) - to confirm.
          PageElementComment comment = analysis.isInComment(beginIndex + i);
          if ((comment != null) &&
              (comment.getBeginIndex() == beginIndex + i)) {
            ok = true;
            i += comment.getEndIndex() - comment.getBeginIndex();
            while (ok && (i < paramValue.length())) {
              char currentChar = paramValue.charAt(i);
              if (currentChar == '<') {
                comment = analysis.isInComment(beginIndex + i);
                if (comment != null) {
                  i += comment.getEndIndex() - comment.getBeginIndex();
                } else {
                  ok = false;
                }
              } else if ((currentChar != ' ') && (currentChar != '\n')) {
                ok = false;
              } else {
                i++;
              }
            }
          }
        }
      }

      if (ok) {
        String value = analysis.getContents().substring(beginIndex, endIndex);
        if (paramValue.length() > 0) {
          pmids.add(new PageElementPMID(
              beginIndex, endIndex, analysis,
              value, true, correct, helpRequested, true));
        }
      } else if (acceptAllValues) {
        // The value is not compatible with a PMID: report it entirely as incorrect
        if (paramValue.length() > 0) {
          pmids.add(new PageElementPMID(
              template.getParameterValueStartIndex(paramNum),
              template.getParameterValueStartIndex(paramNum) + paramValue.length(),
              analysis, paramValue, true, false, false, true));
        }
      }
    }
  }
}
/** WPCleaner configuration, retrieved from the page analysis given to the constructor. */
private final WPCConfiguration wpcConfiguration;
/** PMID as given to the constructor, before any clean up. */
private final String pmidNotTrimmed;
/** PMID after clean up by {@link #cleanPMID(String)}. */
private final String pmid;
/** True if PMID is in a valid location. */
private final boolean isValid;
/** True if PMID syntax is correct. */
private final boolean isCorrect;
/** True if PMID is a template parameter (PMID=...) */
private final boolean isTemplateParameter;
/** True if help has been requested for this PMID */
private final boolean helpRequested;
/**
 * Create a PMID element.
 *
 * @param beginIndex Begin index.
 * @param endIndex End index.
 * @param analysis Page analysis (used to retrieve the WPCleaner configuration).
 * @param pmid PMID, kept as is and also stored in a cleaned up form.
 * @param isValid True if PMID is in a valid location.
 * @param isCorrect True if PMID syntax is correct.
 * @param helpRequested True if help has been requested for this PMID.
 * @param isTemplateParameter True if PMID is a template parameter.
 */
private PageElementPMID(
    int beginIndex, int endIndex, PageAnalysis analysis,
    String pmid, boolean isValid,
    boolean isCorrect, boolean helpRequested,
    boolean isTemplateParameter) {
  super(beginIndex, endIndex);

  // Configuration
  this.wpcConfiguration = analysis.getWPCConfiguration();

  // PMID value, raw and cleaned up
  this.pmidNotTrimmed = pmid;
  this.pmid = cleanPMID(pmid);

  // Flags
  this.isTemplateParameter = isTemplateParameter;
  this.helpRequested = helpRequested;
  this.isCorrect = isCorrect;
  this.isValid = isValid;
}
/**
 * @return PMID as it was given at construction, before clean up.
 */
public String getPMIDNotTrimmed() {
  return this.pmidNotTrimmed;
}

/**
 * @return PMID after clean up.
 */
public String getPMID() {
  return this.pmid;
}

/**
 * @return True if PMID is in a valid location.
 */
public boolean isValid() {
  return this.isValid;
}

/**
 * @return True if PMID syntax is correct.
 */
public boolean isCorrect() {
  return this.isCorrect;
}

/**
 * @return True if help has been requested for this PMID.
 */
public boolean helpRequested() {
  return this.helpRequested;
}

/**
 * @return True if PMID is a template parameter.
 */
public boolean isTemplateParameter() {
  return this.isTemplateParameter;
}
/**
 * Build the list of texts that can replace this PMID.
 * <p>
 * The replacement is based on the PMID before clean up, where some characters
 * are normalized, and is prefixed by "PMID " unless the PMID is a template parameter.
 *
 * @return List of possible PMID.
 */
public List<String> getCorrectPMID() {
  List<String> result = new ArrayList<String>();

  // Construct a basic PMID number
  StringBuilder buffer = new StringBuilder();
  for (char original : pmidNotTrimmed.toCharArray()) {
    char replacement = original;
    boolean meaningful =
        (POSSIBLE_CHARACTERS.indexOf(original) >= 0) ||
        (EXTRA_CHARACTERS.indexOf(original) >= 0);
    if (!meaningful) {
      if ((original == '‐') || (original == '.')) {
        replacement = '-';
      } else if (original == '\t') {
        replacement = ' ';
      }
    }
    buffer.append(replacement);
  }
  String cleanedPMID = buffer.toString().trim();

  // Basic replacement
  String prefix = "PMID ";
  if (isTemplateParameter) {
    prefix = "";
  }
  addCorrectPMID(result, prefix, cleanedPMID);

  return result;
}
/**
 * Add the possible replacements for a cleaned up PMID: the prefixed PMID,
 * and, when the PMID is not a template parameter, one call to each PMID
 * template that the configuration marks as suggested.
 *
 * @param result List of possible replacements.
 * @param prefix PMID prefix.
 * @param cleanedPMID Cleaned up PMID.
 */
private void addCorrectPMID(List<String> result, String prefix, String cleanedPMID) {
  addCorrectPMID(result, prefix + cleanedPMID);
  if (isTemplateParameter()) {
    return;
  }
  List<String[]> pmidTemplates = wpcConfiguration.getStringArrayList(
      WPCConfigurationStringList.PMID_TEMPLATES);
  if (pmidTemplates == null) {
    return;
  }
  for (String[] pmidTemplate : pmidTemplates) {
    // Expected elements: template name, parameter names, suggestion flag
    if (pmidTemplate.length <= 2) {
      continue;
    }
    String[] params = pmidTemplate[1].split(",");
    boolean suggested = Boolean.parseBoolean(pmidTemplate[2]);
    if ((params.length == 0) || !suggested) {
      continue;
    }

    // Only the first parameter name is used; "1" means an unnamed parameter
    StringBuilder buffer = new StringBuilder("{{");
    buffer.append(pmidTemplate[0]).append("|");
    if (!"1".equals(params[0])) {
      buffer.append(params[0]).append("=");
    }
    buffer.append(cleanedPMID).append("}}");
    addCorrectPMID(result, buffer.toString());
  }
}
/**
 * Add a replacement to the list, unless it is already there.
 * Nothing is done if the list or the replacement is null.
 *
 * @param result List of possible replacements.
 * @param correctPMID Possible replacement.
 */
private void addCorrectPMID(List<String> result, String correctPMID) {
  boolean usable = (result != null) && (correctPMID != null);
  if (usable && !result.contains(correctPMID)) {
    result.add(correctPMID);
  }
}
/**
 * Build a call to a template requesting help for this PMID.
 * <p>
 * The elements of the template description are used this way: template name,
 * name of the parameter for the PMID (optional), name of the parameter for the
 * reason (optional), then extra parameters added as they are.
 *
 * @param helpNeededTemplate Name of template for asking for help.
 * @param reason Reason of the request.
 * @return Text for requesting for help, or null if there's no template
 *         or if the PMID is a template parameter.
 */
public String askForHelp(
    String[] helpNeededTemplate, String reason) {
  boolean noTemplate = (helpNeededTemplate == null) || (helpNeededTemplate.length == 0);
  if (noTemplate || isTemplateParameter) {
    return null;
  }
  int count = helpNeededTemplate.length;

  // Template name
  StringBuilder replacement = new StringBuilder("{{");
  replacement.append(helpNeededTemplate[0]);

  // PMID
  replacement.append("|");
  if ((count > 1) && (helpNeededTemplate[1].length() > 0)) {
    replacement.append(helpNeededTemplate[1]).append("=");
  }
  replacement.append(getPMIDNotTrimmed());

  // Reason
  boolean withReason =
      (reason != null) &&
      (count > 2) &&
      (helpNeededTemplate[2].length() > 0);
  if (withReason) {
    replacement.append("|").append(helpNeededTemplate[2]);
    replacement.append("=").append(reason);
  }

  // Extra parameters
  int extra = 3;
  while (extra < count) {
    replacement.append("|").append(helpNeededTemplate[extra]);
    extra++;
  }

  return replacement.append("}}").toString();
}
/**
 * Build a comment requesting help for this PMID.
 *
 * @param comment Comment for asking for help.
 * @param reason Reason of the request (added after the comment if not blank).
 * @return Text for requesting for help, or null if the comment is null or blank.
 */
public String askForHelp(
    String comment, String reason) {
  if (comment == null) {
    return null;
  }
  if (comment.trim().length() == 0) {
    return null;
  }
  boolean hasReason = (reason != null) && (reason.trim().length() > 0);
  StringBuilder replacement = new StringBuilder("<!-- ");
  replacement.append(comment);
  if (hasReason) {
    replacement.append(" - ").append(reason);
  }
  return replacement.append(" -->").toString();
}
/**
 * Clean up a PMID number: only the digits are kept, and comments or
 * reference tags starting on a '&lt;' are skipped entirely.
 *
 * @param pmid PMID number.
 * @return Cleaned up PMID number (null if the PMID is null).
 */
public static String cleanPMID(String pmid) {
  if (pmid == null) {
    return null;
  }
  pmid = pmid.trim();
  if (pmid.length() == 0) {
    return pmid;
  }

  // The analysis is only needed to find comments and reference tags,
  // so it's created only if a '<' is actually found in the value.
  PageAnalysis analysis = null;
  StringBuilder result = new StringBuilder(pmid.length());
  int i = 0;
  while (i < pmid.length()) {
    char current = pmid.charAt(i);
    if (current == '<') {
      if (analysis == null) {
        analysis = new PageAnalysis(null, pmid);
      }
      PageElementComment comment = analysis.isInComment(i);
      if ((comment != null) && (comment.getBeginIndex() == i)) {
        // Move to the last character of the comment (i is incremented below)
        i = comment.getEndIndex() - 1;
      } else {
        PageElementTag refTag = analysis.isInTag(i, PageElementTag.TAG_WIKI_REF);
        if ((refTag != null) && (refTag.getBeginIndex() == i)) {
          // Move to the last character of the reference (i is incremented below)
          i = refTag.getCompleteEndIndex() - 1;
        }
      }
    } else if (POSSIBLE_CHARACTERS.indexOf(current) >= 0) {
      result.append(current);
    }
    i++;
  }
  return result.toString();
}
/**
 * Tell if a value can be a PMID: it must not be null and must still
 * contain something once cleaned up.
 *
 * @param pmidValue PMID value.
 * @return True if PMID value is valid.
 */
public static boolean isValid(String pmidValue) {
  if (pmidValue == null) {
    return false;
  }
  String cleaned = cleanPMID(pmidValue);
  return cleaned.length() != 0;
}
}