/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.gui.swing.bot;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import org.wikipediacleaner.api.API;
import org.wikipediacleaner.api.APIException;
import org.wikipediacleaner.api.APIFactory;
import org.wikipediacleaner.api.MediaWikiController;
import org.wikipediacleaner.api.MediaWikiListener;
import org.wikipediacleaner.api.check.CheckErrorResult;
import org.wikipediacleaner.api.check.CheckErrorResult.ErrorLevel;
import org.wikipediacleaner.api.check.algorithm.CheckErrorAlgorithm;
import org.wikipediacleaner.api.constants.EnumQueryResult;
import org.wikipediacleaner.api.constants.EnumWikipedia;
import org.wikipediacleaner.api.data.DataManager;
import org.wikipediacleaner.api.data.Page;
import org.wikipediacleaner.api.data.PageAnalysis;
import org.wikipediacleaner.api.data.PageElementComment;
import org.wikipediacleaner.api.data.PageElementInternalLink;
import org.wikipediacleaner.api.dump.DumpProcessor;
import org.wikipediacleaner.api.dump.PageProcessor;
import org.wikipediacleaner.api.execution.MediaWikiCallable;
import org.wikipediacleaner.gui.swing.basic.BasicWindow;
import org.wikipediacleaner.gui.swing.basic.BasicWorker;
import org.wikipediacleaner.gui.swing.basic.Utilities;
import org.wikipediacleaner.i18n.GT;
/**
* SwingWorker for listing Check Wiki errors from a dump.
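*
* A sketch of typical usage (the dump file name is illustrative):
* <pre>
* ListCWWorker worker = new ListCWWorker(
*     wiki, window,
*     new File("wiki-latest-pages-articles.xml.bz2"), outputDir,
*     algorithms, true);
* worker.start();
* </pre>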
*/
public class ListCWWorker extends BasicWorker {
/** File containing the dump */
private final File dumpFile;
/** Directory (or file with placeholder for error number) in which the output is written */
private final File output;
/** Page name (with placeholder for error number) in which the output is written */
private final String pageName;
/** Algorithms for which to analyze pages */
final List<CheckErrorAlgorithm> selectedAlgorithms;
/** True if the latest version of each article should be checked on the wiki */
final boolean checkWiki;
/** True to only recheck pages that have previously been reported */
final boolean onlyRecheck;
/** List of errors found for each algorithm */
final Map<CheckErrorAlgorithm, Map<String, Detection>> detections;
/** Count of pages analyzed */
int countAnalyzed;
/** Count of pages found with errors */
int countDetections;
/**
* @param wiki Wiki.
* @param window Window.
* @param dumpFile File containing the dump to be analyzed.
* @param output Directory (or file with placeholder for error number) in which the output is written.
* @param selectedAlgorithms List of selected algorithms.
* @param checkWiki True if the latest version of each article should be checked on the wiki.
*/
public ListCWWorker(
EnumWikipedia wiki, BasicWindow window,
File dumpFile, File output,
List<CheckErrorAlgorithm> selectedAlgorithms,
boolean checkWiki) {
super(wiki, window);
this.dumpFile = dumpFile;
this.output = output;
this.pageName = null;
this.selectedAlgorithms = selectedAlgorithms;
this.detections = new HashMap<>();
this.countAnalyzed = 0;
this.countDetections = 0;
this.checkWiki = checkWiki;
this.onlyRecheck = false;
}
/**
* @param wiki Wiki.
* @param window Window.
* @param dumpFile File containing the dump to be analyzed.
* @param pageName Page name (with placeholder for error number) in which the output is written.
* @param selectedAlgorithms List of selected algorithms.
* @param checkWiki True if the latest version of each article should be checked on the wiki.
* @param onlyRecheck True to only recheck pages that have previously been reported.
*/
public ListCWWorker(
EnumWikipedia wiki, BasicWindow window,
File dumpFile, String pageName,
List<CheckErrorAlgorithm> selectedAlgorithms,
boolean checkWiki, boolean onlyRecheck) {
super(wiki, window);
this.dumpFile = dumpFile;
this.output = null;
this.pageName = pageName;
this.selectedAlgorithms = selectedAlgorithms;
this.detections = new HashMap<>();
this.countAnalyzed = 0;
this.countDetections = 0;
this.checkWiki = checkWiki;
this.onlyRecheck = onlyRecheck;
}
/**
* Compute the value to be returned by the <code>get</code> method.
*
* @return Object returned by the <code>get</code> method.
* @see org.wikipediacleaner.gui.swing.basic.BasicWorker#construct()
*/
@Override
public Object construct() {
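// Check inputs: a readable dump file, a usable output target and at least one algorithm are required.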
if ((dumpFile == null) || !dumpFile.canRead() || !dumpFile.isFile()) {
return null;
}
if ((output == null) && (pageName == null)) {
return null;
}
if (output != null) {
if (!output.canWrite()) {
return null;
}
if (!output.getName().contains("{0}") && !output.isDirectory()) {
return null;
}
}
if ((selectedAlgorithms == null) || selectedAlgorithms.isEmpty()) {
return null;
}
CWPageProcessor pageProcessor = new CWPageProcessor(getWikipedia(), this);
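// In recheck mode, seed the processor with the pages currently linked
// from each report page, so that only those pages are re-analyzed.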
if (onlyRecheck) {
try {
List<Page> outputPages = new ArrayList<>();
for (CheckErrorAlgorithm algorithm : selectedAlgorithms) {
String truePageName = MessageFormat.format(pageName, algorithm.getErrorNumberString());
Page page = DataManager.getPage(getWikipedia(), truePageName, null, null, null);
outputPages.add(page);
}
API api = APIFactory.getAPI();
api.retrieveLinks(getWikipedia(), outputPages);
for (Page page : outputPages) {
List<Page> links = page.getLinks();
if (links != null) {
for (Page link : links) {
pageProcessor.addPage(link);
}
}
}
} catch (APIException e) {
// Nothing to do
}
}
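// Stream every page of the dump through the processor; matching pages are analyzed by background tasks.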
DumpProcessor dumpProcessor = new DumpProcessor(pageProcessor);
dumpProcessor.processDump(dumpFile);
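// Poll until all background analysis tasks have completed.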
while (!pageProcessor.hasFinished()) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
// Nothing to do
}
}
System.out.println(
"Total pages processed: " + countAnalyzed +
" / errors detected: " + countDetections);
for (CheckErrorAlgorithm algorithm : selectedAlgorithms) {
Map<String, Detection> pages = detections.get(algorithm);
if (pages == null) {
pages = new HashMap<>();
}
outputResult(algorithm, pages.values());
}
return null;
}
/**
* Generate the formatted result for a list of detections. Detections whose
* lines do not fit within <code>maxSize</code> are removed from <code>pages</code>.
*
* @param pages List of detections.
* @param maxSize Maximum size of the result, or null for no limit.
* @return Formatted result.
*/
private String generateResult(List<Detection> pages, Long maxSize) {
StringBuilder buffer = new StringBuilder();
buffer.append("<!-- Generated using ");
buffer.append(dumpFile.getName());
buffer.append(" -->\n");
ErrorLevel lastLevel = null;
StringBuilder line = new StringBuilder();
List<Detection> pagesToRemove = new ArrayList<>();
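// Build one line per detection, with a comment header each time the error level
// changes; lines that would push the buffer past maxSize are set aside for removal.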
for (Detection detection : pages) {
line.setLength(0);
if ((detection.maxLevel != null) &&
!detection.maxLevel.equals(lastLevel)) {
lastLevel = detection.maxLevel;
line.append("<!-- ");
line.append(lastLevel.toString());
line.append(" -->\n");
}
line.append("* ");
line.append(PageElementInternalLink.createInternalLink(
detection.pageName, null));
line.append(": ");
if (detection.notices != null) {
boolean first = true;
for (String notice : detection.notices) {
if (!first) {
line.append(", ");
}
first = false;
line.append("<nowiki>");
notice = notice.replaceAll("\n", "\u21b5"); // Replace \n by a visual character
notice = notice.replaceAll("\\<", "<"); // Replace "<" by its HTML element
notice = notice.replaceAll("\u007F", "[DEL]"); // Replace control characters by visible text
notice = notice.replaceAll("\u00A0", "[NBSP]");
notice = notice.replaceAll("\u00AD", "[SHY]");
notice = notice.replaceAll("\u2004", "[3EM]");
notice = notice.replaceAll("\u2005", "[4EM]");
notice = notice.replaceAll("\u2006", "[6EM]");
notice = notice.replaceAll("\u2007", "[FS]");
notice = notice.replaceAll("\u2008", "[PS]");
notice = notice.replaceAll("\u2004", "[3EM]");
notice = notice.replaceAll("\u200B", "[0WS]");
notice = notice.replaceAll("\u200E", "[LRM]");
notice = notice.replaceAll("\u2028", "[LS]");
notice = notice.replaceAll("\u202A", "[LRE]");
notice = notice.replaceAll("\u202C", "[POPD]");
notice = notice.replaceAll("\uFEFF", "[BOM]");
line.append(notice);
line.append("</nowiki>");
}
}
line.append("\n");
if ((maxSize == null) ||
(buffer.length() + line.length() < maxSize)) {
buffer.append(line);
} else {
pagesToRemove.add(detection);
}
}
pages.removeAll(pagesToRemove);
return buffer.toString();
}
/**
* Output result of the analysis.
*
* @param algorithm Algorithm.
* @param pages List of pages with detections.
*/
private void outputResult(CheckErrorAlgorithm algorithm, Collection<Detection> pages) {
if ((algorithm == null) || (pages == null)) {
return;
}
// Prepare result
List<Detection> tmpPages = new ArrayList<>(pages);
Collections.sort(tmpPages);
int nbPages = tmpPages.size();
String result = generateResult(tmpPages, null);
// Output to file
if (output != null) {
File outputFile = null;
if (!output.getName().contains("{0}")) {
outputFile = new File(
output,
"CW_" + getWikipedia().getSettings().getCodeCheckWiki() + "_" + algorithm.getErrorNumberString() + ".txt");
} else {
outputFile = new File(MessageFormat.format(output.getAbsolutePath(), algorithm.getErrorNumberString()));
}
BufferedWriter writer = null;
try {
writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile, false), StandardCharsets.UTF_8));
writer.write(result);
} catch (IOException e) {
// Nothing to do
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException e) {
// Nothing to do
}
}
}
}
// Output to a page
if (pageName != null) {
boolean finished = false;
while (!finished) {
try {
finished = true;
String truePageName = MessageFormat.format(pageName, algorithm.getErrorNumberString());
Page page = DataManager.getPage(getWikipedia(), truePageName, null, null, null);
API api = APIFactory.getAPI();
api.retrieveContents(getWikipedia(), Collections.singletonList(page), false, false);
String contents = page.getContents();
if (contents != null) {
int begin = -1;
int end = -1;
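// Locate the text between the <!-- BOT BEGIN --> and <!-- BOT END --> comments:
// only this part of the page is replaced, the rest is kept as is.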
for (PageElementComment comment : page.getAnalysis(contents, true).getComments()) {
String value = comment.getComment().trim();
if ("BOT BEGIN".equals(value)) {
if (begin < 0) {
begin = comment.getEndIndex();
}
} else if ("BOT END".equals(value)) {
end = comment.getBeginIndex();
}
}
if ((begin >= 0) && (end > begin)) {
String text = null;
finished = false;
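// Drop detections from the end of the sorted list until the resulting page
// fits within the wiki's maximum article size.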
while (!finished) {
StringBuilder newText = new StringBuilder();
newText.append(contents.substring(0, begin));
newText.append("\n");
newText.append(result);
newText.append(contents.substring(end));
text = newText.toString();
if (!getWikipedia().getWikiConfiguration().isArticleTooLong(text) ||
tmpPages.isEmpty()) {
finished = true;
} else {
tmpPages.remove(tmpPages.size() - 1);
result = generateResult(
tmpPages,
getWikipedia().getWikiConfiguration().getMaxArticleSize());
}
}
try {
api.updatePage(
getWikipedia(), page, text,
"Dump analysis for error n°" + algorithm.getErrorNumberString() + " (" + nbPages + " pages)",
true, false);
} catch (APIException e) {
if (EnumQueryResult.CONTENT_TOO_BIG.equals(e.getQueryResult())) {
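// The server still rejected the page as too big: drop up to 100 more
// detections, rebuild the result and retry.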
for (int i = 0; i < 100; i++) {
if (!tmpPages.isEmpty()) {
finished = false;
tmpPages.remove(tmpPages.size() - 1);
}
}
result = generateResult(
tmpPages,
getWikipedia().getWikiConfiguration().getMaxArticleSize());
} else {
throw e;
}
}
}
}
} catch (APIException e) {
// Nothing
}
}
}
}
/**
* Called on the event dispatching thread (not on the worker thread)
* after the <code>construct</code> method has returned.
*
* @see org.wikipediacleaner.gui.swing.basic.BasicWorker#finished()
*/
@Override
public void finished() {
super.finished();
if (getWindow() != null) {
StringBuilder message = new StringBuilder();
message.append(GT.__(
"{0} page has been analyzed",
"{0} pages have been analyzed",
countAnalyzed, Integer.toString(countAnalyzed)));
for (Entry<CheckErrorAlgorithm, Map<String, Detection>> error : detections.entrySet()) {
if ((error != null) && (error.getKey() != null) && (error.getValue() != null)) {
CheckErrorAlgorithm algorithm = error.getKey();
Map<String, Detection> pages = error.getValue();
message.append("\n");
message.append(GT.__(
"{0} page has been detected for algorithm {1}",
"{0} pages have been detected for algorithm {1}",
pages.size(), new Object[] {
pages.size(),
algorithm.getErrorNumberString() + " - " + algorithm.getShortDescription()}));
}
}
Utilities.displayInformationMessage(
getWindow().getParentComponent(), message.toString());
}
}
/**
* Controller for background tasks.
*/
private class CWController extends MediaWikiController {
/**
* @param listener Listener to MediaWiki events.
*/
public CWController(MediaWikiListener listener) {
super(listener);
}
/**
* @param task Task to be performed in background.
* @see org.wikipediacleaner.api.MediaWikiController#addTask(java.util.concurrent.Callable)
*/
@Override
public void addTask(Callable<?> task) {
hasFinished(); // To clean up done tasks
super.addTask(task);
}
/**
* @return True if all tasks are completed.
*/
public boolean hasFinished() {
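// Consume at most one completed task per call; finished only when no task remains.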
if (hasRemainingTask()) {
Future<?> result = getNextDoneResult();
if (result == null) {
return false;
}
}
return !hasRemainingTask();
}
}
/**
* Background task.
*/
private class CWPageCallable extends MediaWikiCallable<Page> {
/** Page to analyze */
private final Page page;
/**
* @param wiki Wiki.
* @param listener Listener to MediaWiki events.
* @param api MediaWiki API.
* @param page Page to analyze.
*/
public CWPageCallable(
EnumWikipedia wiki, MediaWikiListener listener, API api,
Page page) {
super(wiki, listener, api);
this.page = page;
}
/* (non-Javadoc)
* @see java.util.concurrent.Callable#call()
*/
@Override
public Page call() throws APIException {
EnumWikipedia wiki = getWikipedia();
PageAnalysis analysis = page.getAnalysis(page.getContents(), false);
Page currentPage = null;
PageAnalysis currentAnalysis = null;
for (CheckErrorAlgorithm algorithm : selectedAlgorithms) {
List<CheckErrorResult> errors = new ArrayList<>();
if (!algorithm.isInWhiteList(page.getTitle()) &&
algorithm.analyze(analysis, errors, false)) {
boolean detectionConfirmed = false;
// Confirm detection
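// The dump may be stale: retrieve the current version of the page and
// re-run the algorithm on it, so that errors already fixed are not reported.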
if (checkWiki) {
try {
if (currentPage == null) {
currentPage = DataManager.getPage(wiki, page.getTitle(), null, null, null);
}
if (currentAnalysis == null) {
api.retrieveContents(wiki, Collections.singleton(currentPage), false, false);
if (currentPage.getContents().equals(page.getContents())) {
currentAnalysis = analysis;
} else {
currentAnalysis = currentPage.getAnalysis(currentPage.getContents(), false);
}
}
if (currentAnalysis == analysis) {
detectionConfirmed = true;
} else {
errors.clear();
if (algorithm.analyze(currentAnalysis, errors, false)) {
detectionConfirmed = true;
}
}
} catch (APIException e) {
// Nothing to do
}
} else {
detectionConfirmed = true;
currentPage = page;
}
// Memorize detection
if (detectionConfirmed) {
System.out.println(
"Detection confirmed for " + page.getTitle() +
": " + algorithm.getErrorNumberString() +
" - " + algorithm.getShortDescription());
Map<String, Detection> pages = detections.get(algorithm);
if (pages == null) {
pages = new HashMap<>();
detections.put(algorithm, pages);
}
pages.put(currentPage.getTitle(), new Detection(currentPage, errors));
countDetections++;
}
}
}
countAnalyzed++;
if (countAnalyzed % 100000 == 0) {
System.out.println(
"Pages processed: " + countAnalyzed +
" / errors detected: " + countDetections);
}
if (countAnalyzed % 1000 == 0) {
setText(GT._("{0} pages processed", Integer.toString(countAnalyzed)));
}
return page;
}
}
/**
* Process pages in the dump.
*/
private class CWPageProcessor implements PageProcessor {
/** Wiki */
private final EnumWikipedia wiki;
/** Listener */
private final MediaWikiListener listener;
/** Controller for background tasks */
private final CWController controller;
/** API */
private final API api;
/** Restrict the processing to this list of pages */
private List<String> pagesList;
/**
* @param wiki Wiki.
* @param listener Listener.
*/
public CWPageProcessor(EnumWikipedia wiki, MediaWikiListener listener) {
this.wiki = wiki;
this.listener = listener;
this.controller = new CWController(null);
this.api = APIFactory.getAPI();
}
/**
* @return Wiki.
* @see org.wikipediacleaner.api.dump.PageProcessor#getWiki()
*/
@Override
public EnumWikipedia getWiki() {
return wiki;
}
/**
* Add a page to the list of pages to check.
*
* @param page Page to be checked.
*/
public void addPage(Page page) {
if (page == null) {
return;
}
String title = page.getTitle();
if (pagesList == null) {
pagesList = new ArrayList<>();
}
if (!pagesList.contains(title)) {
pagesList.add(title);
}
}
/**
* @param page Page.
* @see org.wikipediacleaner.api.dump.PageProcessor#processPage(org.wikipediacleaner.api.data.Page)
*/
@Override
public void processPage(Page page) {
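// Analyze only pages in the main namespace; when a page list has been set
// (recheck mode), skip pages that are not in it.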
if ((page != null) && page.isInMainNamespace()) {
if ((pagesList == null) || pagesList.contains(page.getTitle())) {
controller.addTask(new CWPageCallable(wiki, listener, api, page));
}
}
}
/**
* @return True if all tasks are completed.
*/
public boolean hasFinished() {
return controller.hasFinished();
}
}
/**
* Bean for holding detection results.
*/
static class Detection implements Comparable<Detection> {
/** Page name */
public final String pageName;
/** List of notices */
public final List<String> notices;
/** Maximum level for the errors */
public final ErrorLevel maxLevel;
/**
* @param page Page.
* @param errors List of errors.
*/
public Detection(Page page, List<CheckErrorResult> errors) {
this.pageName = page.getTitle();
this.notices = new ArrayList<>();
ErrorLevel tmpLevel = ErrorLevel.CORRECT;
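// ErrorLevel ordinals increase as severity decreases,
// so the most severe level encountered has the lowest ordinal.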
if (errors != null) {
for (CheckErrorResult error : errors) {
String contents = page.getContents();
if (contents != null) {
notices.add(contents.substring(error.getStartPosition(), error.getEndPosition()));
}
ErrorLevel currentLevel = error.getErrorLevel();
if (currentLevel.ordinal() < tmpLevel.ordinal()) {
tmpLevel = currentLevel;
}
}
}
this.maxLevel = tmpLevel;
}
/**
* @param o Other detection.
* @return Comparison of the two detections: most severe error level first, then by page name.
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
@Override
public int compareTo(Detection o) {
if (o == null) {
return -1;
}
// Compare error level
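// Lower ordinal means higher severity: most severe detections sort first.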
if (!maxLevel.equals(o.maxLevel)) {
if (maxLevel.ordinal() < o.maxLevel.ordinal()) {
return -1;
}
return 1;
}
// Compare pages
if (pageName == null) {
if (o.pageName == null) {
return 0;
}
return 1;
}
if (o.pageName == null) {
return -1;
}
return pageName.compareTo(o.pageName);
}
}
}