/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2016 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.api.dump;
import org.wikipediacleaner.api.data.DataManager;
import org.wikipediacleaner.api.data.Page;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler for page elements.
 *
 * <p>For each {@code page} element, the handler collects the text of its direct
 * {@code title}, {@code ns} and {@code id} children, and the text of the
 * {@code id} and {@code text} children of its {@code revision} elements.
 * When the {@code page} element ends, a {@link Page} is built from the collected
 * values and handed to the configured {@link PageProcessor}, if any.</p>
 *
 * <p>Elements are matched by name <em>and</em> by nesting depth: only direct
 * children of {@code page} and direct children of {@code revision} are considered.
 * This prevents an {@code id} nested deeper (for example inside a
 * {@code contributor} element) from overwriting the page id or the revision id.</p>
 *
 * <p>If a page contains several {@code revision} elements, the values of the
 * last one are the ones kept, since each new revision resets the revision buffers.</p>
 */
public class PageHandler extends DefaultHandler {

  /** Depth, relative to the page element, of the direct children of a page. */
  private static final int PAGE_CHILD_DEPTH = 1;

  /** Depth, relative to the page element, of the direct children of a revision. */
  private static final int REVISION_CHILD_DEPTH = 2;

  /** True when parsing a page */
  private boolean isInPage;

  /** Depth of the current element relative to the enclosing page element (0 = the page itself) */
  private int depth;

  /** True when parsing a title */
  private boolean isInTitle;

  /** Page title */
  private final StringBuilder title;

  /** True when parsing name space */
  private boolean isInNamespace;

  /** Name space */
  private final StringBuilder namespace;

  /** True when parsing a page id */
  private boolean isInPageId;

  /** Page id */
  private final StringBuilder pageId;

  /** True when parsing a revision */
  private boolean isInRevision;

  /** True when parsing a revision id */
  private boolean isInRevisionId;

  /** Revision id */
  private final StringBuilder revisionId;

  /** True when parsing a revision text */
  private boolean isInRevisionText;

  /** Revision text */
  private final StringBuilder revisionText;

  /** Page processor */
  private PageProcessor processor;

  /**
   * Constructor.
   */
  public PageHandler() {
    isInPage = false;
    title = new StringBuilder();
    namespace = new StringBuilder();
    pageId = new StringBuilder();
    revisionId = new StringBuilder();
    revisionText = new StringBuilder();
    cleanPageInformation();
  }

  /**
   * @param processor Page processor. If <code>null</code>, pages are parsed but not delivered.
   */
  public void setPageProcessor(PageProcessor processor) {
    this.processor = processor;
  }

  /**
   * Receive notification of the start of an element.
   *
   * @param uri The Namespace URI, or the empty string if the
   *        element has no Namespace URI or if Namespace
   *        processing is not being performed.
   * @param localName The local name (without prefix), or the
   *        empty string if Namespace processing is not being
   *        performed.
   * @param qName The qualified name (with prefix), or the
   *        empty string if qualified names are not available.
   * @param attributes The attributes attached to the element. If
   *        there are no attributes, it shall be an empty
   *        Attributes object.
   * @exception org.xml.sax.SAXException Any SAX exception, possibly
   *            wrapping another exception.
   * @see org.xml.sax.ContentHandler#startElement
   */
  @Override
  public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
    if (!isInPage) {
      if (qName.equalsIgnoreCase("page")) {
        isInPage = true;
        cleanPageInformation();
      }
      return;
    }

    // Every element opened inside a page is one level deeper than its parent.
    depth++;

    if (isInRevision) {
      // Only direct children of the revision matter: a deeper <id>
      // (e.g. the contributor's id) must not replace the revision id.
      if (depth != REVISION_CHILD_DEPTH) {
        return;
      }
      if (qName.equalsIgnoreCase("id")) {
        isInRevisionId = true;
        revisionId.setLength(0);
      } else if (qName.equalsIgnoreCase("text")) {
        isInRevisionText = true;
        revisionText.setLength(0);
      }
      return;
    }

    // Outside a revision, only direct children of the page are of interest.
    if (depth != PAGE_CHILD_DEPTH) {
      return;
    }
    if (qName.equalsIgnoreCase("title")) {
      isInTitle = true;
      title.setLength(0);
    } else if (qName.equalsIgnoreCase("ns")) {
      isInNamespace = true;
      namespace.setLength(0);
    } else if (qName.equalsIgnoreCase("id")) {
      isInPageId = true;
      pageId.setLength(0);
    } else if (qName.equalsIgnoreCase("revision")) {
      isInRevision = true;
      isInRevisionId = false;
      revisionId.setLength(0);
      isInRevisionText = false;
      revisionText.setLength(0);
    }
  }

  /**
   * Receive notification of the end of an element.
   *
   * @param uri The Namespace URI, or the empty string if the
   *        element has no Namespace URI or if Namespace
   *        processing is not being performed.
   * @param localName The local name (without prefix), or the
   *        empty string if Namespace processing is not being
   *        performed.
   * @param qName The qualified name (with prefix), or the
   *        empty string if qualified names are not available.
   * @exception org.xml.sax.SAXException Any SAX exception, possibly
   *            wrapping another exception.
   * @see org.xml.sax.ContentHandler#endElement
   */
  @Override
  public void endElement(String uri, String localName, String qName) throws SAXException {
    if (!isInPage) {
      return;
    }

    // Depth 0 is the page element itself: its end tag completes the page.
    if ((depth == 0) && qName.equalsIgnoreCase("page")) {
      deliverPage();
      isInPage = false;
      cleanPageInformation();
      return;
    }

    // Depth of the element being closed; the current depth then moves back to its parent.
    final int closedDepth = depth;
    if (depth > 0) {
      depth--;
    }

    if (isInRevision) {
      if (closedDepth == PAGE_CHILD_DEPTH) {
        if (qName.equalsIgnoreCase("revision")) {
          isInRevision = false;
          isInRevisionId = false;
          isInRevisionText = false;
        }
      } else if (closedDepth == REVISION_CHILD_DEPTH) {
        if (qName.equalsIgnoreCase("id")) {
          isInRevisionId = false;
        } else if (qName.equalsIgnoreCase("text")) {
          isInRevisionText = false;
        }
      }
      return;
    }

    if (closedDepth != PAGE_CHILD_DEPTH) {
      return;
    }
    if (qName.equalsIgnoreCase("title")) {
      isInTitle = false;
    } else if (qName.equalsIgnoreCase("ns")) {
      isInNamespace = false;
    } else if (qName.equalsIgnoreCase("id")) {
      isInPageId = false;
    }
  }

  /**
   * Receive notification of character data inside an element.
   *
   * The characters are appended to the buffer of the element currently being
   * parsed, if it is one of the tracked elements; otherwise they are ignored.
   *
   * @param ch The characters.
   * @param start The start position in the character array.
   * @param length The number of characters to use from the
   *               character array.
   * @exception org.xml.sax.SAXException Any SAX exception, possibly
   *            wrapping another exception.
   * @see org.xml.sax.ContentHandler#characters
   */
  @Override
  public void characters(char[] ch, int start, int length) throws SAXException {
    if (!isInPage) {
      return;
    }
    if (isInRevision) {
      if (isInRevisionId) {
        revisionId.append(ch, start, length);
      } else if (isInRevisionText) {
        revisionText.append(ch, start, length);
      }
    } else if (isInTitle) {
      title.append(ch, start, length);
    } else if (isInNamespace) {
      namespace.append(ch, start, length);
    } else if (isInPageId) {
      pageId.append(ch, start, length);
    }
  }

  /**
   * Build a page from the collected information and hand it to the processor.
   *
   * Nothing is done if no processor is configured. If the collected page id
   * is not a valid decimal integer, the problem is reported on the standard
   * error stream and the page is not delivered.
   */
  private void deliverPage() {
    if (processor == null) {
      return;
    }
    try {
      Page page = DataManager.getPage(
          processor.getWiki(), title.toString(),
          Integer.valueOf(pageId.toString(), 10), revisionId.toString(),
          null);
      page.setNamespace(namespace.toString());
      page.setContents(revisionText.toString());
      processor.processPage(page);
    } catch (NumberFormatException e) {
      System.err.println("Problem in endElement: " + e.getMessage());
    }
  }

  /**
   * Clean current page information.
   */
  private void cleanPageInformation() {
    depth = 0;
    isInTitle = false;
    title.setLength(0);
    isInNamespace = false;
    namespace.setLength(0);
    isInPageId = false;
    pageId.setLength(0);
    isInRevision = false;
    isInRevisionId = false;
    revisionId.setLength(0);
    isInRevisionText = false;
    revisionText.setLength(0);
  }
}