package edu.jhu.nlp.wikipedia;
import java.io.InputStream;
import java.util.Vector;
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
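* A DOM-based parser for easy access to Wikipedia XML dumps in native and compressed XML formats.
* The whole dump is parsed into a DOM document before any page is delivered, so memory use
* grows with the size of the dump.<br>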
*
* Typical pattern of use:<p>
* <code>
* WikiXMLDOMParser wxp = new WikiXMLDOMParser("enwiki-latest-pages-articles.xml");<br>
* wxp.setPageCallback(...);<br>
* wxp.parse();<br>
* </code><p>
* or<p>
* <code>
* WikiXMLDOMParser wxp = new WikiXMLDOMParser("enwiki-latest-pages-articles.xml");<br>
* wxp.parse();<br>
* WikiPageIterator it = wxp.getIterator();<br>
* ...
* </code>
* @author Delip Rao
*
*/
public class WikiXMLDOMParser extends WikiXMLParser {

    /** Xerces feature URI that toggles deferred (lazy) expansion of DOM nodes. */
    private static final String FEATURE_URI =
            "http://apache.org/xml/features/dom/defer-node-expansion";

    private final DOMParser domParser = new DOMParser();

    /** Pages collected by {@link #parse()} when no callback is set; {@code null} until then. */
    private Vector<WikiPage> pageList = null;

    /** Optional per-page callback; when set, pages are not collected in {@link #pageList}. */
    private PageCallbackHandler pageHandler = null;

    /**
     * Creates a parser for the dump with the given file name.
     * @param fileName name of the dump file, passed unchanged to the superclass
     */
    public WikiXMLDOMParser(String fileName) {
        super(fileName);
    }

    /**
     * Creates a parser reading the dump from a stream.
     * @param is stream containing the dump, passed unchanged to the superclass
     */
    public WikiXMLDOMParser(InputStream is) {
        super(is);
    }

    /**
     * Set a callback handler. The callback is executed once for every
     * page element found in the document. Custom handlers are
     * implementations of {@link PageCallbackHandler}.
     * @param handler the handler that receives each page
     * @throws Exception if pages have already been collected by a previous
     *         call to {@link #parse()} made without a callback
     */
    public void setPageCallback(PageCallbackHandler handler) throws Exception {
        if (pageList != null) {
            throw new Exception("Set the callback before calling parse()");
        }
        pageHandler = handler;
    }

    /**
     * Returns an iterator over the pages collected by {@link #parse()}.
     * If {@link #parse()} has not been called yet, the iterator is built
     * over a {@code null} list (NOTE(review): confirm how
     * {@code WikiPageIterator} treats a null list).
     * @return an iterator to the list of pages
     * @throws Exception if a page callback is set, since pages are then
     *         handed to the callback instead of being collected
     */
    public WikiPageIterator getIterator() throws Exception {
        if (pageHandler != null) {
            throw new Exception("page callback found. Cannot iterate.");
        }
        return new WikiPageIterator(pageList);
    }

    /**
     * The main parse method. Parses the whole input into a DOM document,
     * then turns every {@code page} element into a {@link WikiPage}. Each
     * page is passed to the callback if one is set, otherwise it is added
     * to the list served by {@link #getIterator()}.
     * @throws Exception on any parser or handler failure
     */
    public void parse() throws Exception {
        if (pageHandler == null) {
            pageList = new Vector<WikiPage>();
        }
        domParser.setFeature(FEATURE_URI, true);
        domParser.parse(this.getInputSource());
        Document doc = domParser.getDocument();
        NodeList pages = doc.getElementsByTagName("page");
        for (int i = 0; i < pages.getLength(); i++) {
            WikiPage wpage = toWikiPage(pages.item(i));
            if (pageHandler != null) {
                pageHandler.process(wpage);
            } else {
                pageList.add(wpage);
            }
        }
    }

    /**
     * Builds a {@link WikiPage} from a {@code page} element: title and id
     * come from its direct children, the wiki text from the {@code text}
     * child of each {@code revision} child.
     */
    private static WikiPage toWikiPage(Node pageNode) throws Exception {
        WikiPage wpage = new WikiPage();
        NodeList childNodes = pageNode.getChildNodes();
        for (int j = 0; j < childNodes.getLength(); j++) {
            Node child = childNodes.item(j);
            String name = child.getNodeName();
            if (name.equals("title")) {
                wpage.setTitle(elementText(child));
            } else if (name.equals("id")) {
                wpage.setID(elementText(child));
            } else if (name.equals("revision")) {
                NodeList revchilds = child.getChildNodes();
                for (int k = 0; k < revchilds.getLength(); k++) {
                    Node rchild = revchilds.item(k);
                    if (rchild.getNodeName().equals("text")) {
                        wpage.setWikiText(elementText(rchild));
                    }
                }
            }
        }
        return wpage;
    }

    /**
     * Returns the concatenated character data of an element's direct text
     * and CDATA children. An empty element (e.g. {@code <text/>}) yields an
     * empty string rather than failing on a missing first child, and content
     * split across several text nodes is returned in full.
     */
    private static String elementText(Node element) {
        StringBuilder text = new StringBuilder();
        for (Node n = element.getFirstChild(); n != null; n = n.getNextSibling()) {
            short type = n.getNodeType();
            if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
                text.append(n.getNodeValue());
            }
        }
        return text.toString();
    }
}