package org.wikipedia.miner.extract.model;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.* ;
import java.io.*;
import javax.xml.stream.*;
import org.apache.log4j.*;
import org.wikipedia.miner.extract.util.Util;
import org.wikipedia.miner.extract.util.Languages.Language;
import org.wikipedia.miner.extract.util.Languages.NamespaceAlias;
import org.wikipedia.miner.extract.util.SiteInfo;
import org.wikipedia.miner.extract.util.SiteInfo.Namespace;
import org.wikipedia.miner.model.Page.PageType ;
/**
* @author David Milne
*
* Parses the markup of a {@code <page>} element from a MediaWiki dump and converts it into a DumpPage object.
*/
public class DumpPageParser {

    /** Looked up once rather than on every ignored page. */
    private static final Logger LOGGER = Logger.getLogger(DumpPageParser.class);

    /**
     * Element names this parser reacts to. The constants are lower case on purpose:
     * they are matched against XML local names via {@code valueOf}. Anything that
     * does not match is mapped to {@code ignorable}.
     */
    private enum DumpTag {page, id, title, text, timestamp, ignorable}

    private final XMLInputFactory xmlStreamFactory = XMLInputFactory.newInstance();
    private final Language language;
    private final SiteInfo siteInfo;

    /**
     * Format of the {@code <timestamp>} element. NOTE: SimpleDateFormat is not
     * thread-safe, so an instance of this parser should not be shared between threads.
     */
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");

    /**
     * Creates a parser for pages of a particular language edition.
     *
     * @param lc the language, used for namespace aliases and the redirect/disambiguation patterns
     * @param si the site info, used to resolve namespaces
     */
    public DumpPageParser(Language lc, SiteInfo si) {
        this.language = lc;
        this.siteInfo = si;

        // The trailing 'Z' in the pattern is a quoted literal, so the formatter would
        // otherwise interpret timestamps in the JVM's default time zone. Dump timestamps
        // are UTC, so pin the zone explicitly.
        dateFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));

        // Page markup never needs DTDs or external entities; switch them off so that
        // hostile or corrupt input cannot trigger entity expansion or external fetches.
        xmlStreamFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlStreamFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
    }

    /**
     * Parses the XML markup of a single page into a DumpPage.
     *
     * @param markup the XML of one page element
     * @return the parsed page, or {@code null} if the page is not in the main,
     *         category or template namespace
     * @throws XMLStreamException if the markup is not well formed, if the page id is
     *         not a number, or if the id, title or text is missing
     */
    public DumpPage parsePage(String markup) throws XMLStreamException {
        Integer id = null;
        String title = null;
        String text = null;
        Date lastEdited = null;

        // Accumulates character data; flushed at every closing tag.
        StringBuilder characters = new StringBuilder();

        XMLStreamReader xmlStreamReader = xmlStreamFactory.createXMLStreamReader(new StringReader(markup));
        try {
            while (xmlStreamReader.hasNext()) {
                switch (xmlStreamReader.next()) {
                case XMLStreamReader.END_ELEMENT: {
                    String content = characters.toString().trim();
                    switch (resolveDumpTag(xmlStreamReader.getLocalName())) {
                    case id:
                        // only take the first id (there is a 2nd one for the revision)
                        if (id == null) {
                            try {
                                id = Integer.parseInt(content);
                            } catch (NumberFormatException e) {
                                // report through the declared failure channel instead of leaking an unchecked exception
                                throw new XMLStreamException("Could not parse page id '" + content + "'", e);
                            }
                        }
                        break;
                    case title:
                        title = content;
                        break;
                    case text:
                        text = content;
                        break;
                    case timestamp:
                        try {
                            lastEdited = dateFormat.parse(content);
                        } catch (ParseException e) {
                            // best effort: an unreadable timestamp leaves the edit date unknown
                            lastEdited = null;
                        }
                        break;
                    default:
                        break;
                    }
                    characters.setLength(0);
                    break;
                }
                case XMLStreamReader.CHARACTERS:
                    characters.append(xmlStreamReader.getText());
                    break;
                default:
                    break;
                }
            }
        } finally {
            // release the reader even when parsing fails part-way through
            xmlStreamReader.close();
        }

        if (id == null || title == null || text == null) {
            throw new XMLStreamException("Could not parse xml markup for page");
        }

        // identify namespace - assume 0 (main) if there is no prefix, or if prefix doesn't match any known namespaces
        Namespace namespace;
        int pos = title.indexOf(":");
        if (pos > 0) {
            namespace = getNamespace(title.substring(0, pos));
            if (namespace == null) {
                namespace = siteInfo.getMainNamespace();
            } else {
                // the prefix was a real namespace, so strip it from the title
                title = title.substring(pos + 1);
            }
        } else {
            namespace = siteInfo.getMainNamespace();
        }

        // ignore anything that isn't in main, category or template namespace
        if (namespace.getKey() != SiteInfo.CATEGORY_KEY && namespace.getKey() != SiteInfo.MAIN_KEY && namespace.getKey() != SiteInfo.TEMPLATE_KEY) {
            LOGGER.info("Ignoring page " + id + ":" + title);
            return null;
        }

        // identify page type; a redirect takes precedence over the namespace-derived type
        PageType type;
        String redirectTarget = null;
        Matcher redirectMatcher = language.getRedirectPattern().matcher(text);
        if (redirectMatcher.find()) {
            type = PageType.redirect;
            // the target is taken from group 2 when it matched, otherwise from group 3
            if (redirectMatcher.group(2) != null) {
                redirectTarget = redirectMatcher.group(2);
            } else {
                redirectTarget = redirectMatcher.group(3);
            }
        } else if (namespace.getKey() == SiteInfo.CATEGORY_KEY) {
            type = PageType.category;
        } else if (namespace.getKey() == SiteInfo.TEMPLATE_KEY) {
            type = PageType.template;
        } else if (namespace.getKey() == SiteInfo.MAIN_KEY) {
            Matcher disambigMatcher = language.getDisambigPattern().matcher(text);
            if (disambigMatcher.find()) {
                type = PageType.disambiguation;
            } else {
                type = PageType.article;
            }
        } else {
            type = PageType.invalid;
        }

        title = Util.normaliseTitle(title);
        return new DumpPage(id, namespace, type, title, text, redirectTarget, lastEdited);
    }

    /**
     * Maps an XML element name onto a DumpTag.
     *
     * @param tagName the local name of the element
     * @return the matching tag, or {@code DumpTag.ignorable} if the name is not one we care about
     */
    private DumpTag resolveDumpTag(String tagName) {
        try {
            return DumpTag.valueOf(tagName);
        } catch (IllegalArgumentException e) {
            return DumpTag.ignorable;
        }
    }

    /**
     * Resolves a title prefix to a namespace, first translating it through the
     * language's namespace aliases when an alias exists for it.
     *
     * @param name the text before the first colon of a page title
     * @return whatever the site info returns for the (possibly aliased) name;
     *         parsePage treats {@code null} as "not a namespace"
     */
    private Namespace getNamespace(String name) {
        NamespaceAlias alias = language.getAlias(name);
        if (alias == null) {
            return siteInfo.getNamespace(name);
        }
        return siteInfo.getNamespace(alias.getTo());
    }
}