package org.wikibrain.parser.xml; import org.apache.commons.lang.StringEscapeUtils; import org.wikibrain.core.lang.LanguageInfo; import org.wikibrain.core.model.NameSpace; import org.wikibrain.core.model.RawPage; import org.wikibrain.core.model.Title; import org.wikibrain.parser.WpParseException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Parses the Xml associated with a single Wikipedia page. */ public class PageXmlParser { private static final Logger LOG =LoggerFactory.getLogger(PageXmlParser.class); private static final Pattern TITLE_PATTERN = Pattern.compile("<title>(.*?)</title>"); private static final Pattern ID_PATTERN = Pattern.compile("<id>(.*?)</id>"); private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("<timestamp>(.*?)</timestamp>"); private static final Pattern CONTENT_PATTERN = Pattern.compile("<text xml:space=\"preserve\">(.*?)</text>", Pattern.DOTALL); private static final Pattern SELF_CLOSING_CONTENT_PATTERN = Pattern.compile("<text xml:space=\"preserve\"\\s*/>", Pattern.DOTALL); private static final Pattern REDIRECT_PATTERN = Pattern.compile("<redirect title=\"(.*?)\" />"); private static final Pattern MODEL_PATTERN = Pattern.compile("<model>(.*?)</model>"); private static final Pattern FORMAT_PATTERN = Pattern.compile("<format>(.*?)</format>"); // xmlDumpDateFormat is not static because it isn't threadsafe. BOOO!! private final SimpleDateFormat xmlDumpDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); private final LanguageInfo language; public PageXmlParser(LanguageInfo language) { this.language = language; } public RawPage parse(String rawXml) throws WpParseException { return parse(rawXml, -1, -1); } /** * Parses a single xml page into the main page components * @param rawXml * @param startByte * @param stopByte * @return * @throws WpParseException */ public RawPage parse(String rawXml, long startByte, long stopByte) throws WpParseException { rawXml = StringEscapeUtils.unescapeHtml(rawXml); String title = extractSingleString(TITLE_PATTERN, rawXml, 1); String idString = extractSingleString(ID_PATTERN, rawXml, 1); String timestampString = extractSingleString(TIMESTAMP_PATTERN, rawXml, 1); String revisionIdString = extractSingleString(ID_PATTERN, rawXml, 2); String formatString = extractSingleString(FORMAT_PATTERN, rawXml, 1); String modelString = extractSingleString(MODEL_PATTERN, rawXml, 1); if (title == null) { throw new WpParseException("no title for article"); } if (idString == null) { throw new WpParseException("no id for article"); } if (revisionIdString == null) { throw new WpParseException("no revision id for article"); } String body = extractSingleString(CONTENT_PATTERN, rawXml, 1); if (body == null && SELF_CLOSING_CONTENT_PATTERN.matcher(rawXml).find()) { body = ""; } if (body == null) { System.err.println("invalid body: " + rawXml); body = ""; } Date lastEdit = null; try { lastEdit = xmlDumpDateFormat.parse(timestampString); } catch (ParseException e) { LOG.warn("Could not parse last edited date: " + timestampString); } title = title.trim(); String redirectTitle = getRedirect(rawXml); RawPage rp = new RawPage( Integer.valueOf(idString), Integer.valueOf(revisionIdString), title, body, lastEdit, language.getLanguage(), getNameSpace(title), redirectTitle!=null, false, // TODO: FIXME by properly parsing disambigs! redirectTitle ); if (formatString != null) { rp.setFormat(formatString); } if (modelString != null) { rp.setModel(modelString); } return rp; } // TODO: does this method need to be like this, or can it just return "type" private NameSpace getNameSpace(String title) { return new Title(title, language).getNamespace(); } private String getRedirect(String rawXml) { return extractSingleString(REDIRECT_PATTERN, rawXml, 1); } private static String extractSingleString(Pattern patternToMatch, String body, int matchNum){ if (patternToMatch == null || body == null) { return null; } Matcher matcher = patternToMatch.matcher(body); int counter = 0; String curGroup = null; while (matcher.find() && counter < matchNum){ curGroup = matcher.group(1); counter++; } return curGroup; } }