package org.wikibrain.core.model; import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates; import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageInfo; import java.util.Date; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Contains a single page's data from Wikipedia's Xml Dump with no processing. * You probably don't want to use this class unless you are parsing or need full text. */ public class RawPage { private static final Logger LOG = LoggerFactory.getLogger(RawPage.class); private final Title title; private final String body; private final Date lastEdit; private final Language lang; private final int revisionId; private final int localId; private final NameSpace namespace; private final boolean isRedirect; private final boolean isDisambig; private String redirectTitle = null; // Wikidata assigns these two fields private String model = null; private String format = null; public RawPage(int localId, int revisionId, String title, String body, Date lastEdit, Language lang, NameSpace namespace) { this.title = new Title(title, LanguageInfo.getByLanguage(lang)); this.body = body; this.lastEdit = lastEdit; this.namespace = namespace; this.lang = lang; this.revisionId = revisionId; this.localId = localId; isRedirect = false; isDisambig = false; } public String getRedirectTitle() { return redirectTitle; } public void setRedirectTitle(String redirectTitle) { this.redirectTitle = redirectTitle; } public RawPage(int localId, int revisionId, String title, String body, Date lastEdit, Language lang, NameSpace namespace, boolean redirect, boolean disambig, String redirectTitle) { this.title = new Title(title, LanguageInfo.getByLanguage(lang)); this.body = body; this.lastEdit = lastEdit; this.lang = lang; this.revisionId = revisionId; this.localId = localId; this.namespace = namespace; isRedirect = redirect; isDisambig = disambig; this.redirectTitle = redirectTitle; } public Title getTitle() { return title; } public String getBody() { return body; } public Date getLastEdit() { return lastEdit; } public Language getLanguage() { return lang; } public int getRevisionId() { return revisionId; } public int getLocalId() { return localId; } public NameSpace getNamespace() { return namespace; } public boolean isRedirect() { return isRedirect; } public boolean isDisambig() { return isDisambig; } public String getModel() { return model; } public void setModel(String model) { this.model = model; } public String getFormat() { return format; } public void setFormat(String format) { this.format = format; } /** * Returns a plain text output of the body of this RawPage * @return */ public String getPlainText() { return getPlainText(false); } /** * Returns a plain text output of the body of this RawPage * @return */ public String getPlainText(boolean includeTemplates) { if (body.isEmpty()) { return ""; } else { MediaWikiParserFactory factory = new MediaWikiParserFactory(); if (!includeTemplates) { factory.setTemplateParserClass(FlushTemplates.class); } return factory.createParser().parse(body).getText(); } } public String toString(){ return String.format("%s / %s (%s)", this.getTitle(), this.localId, lang.getLangCode()); } }