/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.bliki;
import info.bliki.api.Page;
import info.bliki.api.User;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
import javax.xml.bind.JAXBException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.sweble.wikitext.engine.CompiledPage;
import org.sweble.wikitext.engine.Compiler;
import org.sweble.wikitext.engine.PageId;
import org.sweble.wikitext.engine.PageTitle;
import org.sweble.wikitext.engine.utils.SimpleWikiConfiguration;
import de.fau.cs.osr.ptk.common.AstVisitor;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.wikipedia.api.WikiConstants;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.api.sweble.PlainTextConverter;
/**
 * Bliki-based Wikipedia reader.
 *
 * <p>
 * All requested pages are fetched through the wiki API in {@link #initialize(UimaContext)} and
 * are then handed out one per CAS, either as plain text (converted with the Sweble parser) or as
 * raw wiki markup, depending on {@link #PARAM_OUTPUT_PLAIN_TEXT}.
 * </p>
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class BlikiWikipediaReader
    extends JCasCollectionReader_ImplBase
{
    /**
     * Wikiapi URL E.g. for the English Wikipedia it should be: http://en.wikipedia.org/w/api.php
     */
    public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION;
    @ConfigurationParameter(name = PARAM_SOURCE_LOCATION, mandatory = true)
    private String wikiapiUrl;

    /**
     * The language of the wiki installation.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true)
    private String language;

    /**
     * Whether the reader outputs plain text or wiki markup.
     */
    public static final String PARAM_OUTPUT_PLAIN_TEXT = "outputPlainText";
    @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true")
    private boolean outputPlainText;

    /**
     * Which page titles should be retrieved.
     */
    public static final String PARAM_PAGE_TITLES = "pageTitles";
    @ConfigurationParameter(name = PARAM_PAGE_TITLES, mandatory = true)
    private String[] pageTitles;

    /** Pages retrieved during initialization; never {@code null} once initialization succeeded. */
    private List<Page> listOfPages;

    /** Index into {@link #listOfPages} of the next page to be returned by the reader. */
    private int pageOffset = 0;

    /** Sweble configuration used for building page titles and the compiler. */
    private SimpleWikiConfiguration config;

    /** Sweble compiler used to turn wiki markup into an AST. */
    private Compiler compiler;

    /**
     * Sets up the Sweble compiler and retrieves the content of all configured page titles from
     * the wiki API.
     *
     * @param context
     *            the UIMA context, passed on to the super class.
     * @throws ResourceInitializationException
     *             if the Sweble configuration cannot be loaded or the wiki query yields no
     *             result list.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        // The API is accessed with an empty user name and password.
        // NOTE(review): the result of login() is ignored here, as before - confirm that a failed
        // login is acceptable for such a credential-less user.
        User user = new User("", "", wikiapiUrl);
        user.login();

        try {
            config = new SimpleWikiConfiguration(WikiConstants.SWEBLE_CONFIG);
        }
        catch (FileNotFoundException e) {
            throw new ResourceInitializationException(e);
        }
        catch (JAXBException e) {
            throw new ResourceInitializationException(e);
        }
        compiler = new Compiler(config);

        // Fail fast instead of running into a NullPointerException later in hasNext() or
        // getProgress() in case the query did not produce a result list (presumably when the
        // request to the wiki API failed).
        List<Page> retrievedPages = user.queryContent(pageTitles);
        if (retrievedPages == null) {
            throw new ResourceInitializationException(new IllegalStateException(
                    "Querying page content from [" + wikiapiUrl + "] returned no result."));
        }
        listOfPages = retrievedPages;
    }

    /**
     * @return {@code true} as long as not all retrieved pages have been returned.
     */
    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        return pageOffset < listOfPages.size();
    }

    /**
     * Fills the given CAS with the next page: adds a {@link DocumentMetaData} annotation, sets
     * the document language and sets the document text to either the plain text or the wiki
     * markup of the page.
     *
     * @param jcas
     *            the CAS to fill.
     * @throws CollectionException
     *             if the page cannot be converted to plain text or the text cannot be set.
     */
    @Override
    public void getNext(JCas jcas)
        throws IOException, CollectionException
    {
        Page page = listOfPages.get(pageOffset);
        String title = page.getTitle();

        DocumentMetaData dmd = new DocumentMetaData(jcas);
        dmd.setDocumentTitle(title);
        dmd.setDocumentUri(wikiapiUrl + "?title=" + title);
        dmd.setDocumentId(page.getPageid());
        dmd.setDocumentBaseUri(wikiapiUrl);
        dmd.setCollectionId(page.getPageid());
        dmd.addToIndexes();

        jcas.setDocumentLanguage(language);

        if (outputPlainText) {
            try {
                jcas.setDocumentText(getPlainText(page));
            }
            catch (CASRuntimeException e) {
                throw new CollectionException(e);
            }
            catch (WikiApiException e) {
                throw new CollectionException(e);
            }
        }
        else {
            jcas.setDocumentText(page.getCurrentContent());
        }

        // Only advance once the page has been processed successfully.
        pageOffset++;
    }

    /**
     * @return a single progress entry counting the pages returned so far against the number of
     *         retrieved pages.
     */
    @Override
    public Progress[] getProgress()
    {
        return new Progress[] { new ProgressImpl(pageOffset, listOfPages.size(),
                Progress.ENTITIES) };
    }

    /**
     * <p>
     * Returns the Wikipedia article as plain text using the Sweble parser with a
     * SimpleWikiConfiguration and the PlainTextConverter. <br/>
     * If you have different needs regarding the plain text, you can use
     * {@link #parsePage(Page, AstVisitor)} and provide your own Sweble visitor. Examples are in
     * the <code>de.tudarmstadt.ukp.wikipedia.api.sweble</code> package or on
     * http://www.sweble.org
     * </p>
     *
     * <p>
     * Alternatively, set {@link #PARAM_OUTPUT_PLAIN_TEXT} to <code>false</code> to obtain the
     * article with all wiki markup and convert it to plain text in a later processing step.
     * </p>
     *
     * @param page
     *            the page to convert.
     * @return The plain text of a Wikipedia article
     * @throws WikiApiException
     *             if the page cannot be compiled.
     */
    private String getPlainText(Page page)
        throws WikiApiException
    {
        return (String) parsePage(page, new PlainTextConverter());
    }

    /**
     * Parses the page with the Sweble parser using a SimpleWikiConfiguration and the provided
     * visitor. For further information about the visitor concept, look at the examples in the
     * <code>de.tudarmstadt.ukp.wikipedia.api.sweble</code> package, or on
     * <code>http://www.sweble.org</code> or on the JWPL Google Code project page.
     *
     * @param page
     *            the page to parse.
     * @param v
     *            the visitor that is applied to the compiled page.
     * @return the parsed page. The actual return type depends on the provided visitor. You have to
     *         cast the return type according to the return type of the go() method of your visitor.
     * @throws WikiApiException
     *             if the page cannot be compiled.
     */
    private Object parsePage(Page page, AstVisitor v)
        throws WikiApiException
    {
        // Use the provided visitor to parse the page
        return v.go(getCompiledPage(page).getPage());
    }

    /**
     * Returns a CompiledPage produced by the Sweble parser using the SimpleWikiConfiguration.
     *
     * @param page
     *            the page whose current content is compiled.
     * @return the parsed page
     * @throws WikiApiException
     *             wrapping any exception raised while building the page title or compiling.
     */
    private CompiledPage getCompiledPage(Page page)
        throws WikiApiException
    {
        try {
            PageTitle pageTitle = PageTitle.make(config, page.getTitle());
            // NOTE(review): -1 is presumably a placeholder for an unknown revision - confirm
            // against the Sweble PageId documentation.
            PageId pageId = new PageId(pageTitle, -1);
            // Compile the retrieved page
            return compiler.postprocess(pageId, page.getCurrentContent(), null);
        }
        catch (Exception e) {
            // Any failure of the Sweble pipeline is reported uniformly as a WikiApiException.
            throw new WikiApiException(e);
        }
    }
}