/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.jwpl;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.sql.Timestamp;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision;
import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils;
import de.tudarmstadt.ukp.wikipedia.api.MetaData;
import de.tudarmstadt.ukp.wikipedia.api.Page;
import de.tudarmstadt.ukp.wikipedia.api.PageIterator;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.Revision;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.RevisionApi;
/**
 * Abstract base class for all readers based on revisions.
 * <p>
 * The reader operates in one of two modes, selected in {@link #initialize(UimaContext)}:
 * <ul>
 * <li><b>Revision-list mode</b> - used when revision ids were supplied through
 * {@link #PARAM_PATH_TO_REVISION_ID_LIST} and/or {@link #PARAM_REVISION_ID_LIST}. Only
 * {@link #revIdIterator} is set up and progress is reported in revisions.</li>
 * <li><b>All-revisions mode</b> - used when no revision ids were supplied. {@link #pageIter}
 * walks over the pages and {@link #timestampIter} sub-iterates over the revision timestamps of
 * {@link #currentArticle}; progress is reported in articles.</li>
 * </ul>
 */
public abstract class WikipediaRevisionReaderBase
    extends WikipediaReaderBase
{
    /** Whether the reader outputs plain text or wiki markup. */
    public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText";
    @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true")
    protected boolean outputPlainText;

    /** The page buffer size (#pages) of the page iterator. */
    public static final String PARAM_PAGE_BUFFER = "PageBuffer";
    @ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000")
    protected int pageBuffer;

    /**
     * Defines the path to a file containing a line-separated list of revision ids of the revisions
     * that should be retrieved. (Optional)
     */
    public static final String PARAM_PATH_TO_REVISION_ID_LIST = "RevisionIdsFromFile";
    @ConfigurationParameter(name = PARAM_PATH_TO_REVISION_ID_LIST, mandatory = false)
    protected String revisionIdFile;

    /**
     * Defines an array of revision ids of the revisions that should be retrieved. (Optional)
     */
    public static final String PARAM_REVISION_ID_LIST = "RevisionIdFromArray";
    @ConfigurationParameter(name = PARAM_REVISION_ID_LIST, mandatory = false)
    protected String[] revisionIdParamArray;

    /** The page whose revisions are currently being iterated (all-revisions mode only). */
    protected Page currentArticle;

    protected RevisionApi revisionApi;

    // These Iterators are used when iterating over ALL revisions
    protected Iterator<Page> pageIter; // for page iteration - revs are subiterated
    protected Iterator<Timestamp> timestampIter; // for rev subiteration

    // This iterator is used when iterating over a predefined list of revisions
    protected Iterator<String> revIdIterator; // for list-based rev iteration

    /** Number of page switches performed so far (all-revisions mode). */
    protected long currentArticleIndex;

    /** Number of revision ids handed out by {@link #revIdIterator} so far (revision-list mode). */
    protected long currentRevisionIndex;

    /** Total used for progress reporting in all-revisions mode. */
    protected long nrOfArticles;

    protected MediaWikiParser parser;

    /** The revision ids to retrieve; empty when iterating over all revisions. */
    protected Set<String> revisionIds = null;

    /**
     * Sets up the revision API, collects the configured revision ids (from file and/or array) and
     * prepares the iterators for the resulting mode.
     *
     * @param context
     *            the UIMA context, passed on to the super class
     * @throws ResourceInitializationException
     *             if the revision API cannot be created, the revision id file cannot be read, or
     *             the page iterator does not yield a single page
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        revisionIds = new HashSet<String>();

        try {
            this.revisionApi = new RevisionApi(dbconfig);

            if (revisionIdFile != null) {
                revisionIds = loadFile(revisionIdFile);
            }

            if (revisionIdParamArray != null) {
                for (String id : revisionIdParamArray) {
                    addRevisionId(revisionIds, id);
                }
            }
        }
        catch (Exception e) {
            throw new ResourceInitializationException(e);
        }

        // Use one of the lists or iterate over all articles?
        if (!revisionIds.isEmpty()) {
            // Count ids as they are consumed so that hasNext() can stay free of side effects.
            revIdIterator = new CountingRevisionIdIterator(revisionIds.iterator());
        }
        else {
            // use iterator over all pages in the db
            MetaData md = wiki.getMetaData();
            this.nrOfArticles = md.getNumberOfPages()
                    - md.getNumberOfDisambiguationPages()
                    - md.getNumberOfRedirectPages();

            pageIter = new PageIterator(wiki, true, pageBuffer);

            try {
                if (!pageIter.hasNext()) {
                    throw new IOException("No articles in database.");
                }
                currentArticle = pageIter.next();
                this.timestampIter = getTimestampIter(currentArticle.getPageId());
            }
            catch (IOException e) {
                throw new ResourceInitializationException(e);
            }
        }

        currentArticleIndex = 0;
        currentRevisionIndex = 0;

        // TODO Use SWEBLE
        MediaWikiParserFactory pf = new MediaWikiParserFactory();
        pf.setTemplateParserClass(FlushTemplates.class);
        parser = pf.createParser();
    }

    /**
     * Tells whether another revision is available.
     * <p>
     * In revision-list mode this simply delegates to {@link #revIdIterator} and has no side
     * effects. In all-revisions mode it advances {@link #currentArticle} and
     * {@link #timestampIter} past every page without (remaining) revisions, so that after a
     * {@code true} result {@code timestampIter.next()} belongs to {@code currentArticle}.
     *
     * @return {@code true} if another revision can be read
     * @throws IOException
     *             if the revision timestamps of a page cannot be retrieved
     */
    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        // If a list of revisions is provided, just use the hasNext() of the iterator
        if (!revisionIds.isEmpty()) {
            return revIdIterator.hasNext();
        }

        // If no list of revisions is provided, we iterate over pages and subiterate over
        // revisions. Keep moving forward until a page with revisions is found: a page without
        // any revision must not end the whole iteration while further pages are available.
        while (!timestampIter.hasNext()) {
            if (!pageIter.hasNext()) {
                return false;
            }
            currentArticle = pageIter.next();
            currentArticleIndex++;
            this.timestampIter = getTimestampIter(currentArticle.getPageId());
        }
        return true;
    }

    /**
     * Reports the progress of the reader.
     *
     * @return a single-element array; in all-revisions mode the progress is measured in articles
     *         ({@link #currentArticleIndex} of {@link #nrOfArticles}), in revision-list mode in
     *         revisions ({@link #currentRevisionIndex} of the number of configured revision ids)
     */
    @Override
    public Progress[] getProgress()
    {
        if (revisionIds.isEmpty()) {
            // if we iterate over ALL revisions, we can only report the progress in <articles>
            return new Progress[] { new ProgressImpl((int) currentArticleIndex,
                    (int) nrOfArticles, Progress.ENTITIES) };
        }

        // if we iterate over a revision list, we can actually report the progress in <revisions>
        return new Progress[] { new ProgressImpl((int) currentRevisionIndex,
                revisionIds.size(), Progress.ENTITIES) };
    }

    /**
     * Fetches the revision timestamps of a page and returns them in ascending order.
     *
     * @param pageId
     *            id of the page
     * @return an iterator over the sorted revision timestamps of the page
     * @throws IOException
     *             wrapping the {@link WikiApiException} raised by the revision API
     */
    protected Iterator<Timestamp> getTimestampIter(int pageId)
        throws IOException
    {
        try {
            List<Timestamp> timestamps = this.revisionApi.getRevisionTimestamps(pageId);
            Collections.sort(timestamps);
            return timestamps.iterator();
        }
        catch (WikiApiException e) {
            throw new IOException(e);
        }
    }

    /**
     * Creates a {@link WikipediaRevision} annotation from the given revision and adds it to the
     * indexes of the CAS. Contributor id and timestamp are only set if the revision provides
     * them.
     *
     * @param jcas
     *            the CAS to add the annotation to
     * @param revision
     *            the revision to take the values from
     */
    protected void addRevisionAnnotation(JCas jcas, Revision revision)
    {
        WikipediaRevision revAnno = new WikipediaRevision(jcas);
        revAnno.setRevisionId(revision.getRevisionID());
        revAnno.setPageId(revision.getArticleID());
        revAnno.setContributorName(revision.getContributorName());

        Integer contribId = revision.getContributorId();
        if (contribId != null) {
            revAnno.setContributorId(contribId);
        }

        Timestamp timestamp = revision.getTimeStamp();
        if (timestamp != null) {
            revAnno.setTimestamp(timestamp.getTime());
        }

        revAnno.setComment(revision.getComment());
        revAnno.setMinor(revision.isMinor());
        revAnno.addToIndexes();
    }

    /**
     * Creates the {@link DocumentMetaData} for a revision: the page title becomes the document
     * title, the page id the collection id and the revision id the document id.
     *
     * @param jcas
     *            the CAS to create the meta data in
     * @param pageId
     *            id of the page the revision belongs to
     * @param revisionId
     *            id of the revision
     * @throws WikiTitleParsingException
     *             if the page title cannot be parsed
     * @throws WikiApiException
     *             if the page cannot be retrieved
     */
    protected void addDocumentMetaData(JCas jcas, int pageId, int revisionId)
        throws WikiTitleParsingException, WikiApiException
    {
        // fix for issue http://code.google.com/p/dkpro-core-asl/issues/detail?id=209
        String language = WikiUtils.jwplLanguage2dkproLanguage(dbconfig.getLanguage());

        DocumentMetaData metaData = DocumentMetaData.create(jcas);
        metaData.setDocumentTitle(wiki.getPage(pageId).getTitle().getWikiStyleTitle());
        metaData.setCollectionId(Integer.toString(pageId));
        metaData.setDocumentId(Integer.toString(revisionId));
        metaData.setLanguage(language);
    }

    /**
     * Loads a text file line-by-line into a Set of Strings. The file is decoded as UTF-8; lines
     * are trimmed and empty lines are skipped.
     *
     * @param fileName
     *            path to the file
     * @return a Set containing the individual non-empty, trimmed lines of the text file
     * @throws IOException
     *             if any error occurs while reading the file
     */
    private Set<String> loadFile(String fileName)
        throws IOException
    {
        Set<String> container = new HashSet<String>();

        FileInputStream fstream = new FileInputStream(fileName);
        try {
            // Explicit charset: do not depend on the platform default encoding.
            BufferedReader br = new BufferedReader(new InputStreamReader(fstream, "UTF-8"));
            String strLine;
            while ((strLine = br.readLine()) != null) {
                addRevisionId(container, strLine);
            }
        }
        finally {
            // The readers hold no resources of their own, closing the stream is sufficient.
            fstream.close();
        }

        return container;
    }

    /**
     * Adds a revision id to the target set after trimming it. {@code null} and blank values are
     * ignored, since they cannot denote a revision.
     *
     * @param target
     *            the set to add the id to
     * @param rawId
     *            the id as configured, may be {@code null} or padded with whitespace
     */
    private static void addRevisionId(Set<String> target, String rawId)
    {
        if (rawId == null) {
            return;
        }
        String id = rawId.trim();
        if (id.length() > 0) {
            target.add(id);
        }
    }

    /**
     * Iterator over the configured revision ids that increments {@link #currentRevisionIndex}
     * each time an id is handed out, so progress reflects the ids actually consumed.
     */
    private final class CountingRevisionIdIterator
        implements Iterator<String>
    {
        private final Iterator<String> delegate;

        CountingRevisionIdIterator(Iterator<String> delegate)
        {
            this.delegate = delegate;
        }

        @Override
        public boolean hasNext()
        {
            return delegate.hasNext();
        }

        @Override
        public String next()
        {
            String id = delegate.next();
            currentRevisionIndex++;
            return id;
        }

        @Override
        public void remove()
        {
            delegate.remove();
        }
    }
}