/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.jwpl;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils;
import de.tudarmstadt.ukp.wikipedia.api.MetaData;
import de.tudarmstadt.ukp.wikipedia.api.Page;
import de.tudarmstadt.ukp.wikipedia.api.PageIterator;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;
/**
* Abstract base class for standard Wikipedia readers reading single articles
* instead of revision pairs.
*
*
*/
public abstract class WikipediaStandardReaderBase
extends WikipediaReaderBase
{
/** Whether the reader outputs plain text or wiki markup. */
public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText";
@ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true")
protected boolean outputPlainText;
/** The page buffer size (#pages) of the page iterator. */
public static final String PARAM_PAGE_BUFFER = "PageBuffer";
@ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000")
protected int pageBuffer;
/**
* Defines the path to a file containing a line-separated list of
* page ids of the pages that should be retrieved. (Optional)
*/
public static final String PARAM_PATH_TO_PAGE_ID_LIST = "PageIdsFromFile";
@ConfigurationParameter(name = PARAM_PATH_TO_PAGE_ID_LIST, mandatory = false)
protected String pageIdFile;
/**
* Defines the path to a file containing a line-separated list of
* page titles of the pages that should be retrieved. (Optional)
*/
public static final String PARAM_PATH_TO_PAGE_TITLE_LIST = "PageTitleFromFile";
@ConfigurationParameter(name = PARAM_PATH_TO_PAGE_TITLE_LIST, mandatory = false)
protected String pageNameFile;
/**
* Defines an array of
* page ids of the pages that should be retrieved. (Optional)
*/
public static final String PARAM_PAGE_ID_LIST = "PageIdFromArray";
@ConfigurationParameter(name = PARAM_PAGE_ID_LIST, mandatory = false)
protected String[] pageIdParamArray;
/**
* Defines an array of page titles of the pages that should be retrieved.
* (Optional)
*/
public static final String PARAM_PAGE_TITLE_LIST = "PageTitlesFromArray";
@ConfigurationParameter(name = PARAM_PAGE_TITLE_LIST, mandatory = false)
protected String[] pageNameParamArray;
private Set<String> pageIds = null;
private Set<String> pageTitles = null;
protected long currentArticleIndex;
protected long nrOfArticles;
protected Iterator<Page> pageIter;
private Page page;
protected MediaWikiParser parser;
@Override
public void initialize(UimaContext context)
throws ResourceInitializationException
{
super.initialize(context);
pageIds = new HashSet<String>();
pageTitles = new HashSet<String>();
try {
if (pageIdFile != null) {
pageIds = loadFile(pageIdFile);
}
if (pageNameFile != null) {
pageTitles = loadFile(pageNameFile);
}
if (pageIdParamArray != null && pageIdParamArray.length > 0) {
for(String id: pageIdParamArray){
pageIds.add(id);
}
}
if (pageNameParamArray != null && pageNameParamArray.length > 0) {
for(String id: pageNameParamArray){
pageTitles.add(id);
}
}
}
catch (Exception e) {
throw new ResourceInitializationException(e);
}
//Use one of the lists or iterate over all articles?
if(!pageIds.isEmpty()||!pageTitles.isEmpty())
{
this.nrOfArticles = pageIds.size()+pageTitles.size();
pageIter = new PageIterator(wiki, pageIds, pageTitles, pageBuffer);
}
else //use iterator over all pages in the db
{
MetaData md = wiki.getMetaData();
this.nrOfArticles = md.getNumberOfPages()
- md.getNumberOfDisambiguationPages()
- md.getNumberOfRedirectPages();
pageIter = new PageIterator(wiki, true, pageBuffer);
}
currentArticleIndex = 0;
MediaWikiParserFactory pf = new MediaWikiParserFactory();
pf.setTemplateParserClass(FlushTemplates.class);
parser = pf.createParser();
}
@Override
public boolean hasNext()
throws IOException, CollectionException
{
return pageIter.hasNext();
}
@Override
public void getNext(JCas jcas)
throws IOException, CollectionException
{
super.getNext(jcas);
page = pageIter.next();
currentArticleIndex++;
try {
getLogger().debug("title: " + page.getTitle());
addDocumentMetaData(jcas, page);
if (!isValidPage(page)) {
jcas.setDocumentText("");
return;
}
if (outputPlainText) {
jcas.setDocumentText(WikiUtils
.cleanText(getPlainDocumentText(page)));
}
else {
jcas.setDocumentText(getDocumentText(page));
}
}
catch (WikiTitleParsingException e1) {
jcas.setDocumentText("");
return;
}
}
protected abstract boolean isValidPage(Page page)
throws WikiTitleParsingException;
@Override
public Progress[] getProgress()
{
return new Progress[] { new ProgressImpl(
Long.valueOf(currentArticleIndex).intValue(),
Long.valueOf(nrOfArticles).intValue(), Progress.ENTITIES) };
}
protected String getDocumentText(Page page)
{
return page.getText();
}
protected abstract String getPlainDocumentText(Page page);
private void addDocumentMetaData(JCas jcas, Page page)
throws WikiTitleParsingException
{
String language = WikiUtils.jwplLanguage2dkproLanguage(dbconfig.getLanguage());
DocumentMetaData metaData = DocumentMetaData.create(jcas);
metaData.setDocumentTitle(page.getTitle().getWikiStyleTitle());
metaData.setCollectionId(Integer.valueOf(page.getPageId()).toString());
metaData.setDocumentId(Integer.valueOf(page.getPageId()).toString());
metaData.setDocumentBaseUri("http://" + language + ".wikipedia.org");
metaData.setDocumentUri("http://" + language + ".wikipedia.org/w/index.php?title=" + page.getTitle().getWikiStyleTitle());
metaData.setLanguage(language);
}
/**
* Loads a text file line-by-line into a Set of Strings.
*
* @param fileName
* path to the file
* @return a Set containing the individual lines of the text file
* @throws IOException
* if any error occurs while reading the file
*/
private Set<String> loadFile(String fileName)
throws IOException
{
Set<String> container = new HashSet<String>();
FileInputStream fstream=null;
DataInputStream in=null;
BufferedReader br=null;
try{
fstream = new FileInputStream(fileName);
in = new DataInputStream(fstream);
br = new BufferedReader(new InputStreamReader(in));
String strLine;
while ((strLine = br.readLine()) != null) {
container.add(strLine);
}
}finally{
if(br!=null){
br.close();
}
if(in!=null){
in.close();
}
if(fstream!=null){
fstream.close();
}
}
return container;
}
public Page getPage() {
return page;
}
}