/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.jwpl;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import de.tudarmstadt.ukp.wikipedia.api.PageQuery;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
/**
 * Reads all article pages that match a query created by the numerous parameters of this class.
 * <p>
 * Every numeric parameter is optional and defaults to {@code -1}, which this reader treats as
 * "no constraint". The title pattern is optional and defaults to the empty string, which is
 * likewise treated as "no constraint". If no parameter is set at all, the page iterator is not
 * replaced by this class.
 */
public class WikipediaQueryReader
    extends WikipediaArticleReader
{
    /** Sentinel value of a numeric parameter that has not been configured. */
    private static final int UNSET = -1;

    /**
     * Maximum number of categories.
     * Articles with a higher number of categories will not be returned by the query.
     */
    public static final String PARAM_MAX_CATEGORIES = "MaxCategories";
    @ConfigurationParameter(name = PARAM_MAX_CATEGORIES, mandatory = false, defaultValue = "-1")
    private int maxCategories;

    /**
     * Minimum number of categories.
     * Articles with a lower number of categories will not be returned by the query.
     */
    public static final String PARAM_MIN_CATEGORIES = "MinCategories";
    @ConfigurationParameter(name = PARAM_MIN_CATEGORIES, mandatory = false, defaultValue = "-1")
    private int minCategories;

    /**
     * Maximum number of incoming links.
     * Articles with a higher number of incoming links will not be returned by the query.
     */
    public static final String PARAM_MAX_INLINKS = "MaxInlinks";
    @ConfigurationParameter(name = PARAM_MAX_INLINKS, mandatory = false, defaultValue = "-1")
    private int maxInlinks;

    /**
     * Minimum number of incoming links.
     * Articles with a lower number of incoming links will not be returned by the query.
     */
    public static final String PARAM_MIN_INLINKS = "MinInlinks";
    @ConfigurationParameter(name = PARAM_MIN_INLINKS, mandatory = false, defaultValue = "-1")
    private int minInlinks;

    /**
     * Maximum number of outgoing links.
     * Articles with a higher number of outgoing links will not be returned by the query.
     */
    public static final String PARAM_MAX_OUTLINKS = "MaxOutlinks";
    @ConfigurationParameter(name = PARAM_MAX_OUTLINKS, mandatory = false, defaultValue = "-1")
    private int maxOutlinks;

    /**
     * Minimum number of outgoing links.
     * Articles with a lower number of outgoing links will not be returned by the query.
     */
    public static final String PARAM_MIN_OUTLINKS = "MinOutlinks";
    @ConfigurationParameter(name = PARAM_MIN_OUTLINKS, mandatory = false, defaultValue = "-1")
    private int minOutlinks;

    /**
     * Maximum number of redirects.
     * Articles with a higher number of redirects will not be returned by the query.
     */
    public static final String PARAM_MAX_REDIRECTS = "MaxRedirects";
    @ConfigurationParameter(name = PARAM_MAX_REDIRECTS, mandatory = false, defaultValue = "-1")
    private int maxRedirects;

    /**
     * Minimum number of redirects.
     * Articles with a lower number of redirects will not be returned by the query.
     */
    public static final String PARAM_MIN_REDIRECTS = "MinRedirects";
    @ConfigurationParameter(name = PARAM_MIN_REDIRECTS, mandatory = false, defaultValue = "-1")
    private int minRedirects;

    /**
     * Maximum number of tokens.
     * Articles with a higher number of tokens will not be returned by the query.
     */
    public static final String PARAM_MAX_TOKENS = "MaxTokens";
    @ConfigurationParameter(name = PARAM_MAX_TOKENS, mandatory = false, defaultValue = "-1")
    private int maxTokens;

    /**
     * Minimum number of tokens.
     * Articles with a lower number of tokens will not be returned by the query.
     */
    public static final String PARAM_MIN_TOKENS = "MinTokens";
    @ConfigurationParameter(name = PARAM_MIN_TOKENS, mandatory = false, defaultValue = "-1")
    private int minTokens;

    /**
     * SQL-style title pattern.
     * Only articles that match the pattern will be returned by the query.
     */
    public static final String PARAM_TITLE_PATTERN = "TitlePattern";
    @ConfigurationParameter(name = PARAM_TITLE_PATTERN, mandatory = false, defaultValue = "")
    private String titlePattern;

    /**
     * Indicates whether at least one query parameter was used. Set to {@code true} by
     * {@link #initialize(UimaContext)} as soon as any constraint is applied to the query.
     */
    protected boolean queryInitialized = false;

    /**
     * Initializes the reader and, if at least one query parameter has been configured,
     * replaces the page iterator with one over the pages matching the query.
     * <p>
     * The textual description of the query is logged at INFO level in every case, even when
     * no constraint is active.
     *
     * @param context
     *            the UIMA context handed to the superclass initialization.
     * @throws ResourceInitializationException
     *             if the superclass initialization fails or if retrieving the pages for the
     *             query raises a {@link WikiApiException} (kept as the cause).
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        PageQuery query = new PageQuery();

        if (maxCategories != UNSET) {
            query.setMaxCategories(maxCategories);
            queryInitialized = true;
        }
        if (minCategories != UNSET) {
            query.setMinCategories(minCategories);
            queryInitialized = true;
        }

        // Inlinks/outlinks are expressed as in-/outdegree on the PageQuery.
        if (maxInlinks != UNSET) {
            query.setMaxIndegree(maxInlinks);
            queryInitialized = true;
        }
        if (minInlinks != UNSET) {
            query.setMinIndegree(minInlinks);
            queryInitialized = true;
        }
        if (maxOutlinks != UNSET) {
            query.setMaxOutdegree(maxOutlinks);
            queryInitialized = true;
        }
        if (minOutlinks != UNSET) {
            query.setMinOutdegree(minOutlinks);
            queryInitialized = true;
        }

        if (maxRedirects != UNSET) {
            query.setMaxRedirects(maxRedirects);
            queryInitialized = true;
        }
        if (minRedirects != UNSET) {
            query.setMinRedirects(minRedirects);
            queryInitialized = true;
        }

        if (maxTokens != UNSET) {
            query.setMaxTokens(maxTokens);
            queryInitialized = true;
        }
        if (minTokens != UNSET) {
            query.setMinTokens(minTokens);
            queryInitialized = true;
        }

        // The parameter is optional; guard against the injected value being null so that an
        // unset title pattern can never abort initialization with a NullPointerException.
        if (titlePattern != null && !titlePattern.isEmpty()) {
            query.setTitlePattern(titlePattern);
            queryInitialized = true;
        }

        this.getLogger().log(Level.INFO, query.getQueryInfo());

        // if a query was initialized, overwrite the page iterator
        // (pageIter and wiki are not declared in this class; presumably inherited from
        // WikipediaArticleReader and set up by super.initialize() - confirm there)
        if (queryInitialized) {
            try {
                pageIter = wiki.getPages(query).iterator();
            }
            catch (WikiApiException e) {
                throw new ResourceInitializationException(e);
            }
        }
    }
}