/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.jwpl;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils;
import de.tudarmstadt.ukp.wikipedia.api.Page;
import de.tudarmstadt.ukp.wikipedia.api.WikiConstants;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiPageNotFoundException;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiTitleParsingException;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;
import de.tudarmstadt.ukp.wikipedia.util.templates.WikipediaTemplateInfo;
import de.tudarmstadt.ukp.wikipedia.util.templates.WikipediaTemplateInfoGenerator;

/**
 * Reads all pages that contain or do not contain the templates specified in the template whitelist
 * and template blacklist.
 *
 * <p>
 * It is possible to define only a whitelist OR only a blacklist. If both a whitelist and a
 * blacklist are provided, the reader selects the articles that DO contain the templates from the
 * whitelist and at the same time DO NOT contain the templates from the blacklist (i.e. the
 * intersection of the "whitelist page set" with the set of pages that pass the blacklist).
 * </p>
 *
 * <p>
 * This reader only works if template tables have been generated for the JWPL database using the
 * {@link WikipediaTemplateInfoGenerator}.
 * </p>
 *
 * <p>
 * <strong>NOTE:</strong> This reader directly extends {@link WikipediaReaderBase}, not
 * {@link WikipediaStandardReaderBase}.
 * </p>
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig",
        "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class WikipediaTemplateFilteredArticleReader
    extends WikipediaReaderBase
{
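    /*
     * A minimal usage sketch with uimaFIT (not part of this class): the template names are
     * illustrative assumptions and must match the names stored by the
     * WikipediaTemplateInfoGenerator; the database connection parameters are inherited from
     * WikipediaReaderBase and only hinted at here.
     *
     * CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
     *         WikipediaTemplateFilteredArticleReader.class,
     *         // ... database connection parameters of WikipediaReaderBase ...
     *         WikipediaTemplateFilteredArticleReader.PARAM_TEMPLATE_WHITELIST,
     *                 new String[] { "Infobox" },
     *         WikipediaTemplateFilteredArticleReader.PARAM_TEMPLATE_BLACKLIST,
     *                 new String[] { "Delete" },
     *         WikipediaTemplateFilteredArticleReader.PARAM_EXACT_TEMPLATE_MATCHING, false);
     */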
    /** If set to true, only the first paragraph instead of the whole article is used. */
    public static final String PARAM_ONLY_FIRST_PARAGRAPH = "OnlyFirstParagraph";
    @ConfigurationParameter(name = PARAM_ONLY_FIRST_PARAGRAPH, mandatory = true, defaultValue = "false")
    private boolean onlyFirstParagraph;

    /** Whether the reader outputs plain text or wiki markup. */
    public static final String PARAM_OUTPUT_PLAIN_TEXT = "OutputPlainText";
    @ConfigurationParameter(name = PARAM_OUTPUT_PLAIN_TEXT, mandatory = true, defaultValue = "true")
    private boolean outputPlainText;

    /** Whether the reader should also include talk (discussion) pages. */
    public static final String PARAM_INCLUDE_DISCUSSION_PAGES = "IncludeDiscussions";
    @ConfigurationParameter(name = PARAM_INCLUDE_DISCUSSION_PAGES, mandatory = true, defaultValue = "true")
    private boolean includeDiscussions;

    /**
     * If this option is set, discussion pages are rejected that are associated with a blacklisted
     * article. Analogously, articles are rejected that are associated with a blacklisted
     * discussion page.
     * <p>
     * This check is rather expensive and can take a long time. This option has no effect if only
     * a whitelist is used.
     * </p>
     * <p>
     * Default value: {@code false}
     * </p>
     */
    public static final String PARAM_DOUBLE_CHECK_ASSOCIATED_PAGES = "DoubleCheckAssociatedPages";
    @ConfigurationParameter(name = PARAM_DOUBLE_CHECK_ASSOCIATED_PAGES, mandatory = true, defaultValue = "false")
    private boolean doubleCheckAssociatedPages;

    /**
     * Optional parameter that limits the number of articles delivered by the reader.
     * <p>
     * This avoids unnecessary filtering if only a small number of articles is needed.
     * </p>
     */
    public static final String PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ = "LimitNumberOfArticlesToRead";
    @ConfigurationParameter(name = PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ, mandatory = false)
    private Integer articleLimit;

    /**
     * Defines templates that the articles MUST contain.
     * <p>
     * If you also define a blacklist, the intersection of both sets is used (i.e. pages that DO
     * contain templates from the whitelist, but DO NOT contain templates from the blacklist).
     * </p>
     */
    public static final String PARAM_TEMPLATE_WHITELIST = "TemplateWhitelist";
    @ConfigurationParameter(name = PARAM_TEMPLATE_WHITELIST, mandatory = false)
    private String[] templateWhitelistArray;

    /**
     * Defines templates that the articles MUST NOT contain.
     * <p>
     * If you also define a whitelist, the intersection of both sets is used (i.e. pages that DO
     * contain templates from the whitelist, but DO NOT contain templates from the blacklist).
     * </p>
     */
    public static final String PARAM_TEMPLATE_BLACKLIST = "TemplateBlacklist";
    @ConfigurationParameter(name = PARAM_TEMPLATE_BLACKLIST, mandatory = false)
    private String[] templateBlacklistArray;

    /**
     * Defines whether to match the templates exactly or whether to match all templates that start
     * with the String given in the respective parameter list.
     * <p>
     * Default value: {@code true}
     * </p>
     */
    public static final String PARAM_EXACT_TEMPLATE_MATCHING = "ExactTemplateMatching";
    @ConfigurationParameter(name = PARAM_EXACT_TEMPLATE_MATCHING, mandatory = true, defaultValue = "true")
    private boolean exactTemplateMatching;

    /** The page buffer size (number of pages) of the page iterator. */
    public static final String PARAM_PAGE_BUFFER = "PageBuffer";
    @ConfigurationParameter(name = PARAM_PAGE_BUFFER, mandatory = true, defaultValue = "1000")
    private int pageBuffer;
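    /*
     * Illustration of the matching modes (the template names are made up): with
     * PARAM_EXACT_TEMPLATE_MATCHING = true, a whitelist entry "Infobox" only matches pages that
     * contain the template "Infobox" itself; with false, it also matches pages containing any
     * template whose name merely starts with "Infobox", such as "Infobox_scientist".
     */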
    private List<Page> bufferedPages;
    private List<Integer> pageIds;

    private List<String> templateBlacklist;
    private List<String> templateWhitelist;

    private long currentArticleIndex;
    private long nrOfArticles;

    private MediaWikiParser parser;
    private WikipediaTemplateInfo tplInfo;

    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        if (articleLimit != null) {
            getLogger().info("Article limit is set to " + articleLimit + ". The reader will not "
                    + "deliver all pages that meet the requirements. Remove "
                    + "PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ if that is not what you want.");
        }

        if (templateBlacklistArray == null && templateWhitelistArray == null) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Either PARAM_TEMPLATE_WHITELIST or PARAM_TEMPLATE_BLACKLIST must be set."));
        }

        try {
            bufferedPages = new LinkedList<Page>();
            pageIds = new LinkedList<Integer>();
            tplInfo = new WikipediaTemplateInfo(wiki);

            Iterable<Integer> filteredIds = null;

            // WHITELIST FILTER
            Set<Integer> wlSet = null;
            if (templateWhitelistArray != null && templateWhitelistArray.length > 0) {
                // convert array to list
                templateWhitelist = Arrays.asList(templateWhitelistArray);
                wlSet = new HashSet<Integer>();
                if (exactTemplateMatching) {
                    filteredIds = tplInfo.getPageIdsContainingTemplateNames(templateWhitelist);
                }
                else {
                    filteredIds = tplInfo.getPageIdsContainingTemplateFragments(templateWhitelist);
                }
                for (Integer id : filteredIds) {
                    wlSet.add(id);
                }
                getLogger().info("The whitelist contains " + templateWhitelist.size() + " templates");
                getLogger().info(wlSet.size() + " articles are whitelisted");
            }
            else {
                getLogger().info("No whitelist active");
            }
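            // At this point, wlSet is either null (no whitelist given) or holds the ids of all
            // pages that contain whitelisted templates, as recorded in the template tables
            // created by the WikipediaTemplateInfoGenerator.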
            // BLACKLIST FILTER
            Set<Integer> blSet = null;
            if (templateBlacklistArray != null && templateBlacklistArray.length > 0) {
                // convert array to list
                templateBlacklist = Arrays.asList(templateBlacklistArray);
                blSet = new HashSet<Integer>();
                if (wlSet != null) {
                    // If the whitelist is active, we can just treat the blacklist as another
                    // whitelist and remove all items from the whitelist that are also in the
                    // blacklist. This way, we don't have to perform the expensive
                    // getPageIdsNotContainingTemplateNames operation here.
                    if (exactTemplateMatching) {
                        filteredIds = tplInfo.getPageIdsContainingTemplateNames(templateBlacklist);
                    }
                    else {
                        filteredIds = tplInfo.getPageIdsContainingTemplateFragments(templateBlacklist);
                    }
                    for (Integer id : filteredIds) {
                        blSet.add(id);
                    }
                    getLogger().info("The blacklist contains " + templateBlacklist.size() + " templates");
                    getLogger().info(blSet.size() + " articles are blacklisted");
                }
                else {
                    // If the whitelist is not active, we have to treat the blacklist like a real
                    // blacklist and call the rather expensive
                    // getPageIdsNotContainingTemplateNames()
                    if (exactTemplateMatching) {
                        filteredIds = tplInfo.getPageIdsNotContainingTemplateNames(templateBlacklist);
                    }
                    else {
                        filteredIds = tplInfo.getPageIdsNotContainingTemplateFragments(templateBlacklist);
                    }
                    for (Integer id : filteredIds) {
                        blSet.add(id);
                    }
                    getLogger().info("The blacklist contains " + templateBlacklist.size() + " templates");
                    getLogger().info(blSet.size() + " articles are NOT blacklisted");
                }
            }
            else {
                getLogger().info("No blacklist active");
            }

            // GET FINAL ID LIST
            if (blSet != null && wlSet != null) {
                // here, blSet contains pages CONTAINING the blacklisted templates,
                // so first remove the blacklisted pages from the whitelist
                wlSet.removeAll(blSet);

                if (articleLimit != null) {
                    // limit the number of articles, if necessary
                    Set<Integer> tempWlSet = new HashSet<Integer>();
                    tempWlSet.addAll(wlSet);
                    wlSet.clear();
                    Iterator<Integer> ids = tempWlSet.iterator();
                    for (int i = 0; i < articleLimit; i++) {
                        if (ids.hasNext()) {
                            wlSet.add(ids.next());
                        }
                    }
                }

                // now double check, if necessary
                if (doubleCheckAssociatedPages) {
                    getLogger().info("Double checking " + wlSet.size() + " articles");
                    // if the double-check parameter is set, double check whitelisted
                    // articles against the blacklist before adding them
                    pageIds.addAll(doubleCheckAssociatedArticles(wlSet, blSet));
                }
                else {
                    pageIds.addAll(wlSet);
                }
            }
            else if (blSet == null && wlSet != null) {
                if (articleLimit != null) {
                    // limit the number of articles, if necessary
                    Set<Integer> tempWlSet = new HashSet<Integer>();
                    tempWlSet.addAll(wlSet);
                    wlSet.clear();
                    Iterator<Integer> ids = tempWlSet.iterator();
                    for (int i = 0; i < articleLimit; i++) {
                        if (ids.hasNext()) {
                            wlSet.add(ids.next());
                        }
                    }
                }
                pageIds.addAll(wlSet);
            }
            else if (blSet != null && wlSet == null) {
                if (articleLimit != null) {
                    // limit the number of articles, if necessary
                    Set<Integer> tempBlSet = new HashSet<Integer>();
                    tempBlSet.addAll(blSet);
                    blSet.clear();
                    Iterator<Integer> ids = tempBlSet.iterator();
                    for (int i = 0; i < articleLimit; i++) {
                        if (ids.hasNext()) {
                            blSet.add(ids.next());
                        }
                    }
                }

                // here, blSet contains pages NOT containing the blacklisted templates;
                // now add the remaining pages to the pageId list
                if (doubleCheckAssociatedPages) {
                    getLogger().info("Double checking " + blSet.size() + " articles");
                    // if the double-check parameter is set, check the non-blacklisted articles
                    // against the set of blacklisted pages, i.e. the pages that DO contain the
                    // blacklisted templates (the original call fetched the NOT-containing ids,
                    // which contradicts the intent stated here)
                    Set<Integer> blacklistedArticles = new HashSet<Integer>();
                    if (exactTemplateMatching) {
                        blacklistedArticles.addAll(
                                tplInfo.getPageIdsContainingTemplateNames(templateBlacklist));
                    }
                    else {
                        blacklistedArticles.addAll(
                                tplInfo.getPageIdsContainingTemplateFragments(templateBlacklist));
                    }
                    pageIds.addAll(doubleCheckAssociatedArticles(blSet, blacklistedArticles));
                }
                else {
                    pageIds.addAll(blSet);
                }
            }

            this.nrOfArticles = pageIds.size();
            getLogger().info("Reading " + nrOfArticles + " pages");
        }
        catch (Exception e) {
            throw new ResourceInitializationException(e);
        }

        currentArticleIndex = 0;

        // TODO use the Sweble parser
        MediaWikiParserFactory pf = new MediaWikiParserFactory();
        pf.setTemplateParserClass(FlushTemplates.class);
        parser = pf.createParser();
    }
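    /*
     * Worked illustration of the id-set logic in initialize() (the ids are made up): with
     * whitelisted page ids {1, 2, 3, 4} and blacklisted page ids {3, 5}, the reader delivers
     * pages {1, 2, 4}. If only a blacklist is given, the candidate set instead starts out as all
     * pages that do NOT contain any blacklisted template.
     */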
    @Override
    public boolean hasNext()
        throws IOException, CollectionException
    {
        return !pageIds.isEmpty() || !bufferedPages.isEmpty();
    }

    @Override
    public void getNext(JCas jcas)
        throws IOException, CollectionException
    {
        super.getNext(jcas);

        Page page = null;
        try {
            // fill the buffer if it is empty
            if (bufferedPages.isEmpty()) {
                getLogger().trace("Filling buffer");
                // compute the fill size up front: pageIds shrinks while we remove from it, so
                // re-evaluating the bound in the loop condition would fill only part of the buffer
                int fillSize = Math.min(pageIds.size(), pageBuffer);
                for (int i = 0; i < fillSize; i++) {
                    bufferedPages.add(wiki.getPage(pageIds.remove(0)));
                }
            }
            // get the next page from the buffer
            page = bufferedPages.remove(0);

            getLogger().trace("Processing article: " + page.getTitle());

            addDocumentMetaData(jcas, page);

            if (!isValidPage(page)) {
                jcas.setDocumentText("");
                return;
            }

            if (outputPlainText) {
                jcas.setDocumentText(WikiUtils.cleanText(getPlainDocumentText(page)));
            }
            else {
                jcas.setDocumentText(getDocumentText(page));
            }
        }
        catch (WikiApiException e) {
            throw new CollectionException(e);
        }

        currentArticleIndex++;
    }

    /**
     * Only accepts article pages and (if {@code includeDiscussions=true}) talk pages.
     *
     * @param page
     *            the page that should be checked for validity
     * @return true if the page is valid, false otherwise
     * @throws WikiTitleParsingException
     *             if the page title cannot be parsed.
     */
    private boolean isValidPage(Page page)
        throws WikiTitleParsingException
    {
        return !page.isDisambiguation() && !page.isRedirect()
                && (includeDiscussions || !page.isDiscussion());
    }

    @Override
    public Progress[] getProgress()
    {
        return new Progress[] { new ProgressImpl((int) currentArticleIndex, (int) nrOfArticles,
                Progress.ENTITIES) };
    }

    private String getDocumentText(Page page)
    {
        return page.getText();
    }

    private String getPlainDocumentText(Page page)
    {
        String text = "";
        ParsedPage pp = parser.parse(page.getText());

        if (onlyFirstParagraph) {
            if (pp != null && pp.getParagraph(0) != null) {
                text = pp.getParagraph(0).getText();
            }
        }
        else {
            if (pp != null) {
                text = pp.getText();
            }
        }
        return text;
    }
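    /*
     * Illustration of the title mapping performed below (the titles are made up): a discussion
     * page "Discussion:Foo/Archive_1" is reduced to the article title "Foo", whose page ids are
     * then checked against the blacklist; conversely, for an article "Foo" the associated
     * discussion page "Discussion:Foo" is looked up. The actual prefix is taken from
     * WikiConstants.DISCUSSION_PREFIX.
     */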
    /**
     * Double checks a set of page ids: for each id that belongs to a discussion page, it checks
     * whether the associated article is blacklisted, and for each article id, whether the
     * associated discussion page is blacklisted.<br/>
     * <br/>
     * This is a rather expensive operation!
     *
     * @param idsToDoubleCheck
     *            the set of ids that should be double checked
     * @param blIds
     *            a set with the ids of blacklisted pages
     * @return the set of ids that remain after double checking
     * @throws WikiApiException
     *             if the wiki data cannot be accessed.
     */
    private Set<Integer> doubleCheckAssociatedArticles(Set<Integer> idsToDoubleCheck,
            Set<Integer> blIds)
        throws WikiApiException
    {
        if (idsToDoubleCheck.size() > 20000) {
            getLogger().info("You want to double check " + idsToDoubleCheck.size()
                    + " articles in the whitelist. This can take a very long time."
                    + System.getProperty("line.separator")
                    + "If you do not need ALL pages that meet the specified requirements, you "
                    + "might speed things up by setting PARAM_LIMIT_NUMBER_OF_ARTICLES_TO_READ.");
        }

        Set<Integer> doubleFilteredArticles = new HashSet<Integer>();

        // do the additional filtering
        for (Integer id : idsToDoubleCheck) {
            try {
                String curPageTitle = wiki.getTitle(id).getWikiStyleTitle();
                // check the associated discussion page or article
                if (curPageTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) {
                    // strip the prefix instead of using replaceAll(), which would interpret the
                    // prefix as a regular expression and replace it anywhere in the title
                    curPageTitle = curPageTitle.substring(WikiConstants.DISCUSSION_PREFIX.length());
                    if (curPageTitle.contains("/")) {
                        // we have a discussion archive, e.g. "Foo/Archive_1"
                        String[] parts = curPageTitle.split("/");
                        if (parts.length > 0 && parts[0].length() > 0) {
                            curPageTitle = parts[0];
                        }
                    }
                    List<Integer> curArticleIds = wiki.getPageIds(curPageTitle);
                    for (int curArtId : curArticleIds) {
                        if (blIds.contains(curArtId)) {
                            // select the id of the current page for removal
                            doubleFilteredArticles.add(id);
                        }
                    }
                }
                else {
                    List<Integer> curDiscussionIds = wiki
                            .getPageIds(WikiConstants.DISCUSSION_PREFIX + curPageTitle);
                    for (int curDiscId : curDiscussionIds) {
                        if (blIds.contains(curDiscId)) {
                            // select the id of the current page for removal
                            doubleFilteredArticles.add(id);
                        }
                    }
                }
            }
            catch (WikiPageNotFoundException e) {
                // just go on with the next id
            }
        }

        idsToDoubleCheck.removeAll(doubleFilteredArticles);
        return idsToDoubleCheck;
    }

    private void addDocumentMetaData(JCas jcas, Page page)
        throws WikiTitleParsingException
    {
        DocumentMetaData metaData = DocumentMetaData.create(jcas);
        metaData.setDocumentTitle(page.getTitle().getWikiStyleTitle());
        metaData.setCollectionId(Integer.toString(page.getPageId()));
        metaData.setDocumentId(Integer.toString(page.getPageId()));
        metaData.setLanguage(dbconfig.getLanguage().toString());
    }
}