/*
 *    ArticleSet.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.util;

import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.*;

import org.apache.log4j.Logger;

import org.wikipedia.miner.db.WEnvironment.StatisticName;
import org.wikipedia.miner.model.*;
import org.wikipedia.miner.model.Page.PageType;

/**
 * @author David Milne
 *
 * A set of Wikipedia articles that can be used to train and test disambiguators, linkDetectors, etc.
 * Can either be generated randomly from Wikipedia, or loaded from file.
 */
public class ArticleSet extends ArrayList<Article> {

    //TODO: This screams out for the builder design pattern

    private static final long serialVersionUID = 6142971965290887331L;

    //private TreeSet<Integer> articleIds = new TreeSet<Integer>() ;

    private MarkupStripper stripper = new MarkupStripper() ;

    public ArticleSet() {
        super() ;
    }

    /**
     * Loads this article set from file. The file must contain a list of article ids, separated by newlines.
     * If lines contain multiple tab-separated columns, only the first column is used.
     *
     * @param file the file containing article ids.
     * @param wikipedia an active Wikipedia instance, used to resolve ids into articles.
     * @throws IOException if the file cannot be read.
     */
    public ArticleSet(File file, Wikipedia wikipedia) throws IOException{

        //articleIds = new TreeSet<Integer>() ;

        BufferedReader reader = new BufferedReader(new FileReader(file)) ;

        String line ;
        while ((line = reader.readLine()) != null) {
            String[] values = line.split("\t") ;

            int id = Integer.parseInt(values[0].trim()) ;
            add((Article)wikipedia.getPageById(id)) ;
        }

        reader.close();
    }
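    // Illustration only (not part of the original class): the expected file
    // layout is one article id per line; any further tab-separated columns,
    // such as a title added for readability, are ignored. The ids below are
    // arbitrary examples:
    //
    //   9232	Dog
    //   5122	Cat
    //
    // Assuming an active Wikipedia instance, such a file could be loaded with:
    //
    //   ArticleSet set = new ArticleSet(new File("articles.txt"), wikipedia) ;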
    /**
     * Generates a set of articles randomly from Wikipedia, given some constraints on what is an acceptable article.
     * <p>
     * This first gathers all articles that satisfy the minInLinks and minOutLinks constraints, and then randomly samples from
     * these to produce the final set of articles which satisfy all constraints.
     * <p>
     * The length of time this takes is very variable. It will work fastest if the minInLinks and minOutLinks constraints are strict, and
     * the other constraints are loose.
     * <p>
     * Any of the constraints can be ignored by setting them to null.
     *
     * @param wikipedia an active Wikipedia instance.
     * @param size the desired number of articles
     * @param minInLinks the minimum number of links that must be made to an article
     * @param minOutLinks the minimum number of links that an article must make
     * @param minLinkProportion the minimum proportion of links (over total words) that articles must contain
     * @param maxLinkProportion the maximum proportion of links (over total words) that articles must contain
     * @param minWordCount the minimum number of words allowed in an article
     * @param maxWordCount the maximum number of words allowed in an article
     * @param maxListProportion the maximum proportion of list items (over total line count) that an article may contain.
     * @param mustMatch a pattern that the article markup must match, or null
     * @param mustNotMatch a pattern that the article markup must not match, or null
     * @param candidates a previously gathered list of rough candidates, or null if candidates should be gathered now
     * @param exclude a set of articles that must not be included, or null
     */
    public ArticleSet(Wikipedia wikipedia, int size, Integer minInLinks, Integer minOutLinks, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, Vector<Article> candidates, ArticleSet exclude) {

        if (candidates == null)
            candidates = getRoughCandidates(wikipedia, minInLinks, minOutLinks) ;

        buildFromCandidates(wikipedia, candidates, size, minInLinks, minOutLinks, minLinkProportion, maxLinkProportion, minWordCount, maxWordCount, maxListProportion, mustMatch, mustNotMatch, exclude) ;
    }

    public ArticleSet getRandomSubset(int size) {

        if (size > this.size())
            throw new IllegalArgumentException("requested size " + size + " is larger than the size of this set (" + size() + ")") ;

        Random r = new Random() ;
        HashSet<Integer> usedIds = new HashSet<Integer>() ;

        ArticleSet subset = new ArticleSet() ;

        while (subset.size() < size) {
            int index = r.nextInt(size()) ;
            Article art = get(index) ;

            if (!usedIds.contains(art.getId())) {
                subset.add(art) ;
                usedIds.add(art.getId()) ;
            }
        }

        Collections.sort(subset) ;
        return subset ;
    }

    private void buildFromCandidates(Wikipedia wikipedia, Vector<Article> roughCandidates, int size, Integer minInLinks, Integer minOutLinks, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, ArticleSet exclude) {

        DecimalFormat df = new DecimalFormat("#0.00 %") ;

        int totalRoughCandidates = roughCandidates.size();

        ProgressTracker pn = new ProgressTracker(totalRoughCandidates, "Refining candidates (ETA is worst case)", ArticleSet.class) ;
        double lastWarningProgress = 0 ;

        while (roughCandidates.size() > 0) {
            pn.update() ;

            if (size() == size)
                break ; //we have enough articles

            //pop a random candidate
            int index = (int)Math.floor(Math.random() * roughCandidates.size()) ;
            Article art = roughCandidates.elementAt(index) ;
            roughCandidates.removeElementAt(index) ;

            if (isArticleValid(art, minLinkProportion, maxLinkProportion, minWordCount, maxWordCount, maxListProportion, mustMatch, mustNotMatch, exclude))
                add(art) ;

            //warn the user if it looks like we won't find enough valid articles
            double roughProgress = 1-((double) roughCandidates.size()/totalRoughCandidates) ;
            if (roughProgress >= lastWarningProgress + 0.01) {
                double fineProgress = (double)size()/size ;

                if (roughProgress > fineProgress) {
                    System.err.println("ArticleSet | Warning : we have exhausted " + df.format(roughProgress) + " of the available pages and only gathered " + df.format(fineProgress) + " of the articles needed.") ;
                    lastWarningProgress = roughProgress ;
                }
            }
        }

        if (size() < size)
            System.err.println("ArticleSet | Warning: we could only find " + size() + " suitable articles.") ;

        Collections.sort(this) ;
    }

    /**
     * @return the set of article ids, in ascending order.
     *//*
    public ArrayList<Integer> getArticleIds() {
        return articleIds ;
    }*/
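    // Illustrative sketch only; the constraint values below are arbitrary
    // assumptions, not recommendations. Gather 500 articles with at least
    // 20 in- and out-links and between 150 and 5000 words, passing null for
    // the constraints we don't care about, then split off a training subset:
    //
    //   ArticleSet all = new ArticleSet(wikipedia, 500, 20, 20, null, null,
    //       150, 5000, null, null, null, null, null) ;
    //   ArticleSet train = all.getRandomSubset(400) ;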
    /**
     * Saves this list of article ids in a text file, separated by newlines.
     * If the file exists already, it will be overwritten.
     *
     * @param file the file in which this set is to be saved
     * @throws IOException if the file cannot be written to.
     */
    public void save(File file) throws IOException{

        BufferedWriter writer = new BufferedWriter(new FileWriter(file)) ;

        for (Article art: this)
            writer.write(art.getId() + "\n") ;

        writer.close() ;
    }

    protected static Vector<Article> getRoughCandidates(Wikipedia wikipedia, Integer minInLinks, Integer minOutLinks) {

        Vector<Article> articles = new Vector<Article>() ;

        int totalArticles = wikipedia.getEnvironment().retrieveStatistic(StatisticName.articleCount).intValue() ;
        ProgressTracker pn = new ProgressTracker(totalArticles, "Gathering rough candidates", ArticleSet.class) ;

        PageIterator i = wikipedia.getPageIterator(PageType.article) ;

        while (i.hasNext()) {
            Article art = (Article)i.next() ;
            pn.update() ;

            if (minOutLinks != null && art.getLinksOut().length < minOutLinks)
                continue ;

            if (minInLinks != null && art.getLinksIn().length < minInLinks)
                continue ;

            articles.add(art) ;
        }
        i.close();

        return articles ;
    }

    @Override
    public boolean contains(Object obj) {

        //note: binarySearch assumes this set is sorted, which is done at the end of construction
        Article art = (Article)obj ;

        int index = Collections.binarySearch(this, art) ;
        return (index >= 0) ;
    }
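    // Illustrative sketch only (parameter values are assumptions): since
    // gathering rough candidates is the slow step, the same candidate list
    // can be passed to several constructor calls, with earlier sets given as
    // the exclude argument so the resulting sets stay disjoint. Note that
    // getRoughCandidates is protected, so this applies within the package or
    // a subclass:
    //
    //   Vector<Article> candidates = getRoughCandidates(wikipedia, 20, 20) ;
    //   ArticleSet testSet = new ArticleSet(wikipedia, 100, 20, 20, null,
    //       null, null, null, null, null, null, candidates, null) ;
    //   ArticleSet trainSet = new ArticleSet(wikipedia, 400, 20, 20, null,
    //       null, null, null, null, null, null, candidates, testSet) ;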
    private boolean isArticleValid(Article art, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, ArticleSet exclude) {

        Logger.getLogger(ArticleSet.class).debug("Evaluating " + art) ;

        //we don't want any disambiguations
        if (art.getType() == PageType.disambiguation) {
            Logger.getLogger(ArticleSet.class).debug(" - rejected due to disambiguation") ;
            return false ;
        }

        if (exclude != null && exclude.contains(art)) {
            Logger.getLogger(ArticleSet.class).debug(" - rejected due to exclusion list") ;
            return false ;
        }

        //TODO: check that list identification works
        //if (art.getType() == PageType.list)
        //	return false ;

        //check if there are any other constraints
        if (minLinkProportion == null && maxLinkProportion == null && minWordCount == null && maxWordCount == null && maxListProportion == null)
            return true ;

        //get and prepare markup
        String markup = art.getMarkup() ;

        if (markup == null)
            return false ;

        if (mustMatch != null) {
            Matcher m = mustMatch.matcher(markup) ;
            if (!m.find()) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected due to mustMatch pattern") ;
                return false ;
            }
        }

        if (mustNotMatch != null) {
            Matcher m = mustNotMatch.matcher(markup) ;
            if (m.find()) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected due to mustNotMatch pattern") ;
                return false ;
            }
        }

        markup = stripper.stripToPlainText(markup, null) ;
        markup = stripper.stripExcessNewlines(markup) ;

        if (maxListProportion != null) {
            //we need to count lines and list items

            String[] lines = markup.split("\n") ;

            int lineCount = 0 ;
            int listCount = 0 ;

            for (String line: lines) {
                line = line.replace(':', ' ') ;
                line = line.replace(';', ' ') ;
                line = line.trim() ;

                if (line.length() > 5) {
                    lineCount++ ;

                    if (line.startsWith("*") || line.startsWith("#"))
                        listCount++ ;
                }
            }

            float listProportion = ((float)listCount) / lineCount ;
            if (listProportion > maxListProportion) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for max list proportion " + listProportion) ;
                return false ;
            }
        }

        if (minWordCount != null || maxWordCount != null || minLinkProportion != null || maxLinkProportion != null) {
            //we need to count words

            StringTokenizer t = new StringTokenizer(markup) ;
            int wordCount = t.countTokens() ;

            if (minWordCount != null && wordCount < minWordCount) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for min wordcount " + wordCount) ;
                return false ;
            }

            if (maxWordCount != null && wordCount > maxWordCount) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for max wordcount " + wordCount) ;
                return false ;
            }

            int linkCount = art.getTotalLinksOutCount() ;
            float linkProportion = (float)linkCount/wordCount ;

            if (minLinkProportion != null && linkProportion < minLinkProportion) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for min link proportion " + linkProportion) ;
                return false ;
            }

            if (maxLinkProportion != null && linkProportion > maxLinkProportion) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for max link proportion " + linkProportion) ;
                return false ;
            }
        }

        return true ;
    }
}
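// Usage sketch (illustration only, assuming an active Wikipedia instance):
// build a set, persist it, and reload it later so the same articles can be
// reused across experiments.
//
//   ArticleSet set = new ArticleSet(wikipedia, 200, 30, 30, null, null,
//       null, null, null, null, null, null, null) ;
//   set.save(new File("articleSet.txt")) ;
//   ...
//   ArticleSet reloaded = new ArticleSet(new File("articleSet.txt"), wikipedia) ;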