/*
 *    ArticleSet.java
 *    Copyright (C) 2007 David Milne, d.n.milne@gmail.com
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package org.wikipedia.miner.util;

import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.*;

import org.apache.log4j.Logger;

import org.wikipedia.miner.db.WEnvironment.StatisticName;
import org.wikipedia.miner.model.*;
import org.wikipedia.miner.model.Page.PageType;

/**
 * @author David Milne
 *
 * A set of Wikipedia articles that can be used to train and test disambiguators, linkDetectors, etc.
 * Can either be generated randomly from Wikipedia, or loaded from file.
 */
public class ArticleSet extends ArrayList<Article> {

    //TODO: This screams out for the builder design pattern

    private static final long serialVersionUID = 6142971965290887331L;

    //private TreeSet<Integer> articleIds = new TreeSet<Integer>() ;

    private MarkupStripper stripper = new MarkupStripper() ;

    public ArticleSet() {
        super() ;
    }

    /**
     * Loads this article set from file. The file must contain a list of article ids, separated by newlines.
     * If lines contain multiple tab-separated columns, only the first column is used.
     *
     * @param file the file containing article ids.
     * @param wikipedia an active Wikipedia instance, used to resolve ids into articles.
     * @throws IOException if the file cannot be read.
     */
    public ArticleSet(File file, Wikipedia wikipedia) throws IOException{

        //articleIds = new TreeSet<Integer>() ;

        BufferedReader reader = new BufferedReader(new FileReader(file)) ;

        String line ;
        while ((line = reader.readLine()) != null) {
            String[] values = line.split("\t") ;

            int id = Integer.parseInt(values[0].trim()) ;
            add((Article)wikipedia.getPageById(id)) ;
        }

        reader.close();
    }
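    // Illustration only (not part of the original class): the expected file
    // layout is one article id per line; any further tab-separated columns,
    // such as a title added for readability, are ignored. The ids below are
    // arbitrary examples:
    //
    //   9232	Dog
    //   5122	Cat
    //
    // Assuming an active Wikipedia instance, such a file could be loaded with:
    //
    //   ArticleSet set = new ArticleSet(new File("articles.txt"), wikipedia) ;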
    /**
     * Generates a set of articles randomly from Wikipedia, given some constraints on what is an acceptable article.
     * <p>
     * This first gathers all articles that satisfy the minInLinks and minOutLinks constraints, and then randomly samples from
     * these to produce the final set of articles which satisfy all constraints.
     * <p>
     * The length of time this takes is very variable. It will work fastest if the minInLinks and minOutLinks constraints are strict, and
     * the other constraints are loose.
     * <p>
     * Any of the constraints can be ignored by setting them to null.
     *
     * @param wikipedia an active Wikipedia instance.
     * @param size the desired number of articles
     * @param minInLinks the minimum number of links that must be made to an article
     * @param minOutLinks the minimum number of links that an article must make
     * @param minLinkProportion the minimum proportion of links (over total words) that articles must contain
     * @param maxLinkProportion the maximum proportion of links (over total words) that articles must contain
     * @param minWordCount the minimum number of words allowed in an article
     * @param maxWordCount the maximum number of words allowed in an article
     * @param maxListProportion the maximum proportion of list items (over total line count) that an article may contain.
     * @param mustMatch a pattern that the article markup must match, or null
     * @param mustNotMatch a pattern that the article markup must not match, or null
     * @param candidates a previously gathered list of rough candidates, or null if candidates should be gathered now
     * @param exclude a set of articles that must not be included, or null
     */
    public ArticleSet(Wikipedia wikipedia, int size, Integer minInLinks, Integer minOutLinks, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, Vector<Article> candidates, ArticleSet exclude) {

        if (candidates == null)
            candidates = getRoughCandidates(wikipedia, minInLinks, minOutLinks) ;

        buildFromCandidates(wikipedia, candidates, size, minInLinks, minOutLinks, minLinkProportion, maxLinkProportion, minWordCount, maxWordCount, maxListProportion, mustMatch, mustNotMatch, exclude) ;
    }

    public ArticleSet getRandomSubset(int size) {

        if (size > this.size())
            throw new IllegalArgumentException("requested size " + size + " is larger than the size of this set (" + size() + ")") ;

        Random r = new Random() ;
        HashSet<Integer> usedIds = new HashSet<Integer>() ;

        ArticleSet subset = new ArticleSet() ;

        while (subset.size() < size) {
            int index = r.nextInt(size()) ;
            Article art = get(index) ;

            if (!usedIds.contains(art.getId())) {
                subset.add(art) ;
                usedIds.add(art.getId()) ;
            }
        }

        Collections.sort(subset) ;
        return subset ;
    }

    private void buildFromCandidates(Wikipedia wikipedia, Vector<Article> roughCandidates, int size, Integer minInLinks, Integer minOutLinks, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, ArticleSet exclude) {

        DecimalFormat df = new DecimalFormat("#0.00 %") ;

        int totalRoughCandidates = roughCandidates.size();

        ProgressTracker pn = new ProgressTracker(totalRoughCandidates, "Refining candidates (ETA is worst case)", ArticleSet.class) ;
        double lastWarningProgress = 0 ;

        while (roughCandidates.size() > 0) {
            pn.update() ;

            if (size() == size)
                break ; //we have enough articles

            //pop a random candidate
            int index = (int)Math.floor(Math.random() * roughCandidates.size()) ;
            Article art = roughCandidates.elementAt(index) ;
            roughCandidates.removeElementAt(index) ;

            if (isArticleValid(art, minLinkProportion, maxLinkProportion, minWordCount, maxWordCount, maxListProportion, mustMatch, mustNotMatch, exclude))
                add(art) ;

            //warn the user if it looks like we won't find enough valid articles
            double roughProgress = 1-((double) roughCandidates.size()/totalRoughCandidates) ;
            if (roughProgress >= lastWarningProgress + 0.01) {
                double fineProgress = (double)size()/size ;

                if (roughProgress > fineProgress) {
                    System.err.println("ArticleSet | Warning : we have exhausted " + df.format(roughProgress) + " of the available pages and only gathered " + df.format(fineProgress) + " of the articles needed.") ;
                    lastWarningProgress = roughProgress ;
                }
            }
        }

        if (size() < size)
            System.err.println("ArticleSet | Warning: we could only find " + size() + " suitable articles.") ;

        Collections.sort(this) ;
    }

    /**
     * @return the set of article ids, in ascending order.
     *//*
    public ArrayList<Integer> getArticleIds() {
        return articleIds ;
    }*/
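    // Illustrative sketch only; the constraint values below are arbitrary
    // assumptions, not recommendations. Gather 500 articles with at least
    // 20 in- and out-links and between 150 and 5000 words, passing null for
    // the constraints we don't care about, then split off a training subset:
    //
    //   ArticleSet all = new ArticleSet(wikipedia, 500, 20, 20, null, null,
    //       150, 5000, null, null, null, null, null) ;
    //   ArticleSet train = all.getRandomSubset(400) ;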
    /**
     * Saves this list of article ids in a text file, separated by newlines.
     * If the file exists already, it will be overwritten.
     *
     * @param file the file in which this set is to be saved
     * @throws IOException if the file cannot be written to.
     */
    public void save(File file) throws IOException{

        BufferedWriter writer = new BufferedWriter(new FileWriter(file)) ;

        for (Article art: this)
            writer.write(art.getId() + "\n") ;

        writer.close() ;
    }

    protected static Vector<Article> getRoughCandidates(Wikipedia wikipedia, Integer minInLinks, Integer minOutLinks) {

        Vector<Article> articles = new Vector<Article>() ;

        int totalArticles = wikipedia.getEnvironment().retrieveStatistic(StatisticName.articleCount).intValue() ;
        ProgressTracker pn = new ProgressTracker(totalArticles, "Gathering rough candidates", ArticleSet.class) ;

        PageIterator i = wikipedia.getPageIterator(PageType.article) ;

        while (i.hasNext()) {
            Article art = (Article)i.next() ;
            pn.update() ;

            if (minOutLinks != null && art.getLinksOut().length < minOutLinks)
                continue ;

            if (minInLinks != null && art.getLinksIn().length < minInLinks)
                continue ;

            articles.add(art) ;
        }
        i.close();

        return articles ;
    }

    @Override
    public boolean contains(Object obj) {

        //note: binarySearch assumes this set is sorted, which is done at the end of construction
        Article art = (Article)obj ;

        int index = Collections.binarySearch(this, art) ;
        return (index >= 0) ;
    }
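    // Illustrative sketch only (parameter values are assumptions): since
    // gathering rough candidates is the slow step, the same candidate list
    // can be passed to several constructor calls, with earlier sets given as
    // the exclude argument so the resulting sets stay disjoint. Note that
    // getRoughCandidates is protected, so this applies within the package or
    // a subclass:
    //
    //   Vector<Article> candidates = getRoughCandidates(wikipedia, 20, 20) ;
    //   ArticleSet testSet = new ArticleSet(wikipedia, 100, 20, 20, null,
    //       null, null, null, null, null, null, candidates, null) ;
    //   ArticleSet trainSet = new ArticleSet(wikipedia, 400, 20, 20, null,
    //       null, null, null, null, null, null, candidates, testSet) ;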
    private boolean isArticleValid(Article art, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, ArticleSet exclude) {

        Logger.getLogger(ArticleSet.class).debug("Evaluating " + art) ;

        //we don't want any disambiguations
        if (art.getType() == PageType.disambiguation) {
            Logger.getLogger(ArticleSet.class).debug(" - rejected due to disambiguation") ;
            return false ;
        }

        if (exclude != null && exclude.contains(art)) {
            Logger.getLogger(ArticleSet.class).debug(" - rejected due to exclusion list") ;
            return false ;
        }

        //TODO: check that list identification works
        //if (art.getType() == PageType.list)
        //	return false ;

        //check if there are any other constraints
        if (minLinkProportion == null && maxLinkProportion == null && minWordCount == null && maxWordCount == null && maxListProportion == null)
            return true ;

        //get and prepare markup
        String markup = art.getMarkup() ;

        if (markup == null)
            return false ;

        if (mustMatch != null) {
            Matcher m = mustMatch.matcher(markup) ;
            if (!m.find()) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected due to mustMatch pattern") ;
                return false ;
            }
        }

        if (mustNotMatch != null) {
            Matcher m = mustNotMatch.matcher(markup) ;
            if (m.find()) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected due to mustNotMatch pattern") ;
                return false ;
            }
        }

        markup = stripper.stripToPlainText(markup, null) ;
        markup = stripper.stripExcessNewlines(markup) ;

        if (maxListProportion != null) {
            //we need to count lines and list items

            String[] lines = markup.split("\n") ;

            int lineCount = 0 ;
            int listCount = 0 ;

            for (String line: lines) {
                line = line.replace(':', ' ') ;
                line = line.replace(';', ' ') ;
                line = line.trim() ;

                if (line.length() > 5) {
                    lineCount++ ;

                    if (line.startsWith("*") || line.startsWith("#"))
                        listCount++ ;
                }
            }

            float listProportion = ((float)listCount) / lineCount ;
            if (listProportion > maxListProportion) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for max list proportion " + listProportion) ;
                return false ;
            }
        }

        if (minWordCount != null || maxWordCount != null || minLinkProportion != null || maxLinkProportion != null) {
            //we need to count words

            StringTokenizer t = new StringTokenizer(markup) ;
            int wordCount = t.countTokens() ;

            if (minWordCount != null && wordCount < minWordCount) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for min wordcount " + wordCount) ;
                return false ;
            }

            if (maxWordCount != null && wordCount > maxWordCount) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for max wordcount " + wordCount) ;
                return false ;
            }

            int linkCount = art.getTotalLinksOutCount() ;
            float linkProportion = (float)linkCount/wordCount ;

            if (minLinkProportion != null && linkProportion < minLinkProportion) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for min link proportion " + linkProportion) ;
                return false ;
            }

            if (maxLinkProportion != null && linkProportion > maxLinkProportion) {
                Logger.getLogger(ArticleSet.class).debug(" - rejected for max link proportion " + linkProportion) ;
                return false ;
            }
        }

        return true ;
    }
}
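// Usage sketch (illustration only, assuming an active Wikipedia instance):
// build a set, persist it, and reload it later so the same articles can be
// reused across experiments.
//
//   ArticleSet set = new ArticleSet(wikipedia, 200, 30, 30, null, null,
//       null, null, null, null, null, null, null) ;
//   set.save(new File("articleSet.txt")) ;
//   ...
//   ArticleSet reloaded = new ArticleSet(new File("articleSet.txt"), wikipedia) ;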