/* * PageIterator.java * Copyright (C) 2007 David Milne, d.n.milne@gmail.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.util; import java.util.* ; import org.wikipedia.miner.db.WEntry; import org.wikipedia.miner.db.WIterator; import org.wikipedia.miner.db.WEnvironment; import org.wikipedia.miner.db.struct.DbPage; import org.wikipedia.miner.model.* ; import org.wikipedia.miner.model.Page.PageType; /** * @author David Milne * * Provides efficient iteration over the pages in Wikipedia */ public class PageIterator implements Iterator<Page> { WEnvironment env ; WIterator<Integer,DbPage> iter ; Page nextPage = null ; PageType type = null ; /** * Creates an iterator that will loop through all pages in Wikipedia. * * @param database an active (connected) Wikipedia database. */ public PageIterator(WEnvironment env) { this.env = env ; iter = env.getDbPage().getIterator() ; queueNext() ; } /** * Creates an iterator that will loop through all pages of the given type in Wikipedia. * * @param database an active (connected) Wikipedia database. * @param pageType the type of page to restrict the iterator to (ARTICLE, CATEGORY, REDIRECT or DISAMBIGUATION_PAGE) * @throws SQLException if there is a problem with the Wikipedia database. */ public PageIterator(WEnvironment env, PageType type) { this.env = env ; iter = env.getDbPage().getIterator() ; this.type = type ; queueNext() ; } public boolean hasNext() { return (nextPage != null) ; } public void remove() { throw new UnsupportedOperationException() ; } public Page next() { if (nextPage == null) throw new NoSuchElementException() ; Page p = nextPage ; queueNext() ; return p ; } private void queueNext() { try { nextPage=toPage(iter.next()) ; if (type != null) { while (nextPage.getType() != type) nextPage = toPage(iter.next()); } } catch (NoSuchElementException e) { nextPage = null ; } } private Page toPage(WEntry<Integer,DbPage> e) { if (e== null) return null ; else return Page.createPage(env, e.getKey(), e.getValue()) ; } /* public static void main(String[] args) throws Exception { DecimalFormat df = new DecimalFormat("0.000") ; if (args.length != 1) { System.out.println("Please specify a directory containing a fully prepared Wikipedia database") ; return ; } File envDir = new File(args[0]) ; Wikipedia wikipedia = new Wikipedia(envDir) ; ProgressTracker tracker = new ProgressTracker(wikipedia.getEnvironment().getDbPage().getCount(), "Iterating pages", PageIterator.class) ; Iterator<Page> iter = wikipedia.getPageIterator(PageType.article) ; int count = 0 ; while (iter.hasNext()) { tracker.update() ; Page p = iter.next() ; if (count%1000 == 0) { System.out.println(p + " [" + p.getType() + "] - " + df.format(tracker.getTaskProgress())) ; } count++ ; } System.out.println(count) ; }*/ public void close() { iter.close(); } }