package org.wikibrain.cookbook.pageview;

import gnu.trove.map.TIntIntMap;
import org.joda.time.DateTime;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.LocalCategoryMemberDao;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.pageview.PageViewDao;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Cookbook example that aggregates page views by top-level category.
 *
 * <p>For every category-namespace member of {@link #TOP_LEVEL_PARENT}, the program
 * counts the pages whose closest category (as reported by
 * {@code LocalCategoryMemberDao.getClosestCategory}) is that member, and sums the
 * views those pages received between 2014-08-14 11:00 and 2014-08-14 23:00.
 * One tab-separated line per category is written to standard output:
 * {@code title <TAB> articleCount <TAB> viewCount}.
 *
 * @author Shilad Sen
 */
public class CategoryViews {

    /** Title of the category whose category-namespace members are treated as top-level categories. */
    public static String TOP_LEVEL_PARENT = "Category:Main_topic_classifications";

    /**
     * Loads page view data for the fixed time window, attributes each page to a
     * top-level category, and prints per-category article and view totals.
     *
     * @param args command-line arguments passed to {@code EnvBuilder.envFromArgs}
     * @throws ConfigurationException if a required component cannot be configured
     * @throws DaoException if a data access operation fails
     * @throws IllegalStateException if the {@link #TOP_LEVEL_PARENT} page cannot be found
     */
    public static void main(String args[]) throws ConfigurationException, DaoException {
        // Get the pageview dao
        Env env = EnvBuilder.envFromArgs(args);
        Language lang = env.getDefaultLanguage();
        final PageViewDao viewDao = env.getConfigurator().get(PageViewDao.class);
        final LocalCategoryMemberDao catDao = env.getConfigurator().get(LocalCategoryMemberDao.class);
        LocalPageDao pageDao = env.getConfigurator().get(LocalPageDao.class);

        // Download and import pageview stats if necessary.
        DateTime start = new DateTime(2014, 8, 14, 11, 0, 0);
        DateTime end = new DateTime(2014, 8, 14, 23, 0, 0);
        viewDao.ensureLoaded(start, end, env.getLanguages());

        // Build up set of top level categories
        final Set<LocalPage> topLevelCategories = new HashSet<LocalPage>();
        LocalPage parent = pageDao.getByTitle(lang, NameSpace.CATEGORY, TOP_LEVEL_PARENT);
        if (parent == null) {
            // NOTE(review): presumably getByTitle returns null for an unknown title;
            // fail with a descriptive message rather than an NPE further down.
            throw new IllegalStateException(
                    "Could not find category page '" + TOP_LEVEL_PARENT + "' in language " + lang);
        }
        for (LocalPage page : catDao.getCategoryMembers(parent).values()) {
            if (page.getNameSpace().equals(NameSpace.CATEGORY)) {
                topLevelCategories.add(page);
            }
        }

        // Map from page id -> num views
        final TIntIntMap allViews = viewDao.getAllViews(lang, start, end);

        final Map<LocalPage, Integer> articleCounts = new HashMap<LocalPage, Integer>();
        final Map<LocalPage, Integer> viewCounts = new HashMap<LocalPage, Integer>();
        final AtomicInteger numPages = new AtomicInteger();

        // Guards articleCounts and viewCounts: HashMap is not safe for concurrent
        // mutation and the read-modify-write below must be atomic per category.
        final Object lock = new Object();

        // Build up accumulators for each category by looping over pages in parallel
        ParallelForEach.iterate(
                pageDao.get(DaoFilter.normalPageFilter(lang)).iterator(),
                new Procedure<LocalPage>() {
                    @Override
                    public void call(LocalPage page) throws Exception {
                        int views = allViews.get(page.getLocalId());
                        // Category lookup stays outside the lock so workers only
                        // serialize on the cheap map update.
                        LocalPage cat = catDao.getClosestCategory(page, topLevelCategories, true);
                        if (cat == null) {
                            return;
                        }
                        synchronized (lock) {
                            Integer articles = articleCounts.get(cat);
                            if (articles == null) {
                                articleCounts.put(cat, 1);
                                viewCounts.put(cat, views);
                            } else {
                                articleCounts.put(cat, articles + 1);
                                viewCounts.put(cat, viewCounts.get(cat) + views);
                            }
                        }
                        // Reuse the incremented value so the message reports the
                        // count that actually triggered it.
                        int done = numPages.incrementAndGet();
                        if (done % 10000 == 0) {
                            System.err.println("doing page " + done);
                        }
                    }
                });

        // Read under the same lock so the workers' writes are guaranteed visible here.
        synchronized (lock) {
            for (LocalPage page : viewCounts.keySet()) {
                System.out.format("%s\t%d\t%d\n",
                        page.getTitle().getCanonicalTitle(),
                        articleCounts.get(page),
                        viewCounts.get(page)
                );
            }
        }
    }
}