/* * Preprocessing.java * * Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/ * Distributed under GNU Public License. */ package wikipedia.clustering; import wikipedia.kleinberg.*; import java.util.*; /** * Class Description * */ public class Preprocessing { /** Creates a new instance of Preprocessing */ public Preprocessing() { } /** Build initial clusters of categories. * In the beginning every cluster consists of one node (category). * Assign to each cluster the following values (using information about categories in the cluster): * 1) c.n_article - number of articles which refer to the cluster (categories in the cluster); * 2) c.weight = 1 + n_article * 3) c.category_id[0] = id of added category (first and alone as yet) */ public static List<ClusterCategory> createInitialClusters (Map<Integer, Article> articles, Map<Integer, Category> categories) { List<ClusterCategory> clusters = new ArrayList<ClusterCategory>(Arrays.asList( new ClusterCategory[categories.size()] )); int i=0; for(Iterator it = categories.values().iterator(); it.hasNext();) { ClusterCategory c = new ClusterCategory(); c.init( (Category)it.next() ); clusters.set(i++, c); } return clusters; } /** Create initial edges. * Create edge e between clusters c1 and c2 for each edge between categories. * Calculate weight for edge e: e.weight = c1.weight + c2.weight */ public static List<Edge> createEdgesBetweenClusters (List<ClusterCategory> clusters, Map<Integer, Category> categories) { List<Edge> edges = new ArrayList<Edge>(); // this map helps to skip repeated (id_from, id_to) then edges will be unique Map<Integer, Integer> category_id_from_to = new HashMap<Integer, Integer>(); // speed-up search of clusters using this local map Map<Integer, ClusterCategory> category_id_to_cluster = ClusterCategory.mapCategoryIdToCluster(clusters); for(Category cat:categories.values()) { int id_from = cat.page_id; assert(null != category_id_to_cluster.get(id_from)); if(null != cat.links_out && null != category_id_to_cluster.get(id_from)) { for(int id_to:cat.links_out) { // links_out: id of categories which are referred by the category // add only unique edge (id_from, id_to) if ( (!category_id_from_to.containsKey(id_from) || id_to != category_id_from_to.get(id_from) ) && (!category_id_from_to.containsKey (id_to) || id_from != category_id_from_to.get(id_to) ) ) { assert(null != category_id_to_cluster.get(id_to)); // skip absent categories: if(null != category_id_to_cluster.get(id_to)) { category_id_from_to.put(id_from, id_to); Edge e = new Edge(); e.init(id_from, id_to, category_id_to_cluster); edges.add(e); } } } } } return edges; } }