/*
* CategorySet.java - Algorithms to manipulate category map.
* This map is a network (or DCEL) of categories (parents) of articles.
*
* Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.kleinberg;
import wikipedia.language.Encodings;
import wikipedia.util.StringUtilRegular;
import wikipedia.clustering.*;
import java.util.*;
/**
 * Algorithms to manipulate the category map, i.e. the network (DCEL) of
 * categories (parents) of articles.
 * <p>
 * All operations are static; the public constructor is kept only for
 * backward compatibility.
 */
public class CategorySet {

    /** Creates a new instance of CategorySet. */
    public CategorySet() {
    }

    /**
     * Builds the transliterated title of the source page, which is used as
     * the prefix of every GraphViz dot file name written by this class.
     *
     * @param session the session holding <code>source_page_title</code>
     * @return the result of StringUtilRegular.encodeRussianToLatinitsa for the
     *         source page title
     */
    private static String getDotFilePrefix(SessionHolder session) {
        return StringUtilRegular.encodeRussianToLatinitsa(
                session.source_page_title,
                Encodings.enc_java_default,
                Encodings.enc_int_default);
    }

    /**
     * Finds ids of articles which refer to each category and fills the field
     * <code>int[] Category.id_articles</code>.
     * Information from <code>Article.id_categories</code> is used.
     * <p>
     * Only categories referenced by at least one article are updated;
     * <code>id_articles</code> of all other categories is left untouched.
     * Category ids of an article which are absent in <code>categories</code>
     * are skipped. Each article id is stored at most once per category, in
     * the order in which the articles are encountered.
     *
     * @param articles   map of articles (values are scanned; articles with
     *                   <code>null</code> id_categories are ignored)
     * @param categories map of categories, looked up both by the ids stored
     *                   in <code>Article.id_categories</code> and by
     *                   <code>Category.page_id</code>
     */
    public static void fillLinksFromCategoryToArticles(Map<Integer, Article> articles,
                                                       Map<Integer, Category> categories) {
        // 1. Collect, for every category page_id, the ids of articles which refer to it.
        //    LinkedHashSet keeps the insertion order (as the former list did) and
        //    guarantees uniqueness without a linear scan on every insert.
        Map<Integer, Set<Integer>> articlesByCategory = new HashMap<Integer, Set<Integer>>();

        for (Article a : articles.values()) {
            if (null == a.id_categories) {
                continue;
            }
            for (int j = 0; j < a.id_categories.length; j++) {
                Category c = categories.get(a.id_categories[j]);
                if (null == c) {
                    continue;   // the article refers to a category which is not in the map
                }
                Set<Integer> ids = articlesByCategory.get(c.page_id);
                if (null == ids) {
                    ids = new LinkedHashSet<Integer>();
                    articlesByCategory.put(c.page_id, ids);
                }
                ids.add(a.page_id);
            }
        }

        // 2. Create int[] id_articles of categories.
        for (Map.Entry<Integer, Set<Integer>> entry : articlesByCategory.entrySet()) {
            Category c = categories.get(entry.getKey());
            if (null == c) {
                // The map key differs from Category.page_id, so there is nothing
                // to update under this id (previously this was a NullPointerException).
                continue;
            }
            Set<Integer> ids = entry.getValue();
            int[] id_articles = new int[ids.size()];
            int j = 0;
            for (Integer id_article : ids) {
                id_articles[j++] = id_article;
            }
            c.id_articles = id_articles;
        }
    }

    /**
     * Categories are already partly created and stored in
     * <code>session.category_nodes</code>. In order to have a connected DCEL
     * it is needed to create (here) the <code>links_out</code> field and to
     * attach categories to the source article.
     * <p>
     * If <code>session.dump</code> is not <code>null</code>, then articles and
     * categories are dumped to the file
     * <code>&lt;source title&gt;_01_category.dot</code>, and the static flag
     * <code>Article.bdraw_categories</code> is set to <code>true</code>.
     *
     * @param session  the current session (source article, black list,
     *                 connection, category nodes, optional dump)
     * @param articles map of articles; the entry with the key
     *                 <code>session.source_article_id</code> (if any) gets
     *                 its <code>id_categories</code> assigned
     */
    public static void prepareCategories(SessionHolder session,
                                         Map<Integer, Article> articles) {
        // Add categories to the source article, i.e. to articles(session.source_article_id).
        int id = session.source_article_id;
        Article source = articles.get(id);
        if (null != source) {
            // Presumably inBlackList() fills the list with titles of the first level
            // categories of the article (its result is ignored here) - TODO confirm.
            List<String> titles_level_1_cats = new ArrayList<String>();
            session.category_black_list.inBlackList(id, titles_level_1_cats, session.source_article_id);
            source.id_categories = Category.getIDByTitle(session.connect, titles_level_1_cats);
        }

        CreateLinksOutByLinksIn(session.category_nodes);

        if (null != session.dump) {
            session.dump.DotOpen(getDotFilePrefix(session) + "_01_category.dot");
            Article.bdraw_categories = true;
            session.dump.Dump(articles, "Article nodes");
            session.dump.Dump(session.category_nodes, "Category nodes");
            session.dump.BatEnd();
        }
    }

    /**
     * Creates clusters of categories.
     * <p>
     * Links from categories to articles are filled first (see
     * {@link #fillLinksFromCategoryToArticles}), then initial clusters and
     * edges between them are created by <code>Preprocessing</code>. After
     * that, while the first edge in <code>Edge.WEIGHT_ORDER</code> has weight
     * not greater than <code>max_cluster_weight</code>, its second vertex is
     * merged into the first one.
     *
     * @param categories         map of categories
     * @param articles           map of articles
     * @param max_cluster_weight the maximum allowed clusters' weight (size)
     * @return the list of remaining clusters
     */
    public static List<ClusterCategory> getCategoryClusters(
            Map<Integer, Category> categories,
            Map<Integer, Article> articles, int max_cluster_weight) {

        // Preprocessing.
        fillLinksFromCategoryToArticles(articles, categories);
        List<ClusterCategory> clusters = Preprocessing.createInitialClusters(articles, categories);
        List<Edge> edges = Preprocessing.createEdgesBetweenClusters(clusters, categories);

        // Clustering algorithm.
        Collections.sort(edges, Edge.WEIGHT_ORDER);
        while (!edges.isEmpty() && max_cluster_weight >= edges.get(0).getWeight()) {
            Edge e = edges.get(0);

            // merge e.c2 to e.c1
            List<Edge> remove_edges = e.Merge();

            // remove cluster c2 and the edges reported by the merge
            clusters.remove( e.getVertex2() );
            edges.removeAll( remove_edges );

            // Re-sort edges: presumably the merge changes weights of the
            // remaining edges - TODO confirm.
            Collections.sort(edges, Edge.WEIGHT_ORDER);
        }
        return clusters;
    }

    /**
     * Dumps clusters, articles and categories to the dot file
     * <code>&lt;source title&gt;_&lt;filename_suffix&gt;.dot</code>.
     * Articles are presented as separated nodes.
     * Does nothing if <code>session.dump</code> is <code>null</code>.
     *
     * @param session         the current session
     * @param articles        map of articles to be dumped
     * @param clusters        clusters of categories to be dumped
     * @param filename_suffix suffix of the dot file name (without extension)
     */
    public static void dumpClusterCategoryArticle(SessionHolder session, Map<Integer, Article> articles,
                                                  List<ClusterCategory> clusters, String filename_suffix) {
        if (null == session.dump) {
            return;
        }
        session.dump.DotOpen(getDotFilePrefix(session) + "_" + filename_suffix + ".dot");
        session.dump.DumpCluster(null, null, clusters, "Clusters");
        session.dump.Dump(articles, "Article nodes");
        session.dump.Dump(session.category_nodes, "Category nodes");
        session.dump.BatEnd();
    }

    /**
     * Dumps clusters and categories to the dot file
     * <code>&lt;source title&gt;_&lt;filename_suffix&gt;.dot</code>.
     * Articles are presented within cluster box, i.e. they are passed to
     * <code>DumpCluster</code> and are not dumped as separate nodes.
     * Does nothing if <code>session.dump</code> is <code>null</code>.
     *
     * @param session         the current session
     * @param articles        map of articles passed to the cluster dump
     * @param clusters        clusters of categories to be dumped
     * @param filename_suffix suffix of the dot file name (without extension)
     */
    public static void dumpClusterCategorywithListArticles(SessionHolder session, Map<Integer, Article> articles,
                                                           List<ClusterCategory> clusters, String filename_suffix) {
        if (null == session.dump) {
            return;
        }
        session.dump.DotOpen(getDotFilePrefix(session) + "_" + filename_suffix + ".dot");
        session.dump.DumpCluster(articles, session.category_nodes, clusters, "Clusters");
        session.dump.Dump(session.category_nodes, "Category nodes");
        session.dump.BatEnd();
    }

    /*
     * Not implemented yet (design notes kept from earlier drafts).
     *
     * Create (session, article_nodes, max_levels):
     *   Create DCEL of categories and dump it to a GraphViz dot file.
     *   session.categories_max_steps is the maximum allowed number of passed
     *   categories (in iterative search), max_levels is the maximum allowed
     *   level of extracted categories; the result is the map of nodes which
     *   are categories.
     *   To be reformulated: for each article node get the (limited) list of
     *   categories with links 1) from category to article and vice versa,
     *   2) from category to (and from) category. There are two kinds of
     *   links: bottom-link (cl_from) and up-link (cl_to).
     *
     * removeDanglingVertices (session, categories):
     *   Goal
     *     - save only categories, which tie different articles,
     *     - remove set of categories, which belong to only one category.
     *   Algorithm
     *     Input: DCEL = Category + Articles
     *     1. FOR each article A
     *     2.   FOR each category C of article A
     *     3.     IF C is not marked THEN
     *     4.       start depth-first search from C, store categories to the set S;
     *     5.       IF the article B was encountered AND B<>A THEN mark categories S
     *     6.       ELSE remove categories S
     *     7.     ENDIF
     *     8.   ENDFOR
     *     9. ENDFOR
     *
     * SetCategory (session, article_nodes):
     *   Set value Article.id_categories[] for each article node.
     */

    /**
     * It is supposed that the categories in the map have valid links in the
     * array <code>links_in</code>; these arrays are used here to create the
     * arrays <code>links_out</code>.
     * <p>
     * For every category with non-null <code>links_in</code> the field
     * <code>links_out</code> is replaced by a copy of <code>links_in</code>;
     * categories with <code>null</code> links_in are left unchanged.
     * <p>
     * NOTE(review): an earlier (disabled) variant added the id of the category
     * to <code>links_out</code> of every category listed in its
     * <code>links_in</code>, i.e. it inverted the links instead of copying
     * them. Confirm which behaviour is intended.
     *
     * @param categories map of categories to be updated in place
     */
    public static void CreateLinksOutByLinksIn(Map<Integer, Category> categories) {
        for (Category c : categories.values()) {
            if (null != c.links_in) {
                c.links_out = new int[c.links_in.length];
                System.arraycopy(c.links_in, 0, c.links_out, 0, c.links_in.length);
            }
        }
    }
}