/* * CategoryBlackList.java * * Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/ * Distributed under GNU Public License. */ package wikipedia.kleinberg; import wikipedia.data.ArticleIdAndTitle; import wikipedia.sql.*; import java.util.*; class LinkId { public List<Integer> dest = new ArrayList<Integer>(); }; /** The categories in the black list (e.g. in black_array_ru or black_array_en) * help to mark articles which have small possibilitiy to be synonyms * for common words, e.g. categories "Years", "Geography", etc. */ public class CategoryBlackList { private SessionHolder session; private final static String[] NULL_STRING_ARRAY = new String[0]; private final static int[] NULL_INT_ARRAY = new int[0]; /** Russian */ private final String[] black_array_ru = {"Страны", "Века", "Календарь", "География", "География России", "Края_России", "Области России", "Города России", "Столицы", "Города", "Персоналии", "Правители России", "Астрономы_России", }; public List<String> ru; private final String[] skip_array_ru = {"Stub"}; public List<String> skip_ru; /** English */ private final String[] black_array_en = {"Years", "Calendars", "Geography", "Colleges_and_universities", "Scientists", "Psychologists", "Philosophers"}; public List<String> en; private final String[] skip_array_en = {"Stub"}; public List<String> skip_en; private List<String> black_list; /** list of categories' names which should be skipped, omitted, e.g. 'All', 'Country', 'Time', * the value of list 'ru' or 'en' should be assigned to */ /* private List<String> skip_list;*/ /** This categories should be simply skipped, because they have no meaning * e.g. "Stub" */ /** The number of passed (treated) categories after searching for categories * with black list.*/ private int passed_steps; /** The maximum allowed number of passed (treated) categories (searching for * categories with black list). It is search constrain, alternative to * parameter 'categories_max_steps'. 
*/ private int max_steps; /** Number of categories passed after removing via black-list */ private int total_categories_passed; /* current set of black list of categories, it depends on black_list and CategoryBlackList.max_steps */ private Set<String> category_titles_black_list; /* current set of white list of categories, it depends on black_list and CategoryBlackList.max_steps */ private Set<String> category_titles_white_list; /** Creates a new instance of CategoryBlackList */ public CategoryBlackList(SessionHolder session) { this.session = session; black_list = null; // copy black_array_ru to black_list_ru ru = new ArrayList<String>(); for(int i=0; i<black_array_ru.length; i++) { //String s = Encodings.UTF8ToLatin1(black_array_ru[i]); //ru.add(black_array_ru[i]); ru.add(session.connect.enc.EncodeFromJava(black_array_ru[i])); //ru.add(Encodings.FromTo(black_array_ru[i], Encodings.enc_java_default, Encodings.enc_int_default)); } en = new ArrayList<String>(); for(int i=0; i<black_array_en.length; i++) { //String s = Encodings.UTF8ToLatin1(black_array_en[i]); en.add(black_array_en[i]); } category_titles_black_list = new HashSet<String>(); category_titles_white_list = new HashSet<String>(); } public void init(List<String> new_black_list, int categories_max_steps) { black_list = new_black_list; max_steps = categories_max_steps; total_categories_passed = 0; category_titles_black_list.clear(); if(null != black_list && 0 < black_list.size()) { category_titles_black_list.addAll(black_list); } category_titles_white_list.clear(); } public void setBlackList(List<String> black_list) { this.black_list = black_list; } public List<String> getBlackList() { return black_list; } public void setMaxSteps(int max_steps) { this.max_steps = max_steps; } /** Gets the number of passed (treated) categories after searching for * categories with black list.*/ public int getPassedSteps() { return passed_steps; } /** Gets total number of passed (treated) categories after removing via * 
black-list. */ public int getTotalCategoriesPassed() { return total_categories_passed; } /** Fills session.category_nodes, if blacklist is empty. * * Without this func, the table of categories is empty when blacklist is * null. */ public void fillCategoryNodesIfBlackListEmpty (List<Article> articles) { if (null != black_list) return; int save_max_steps = max_steps; max_steps = 1; for(Article a:articles) { //String[] categories = getCategoryUpIteratively (a.page_id, black_list); } max_steps = save_max_steps; } /** Gets first level categories of the article with id='cl_from'. * * !Be careful with session.skipTitlesWithSpaces(), see example in * inBlackList(). */ public static String[] getFirstLevelCategories (SessionHolder session,int cl_from) { String[] add = null; if(0 < cl_from) { add = Categorylinks.GetCategoryTitleByArticleID(session.connect, cl_from); } else { // redirect page has negative id int cl_2 = Links.getIdToByIDFrom(session, cl_from, PageNamespace.MAIN); if(0 != cl_2) { add = Categorylinks.GetCategoryTitleByArticleID(session.connect, cl_2); } } return add; } /** Gets first level categories' IDs of the article with 'id'. */ public static int[] getFirstLevelCategoriesID (SessionHolder session,int id) { boolean save_skipTitlesWithSpaces = session.skipTitlesWithSpaces(false); String[] add = CategoryBlackList.getFirstLevelCategories (session, id); session.skipTitlesWithSpaces(save_skipTitlesWithSpaces); if (null == add) return NULL_INT_ARRAY; return Category.getIDByTitle(session.connect, Arrays.asList(add)); } /** Returns true, if category blacklist already containts title. */ public boolean inBlackListAlready (String title) { return category_titles_black_list.contains(title); } /** Compare categories (and parents) with the blacklist. * If the category (or parent) is found, which is presented in blacklist then * the name of this category will be returned, else - the null. 
* * @param cl_from id of from page, if it is < 0 then * this is redirect page * * @param first_level_categories is titles of first level categories * if !=null then it will be filled by * categories of first level, i.e. by id of categories * which are nearest to the article. * * If there are no categories in black-list * then first_level_categories will contain all * categories of the first level of the article. * Else when it is encountered an element from * the blacklist, and last element in it is * the name of element from the blacklist. * * @param source_article_id id of source article, return null * if id of source page == id of from page * * !Side effects: * This function sets value for the variable "passed_steps" (number of passed categories). */ public String inBlackList (int cl_from, List<String> first_level_categories, int source_article_id) { //public String inBlackList (int cl_from, List<Integer> first_level_categories) { if (cl_from == source_article_id) // Suppose that source article is not in the blacklist return null; // in order to escape problems in getAllHubsSortedByY() // started calculations from the source article if (null == black_list) return null; boolean save_skipTitlesWithSpaces = session.skipTitlesWithSpaces(false); String[] add = getFirstLevelCategories (session, cl_from); if (null == add) { session.skipTitlesWithSpaces(save_skipTitlesWithSpaces); return null; } if (null != first_level_categories) first_level_categories.addAll(Arrays.asList(add)); // test simple: whether the first level categories belong to the current black list? 
for(int i=0; i<add.length; i++) { if (category_titles_black_list.contains( add[i] )) { session.skipTitlesWithSpaces(save_skipTitlesWithSpaces); return add[i]; } } // test complex: check categories recursively in black list for(String a : add) { if (category_titles_white_list.contains(a)) continue; int id = PageTable.getCategoryIDByTitle(session.connect, a); if (0 == id) continue; String[] categories = getCategoryUpIteratively (id, black_list); if (null != categories && 0 < categories.length) { // test whether the last element is presented in blacklist String last = categories[ categories.length - 1 ]; if (black_list.contains( last )) { category_titles_black_list.add( a ); session.skipTitlesWithSpaces(save_skipTitlesWithSpaces); return last; } category_titles_white_list.add(a); } //if (null != first_level_categories) // first_level_categories.add(id); } session.skipTitlesWithSpaces(save_skipTitlesWithSpaces); return null; } /** * Get list of categories: categories, parents of categories, etc. * List is limited by max_steps. * * @param cl_from the id of article which categories will be sought * (should be >=0, else it is id of redirect page) * @param id_categories id of categories of first level (!Attention: * the function will update this variable), * if ==null then function don't update it * * @return If local_black_list == null then it returns all categories (<= max_steps). * If local_black_list != null then the function stops the search * when it encounters an element from this blacklist, and last element in String[] * is the name of element from this blacklist. * * !Side effect: session.category_nodes is updated. 
     */
    public String[] getCategoryUpIteratively(int cl_from, List<String> local_black_list) {
        passed_steps = 0;

        // Temporarily disable title-with-spaces skipping; restored at each exit.
        boolean save_skipTitlesWithSpaces = session.skipTitlesWithSpaces(false);

        if(0 > cl_from) {
            // redirect page has negative id; resolve it to the target page id
            cl_from = Links.getIdToByIDFrom(session, cl_from, PageNamespace.MAIN);
        }
        if(0 == cl_from) {
            session.skipTitlesWithSpaces(save_skipTitlesWithSpaces);
            return null;
        }
        Connect connect = session.connect;

        // titles of all categories gathered so far (the return value)
        List<String> categories = new ArrayList<String>();
        // queue of category ids still to be expanded
        List<Integer> categories_id = new ArrayList<Integer>();
        // candidate Category nodes; merged into session.category_nodes only
        // when nothing from the blacklist was encountered
        HashMap<Integer, Category> local_map_category = new HashMap<Integer, Category>();
        // map from category id to list of parents categories id
        Map<Integer, LinkId> local_links_in = new HashMap<Integer, LinkId>();

        categories_id.add(cl_from);
        boolean found_in_black_list = false;

        CATEGORIES_CYCLE:
        while (0 < categories_id.size()) {
            int page_id = categories_id.remove( 0 );    // breadth-first search
            // (removing the last element instead would give depth-first search)

            String[] add = Categorylinks.GetCategoryTitleByArticleID(connect, page_id);
            if (null == add)
                continue;

            // add new categories to the queue
            for(int i=0; i<add.length; i++) {
                if (passed_steps ++ >= max_steps)
                    break CATEGORIES_CYCLE;    // search constraint reached

                // this check is needed because the same category title can be
                // reached via different ids
                if (!categories.contains(add[i]))
                    categories.add( add[i]);

                String latin1 = add[i];
                if (null != local_black_list && local_black_list.contains(latin1)) {
                    // last element of 'categories' contains string from blacklist
                    found_in_black_list = true;
                    break CATEGORIES_CYCLE;
                }

                int candidate_id = connect.page_table.getCategoryIDByTitle(connect, latin1);
                if (0 != candidate_id) {
                    categories_id.add(candidate_id);

                    // reuse an already known Category node when possible,
                    // otherwise create a fresh one
                    Category c;
                    boolean c_new = false;
                    if(local_map_category.containsKey(candidate_id)) {
                        c = local_map_category.get(candidate_id);
                    } else if(session.category_nodes.containsKey(candidate_id)) {
                        c = session.category_nodes.get(candidate_id);
                    } else {
                        c = new Category();
                        c_new = true;
                        c.page_id = candidate_id;
                        c.page_title = add[i];
                    }

                    // record the arc page_id -> candidate_id, without duplicates
                    LinkId l;
                    l = local_links_in.get(candidate_id);
                    if (null == l) {
                        l = new LinkId();
                        local_links_in.put(candidate_id, l);
                    }
                    if (!l.dest.contains(page_id))
                        l.dest.add(page_id);

                    if(c_new)
                        local_map_category.put(c.page_id, c);
                }
            }
        }
        total_categories_passed += passed_steps;

        if (!found_in_black_list) {
            // add vertices
            session.category_nodes.putAll(local_map_category);

            // add arcs: session.category_nodes (local_links_in.key)
            //           .links_in.add (local_links_in.value)
            for(Integer id_source : local_links_in.keySet()) {
                LinkId l = (LinkId)local_links_in.get(id_source);
                assert(session.category_nodes.containsKey(id_source));
                Category c = session.category_nodes.get(id_source);

                // c.links_in[] = all = unique_value( c.links_in[] + l.dest )
                List<Integer> all = new ArrayList<Integer>();
                if(null != c.links_in) {
                    for(int i=0; i<c.links_in.length; i++) {
                        all.add(c.links_in[i]);
                    }
                }
                for(Integer id_dest : l.dest) {
                    if (!all.contains(id_dest))
                        all.add(id_dest);
                }
                c.links_in = new int[all.size()];
                int i=0;
                for(Integer a : all) {
                    c.links_in[i++] = a;
                }
            }
        }
        session.skipTitlesWithSpaces(save_skipTitlesWithSpaces);

        if (0 == categories.size())
            return null;
        else
            return (String[])categories.toArray(NULL_STRING_ARRAY);
    }

    /** Returns only pairs (id, title) of articles which are absent in blacklist */
    public ArticleIdAndTitle[] DeleteUsingBlackList(ArticleIdAndTitle[] aid_source) {
        if (null == black_list)
            return aid_source;

        boolean save_skipTitlesWithSpaces = session.skipTitlesWithSpaces(false);

        List<ArticleIdAndTitle> aid = new ArrayList<ArticleIdAndTitle>(aid_source.length); // or less size
        for(ArticleIdAndTitle it:aid_source) {
            // skip articles already rejected in earlier calls
            if(session.removed_articles.hasTitle(it.title))
                continue;

            String black_category = inBlackList (it.id, null, session.source_article_id);
            if (null == black_category) {
                aid.add(it);
            } else {
                // remember the rejection and optionally log it to the dump file
                session.removed_articles.addTitle(it.title);
                if (null != session.dump) {
                    session.dump.file.PrintNL( String.format("Removed:%-20s steps:%3d blacklist category:%s (String[] DeleteUsingBlackList)",
                        it.title, session.category_black_list.passed_steps, black_category));
                    session.dump.file.Flush();
                }
            }
        }
        session.skipTitlesWithSpaces(save_skipTitlesWithSpaces);

        return (ArticleIdAndTitle[])aid.toArray(ArticleIdAndTitle.NULL_ARTICLEIDANDTITLE_ARRAY);
    }

    /** Returns only id of articles which are absent in blacklist.
     * Result should contains no more than n_limit elements.
     * Algorithm:
     * 1) Random permutation of elements in id (if b_rand is true)
     * 2) Take good id (absented in blacklist) till n_limit elements will be gathered.
     * Remark: if n_limit is -1 then return all id which are absent in blacklist.
* GetRandNodeArray(result, n_limit); */ public ArticleIdAndTitle[] DeleteUsingBlackList (boolean b_rand, ArticleIdAndTitle[] aid_source, int n_limit) { if (null == black_list) return aid_source; List<ArticleIdAndTitle> result = new ArrayList<ArticleIdAndTitle>(aid_source.length); List<ArticleIdAndTitle> aid_list = Arrays.asList(aid_source); if(b_rand) { // Random permutation of elements in id Collections.shuffle(aid_list); //id = RandShuffle.permuteRandomly(id_source); } boolean save_skipTitlesWithSpaces = session.skipTitlesWithSpaces(false); BLACK_CATEGORY_CYCLE: //for(i=0; i<id.length; i++) { for(ArticleIdAndTitle aid:aid_list) { if(session.removed_articles.hasId(aid.id)) continue; String black_category = inBlackList (aid.id, null, session.source_article_id); if (null == black_category) { result.add( aid ); if (n_limit!=-1 && result.size() >= n_limit) break BLACK_CATEGORY_CYCLE; } else { session.removed_articles.addId(aid.id); if (null != session.dump) { //String title = PageTable.getTitleByID(session.connect, id[i]); String title = aid.title; // + " id:" + id[i] + session.dump.file.PrintNL( String.format("Removed:%-20s steps:%3d blacklist category:%s (int[] DeleteUsingBlackList)", title, session.category_black_list.passed_steps, black_category)); session.dump.file.Flush(); } } } session.skipTitlesWithSpaces(save_skipTitlesWithSpaces); return (ArticleIdAndTitle[])result.toArray(ArticleIdAndTitle.NULL_ARTICLEIDANDTITLE_ARRAY); } }