/*
* Hyponyms.java - Calculates distance between articles via categories.
*
* Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.experiment;
import wikipedia.util.StringUtil;
import wikipedia.kleinberg.SessionHolder;
import wikipedia.sql.Connect;
import wikipedia.sql.PageTable;
/** Calculates distance between articles via categories. The table cat_count
* is used. It contains number of hyponyms (sub-categories + articles) for each
* category.
*
* Table cat_count could be built by MySQL stored procedure. See instructions
* in the file synarcher/sql_procedures/hyponyms/readme_ic.txt
*
* More information about IC distance metric see in
*
* 1) [Resnik95]. Resnik P. Disambiguating noun groupings with respect to
* WordNet senses. In Proceedings of the 3rd Workshop on Very Large Corpora.
* MIT, June, 1995. http://xxx.lanl.gov/abs/cmp-lg/9511006
*
* 2) [Resnik99]. Resnik P. Semantic similarity in a taxonomy: an
* information-based measure and its application to problems of ambiguity in
* natural language. – Journal of Artificial Intelligence Research (JAIR), 1999.
* – Vol. 11, No. , pp. 95-130.
* http://www.cs.washington.edu/research/jair/abstracts/resnik99a.htm
*
* and metric's adaptation to Wikipedia in (used here):
*
* 3) [Strube06] Strube M., Ponzetto S. WikiRelate! Computing semantic relatedness using
* Wikipedia. In Proceedings of the 21st National Conference on Artificial
* Intelligence (AAAI 06). Boston, Mass., July 16-20, 2006. [to appear]
* http://www.eml-research.de/english/research/nlp/publications.php
*/
public class Hyponyms {
/** Gets information content (ic) by a title of category page. The field
* cat_count.ic is used.
*/
/*public static float getIC(String category_title) {return 0; }*/
/** Gets title of common category of two pages with the maximum value of
* information content (IC), i.e. the most specific category parent.
* The field cat_count.ic is used.
*
* @return CatCount object with initialized fields, null if common
* categories are absent
*/
public static CatCount getCommonCategoryWithMaxIC(String page_title1, String page_title2,
SessionHolder session) {
int page_id1 = PageTable.getIDByTitle(session.connect, page_title1);
int page_id2 = PageTable.getIDByTitle(session.connect, page_title2);
// get list of parent categories for page 1 and 2
String categories1[] = session.category_black_list.getCategoryUpIteratively(page_id1, null);
String categories2[] = session.category_black_list.getCategoryUpIteratively(page_id2, null);
// intersect lists
String cat12[] = StringUtil.intersect(categories1, categories2);
// get category row with maximum IC (for each in intersection)
return CatCount.getMaxIC(session.connect.conn, cat12);
}
/** Calculates and prints IC for 353 pairs of words (may be synonyms).
*/
public static void dumpICWordSim353(SessionHolder session) {
System.out.println("getCommonCategoryWithMaxIC");
int i = 0;
CatCount cc;
System.out.println ("\nThe 353 words pairs are processing.\n");
System.out.println ("N, word1, word2, human, InformationContent, NearestCommonCategoryParent");
WordSim353 wordsim353 = new WordSim353();
int skip = 0;
for(WordSim w:wordsim353.data) {
i++;
String word1 = StringUtil.UpperFirstLetter(w.word1);
String word2 = StringUtil.UpperFirstLetter(w.word2);
//System.out.println ("The word Latin1ToUTF8 '"+Encodings.Latin1ToUTF8(all_words[i])+"' is processing...");
//word1 = "Stupid";
//word2 = "Smart";
cc = Hyponyms.getCommonCategoryWithMaxIC(word1, word2, session);
if (null == cc || cc.getInformationContent() < -0.9) { // skip: ic = -1 for Disambiguation pages, since there is common sub-categories of Main_page
skip ++;
//System.out.println (i + ", " + word1 + ", " + word2 + ", " + w.sim);
//System.out.println (i + ", " + word1 + ", " + word2 + ", " + w.sim + ", 0");
} else {
System.out.println (i + ", " + word1 + ", " + word2 + ", " + w.sim + ", " + cc.getInformationContent() + ", " + cc.getPageTitle());
}
//if( i > 7)
//break;
}
System.out.println("Skipped "+ skip + " items.");
}
}