/* WikIDFAPI.java - API of TF-IDF database of wiki. * * Copyright (c) 2005-2008 Andrew Krizhanovsky /aka at mail.iias.spb.su/ * Distributed under GNU Public License. */ package wikipedia.sql_idf; import java.util.*; /** API of TF-IDF database of wiki. It's a higher level than * the requests to a separate database tables (page, term). */ public class WikIDFAPI { private final static List<TermPage> NULL_TERMPAGE_LIST = new ArrayList<TermPage>(0); private final static List<Page> NULL_PAGE_LIST = new ArrayList <Page>(0); /** Gets all terms for the page titled page_title. */ public static List<TermPage> getTerms (java.sql.Connection conn, String page_title) { // 1. SELECT * FROM page WHERE page_title="Japanese_tea_ceremony" // 2. SELECT * FROM term_page WHERE page_id=29243 // term_id // 3. SELECT * FROM term WHERE term.term_id IN (559092, 607182, 515136) // lemma, doc_freq Page p = wikipedia.sql_idf.Page.get(conn, page_title); if(null == p || 0 == p.getPageID()) return NULL_TERMPAGE_LIST; List<TermPage> tp_list = TermPage.getTermsByPageID(conn, p.getPageID()); Term.fillTerms(conn, tp_list); return tp_list; } /** Gets all terms for the page titled page_title, * terms are sorted by IDF: first are the most rare (in corpus) words. * * @param n_total_pages number of pages in the wiki corpus */ public static List<TermPage> getTermsSortedByTF_IDF (java.sql.Connection conn, String page_title, int n_total_pages) { List<TermPage> tp_list = WikIDFAPI.getTerms(conn, page_title); if(tp_list.size() > 1) { TermPage.calcTF_IDF(tp_list, n_total_pages); Collections.sort(tp_list, TermPage.TF_IDF_ORDER); } return tp_list; } /** Gets all pages which contain the term (lemma), pages are sorted by TF (term frequency). */ public static List<TermPage> getPages (java.sql.Connection conn, String lemma) { // 1. SELECT * FROM term WHERE lemma="PROVINE" // term_id, doc_freq, corpus_freq // set size(doc_freq) // 2. SELECT * FROM term_page WHERE term_id=67657 // page_id, term_freq (sort by) // 3. SELECT * FROM page WHERE page_id IN (8772, ... ) // page_title, word_count Term t = Term.get(conn, lemma); List<TermPage> tp_list = TermPage.getPagesByTermID(conn, t); if(tp_list.size() > 0) { Page.fillPages(conn, tp_list); } return tp_list; } }