package wikipedia.sql_idf;

import wikipedia.sql.Connect;
import wikipedia.util.PrintfFormat;

import java.util.*;

import junit.framework.TestCase;

public class WikIDFAPITest extends TestCase {

    public Connect idfruwiki_conn;
    public Connect idfsimplewiki_conn;

    public WikIDFAPITest(String testName) {
        super(testName);
    }

    @Override
    protected void setUp() throws Exception {
        super.setUp();

        // Open connections to the Russian and Simple English WikIDF databases.
        idfruwiki_conn = new Connect();
        idfruwiki_conn.Open(Connect.IDF_RU_HOST, Connect.IDF_RU_DB,
                            Connect.IDF_RU_USER, Connect.IDF_RU_PASS);

        idfsimplewiki_conn = new Connect();
        idfsimplewiki_conn.Open(Connect.IDF_SIMPLE_HOST, Connect.IDF_SIMPLE_DB,
                                Connect.IDF_SIMPLE_USER, Connect.IDF_SIMPLE_PASS);
    }

    @Override
    protected void tearDown() throws Exception {
        idfruwiki_conn.Close();
        idfsimplewiki_conn.Close();
        super.tearDown();
    }

    /**
     * Test of getTerms method, of class WikIDFAPI.
     */
    public void testGetTerms_simple() {
        System.out.println("getTerms_simple");

        java.sql.Connection conn = idfsimplewiki_conn.conn;

        // An empty page title should yield an empty result list.
        String page_title = "";
        List<TermPage> result = WikIDFAPI.getTerms(conn, page_title);
        assertEquals(0, result.size());

        // The article "Green_tea" should have at least 28 unique words
        // in the wikidf database.
        page_title = "Green_tea";
        Page p = wikipedia.sql_idf.Page.get(conn, page_title);
        if (null == p || 0 == p.getPageID()) {
            System.out.println("\nSkipped. WikIDF DB is empty. The test is valid only for a parsed Simple English Wikipedia.");
            return;
        }

        result = WikIDFAPI.getTerms(conn, page_title);
        assertTrue(result.size() >= 28);

        System.out.println("\nLemmas of \"" + page_title +
            "\": doc_freq (number of docs with term) : term_freq (frequency in the article):");
        for (TermPage tp : result) {
            System.out.println(tp.getTerm().getLemma() +
                " (" + tp.getTerm().getDocFreq() + ") " +
                " : " + tp.getTermFreq());
        }
    }

    /**
     * Test of getTermsSortedByTF_IDF method, of class WikIDFAPI
     * (Simple English Wikipedia).
     */
    public void testGetTermsSortedByIDF_simple() {
        System.out.println("getTermsSortedByIDF_simple");

        java.sql.Connection conn = idfsimplewiki_conn.conn;
        int n_pages = wikipedia.sql_idf.Page.countPages(conn);

        // An empty page title should yield an empty result list.
        String page_title = "";
        List<TermPage> result = WikIDFAPI.getTermsSortedByTF_IDF(conn, page_title, n_pages);
        assertEquals(0, result.size());

        // The article "Green_tea" should have at least 28 unique words
        // in the wikidf database.
        page_title = "Green_tea";
        Page p = wikipedia.sql_idf.Page.get(conn, page_title);
        if (null == p || 0 == p.getPageID()) {
            System.out.println("\nSkipped. WikIDF DB is empty. The test is valid only for a parsed Simple English Wikipedia.");
            return;
        }

        result = WikIDFAPI.getTermsSortedByTF_IDF(conn, page_title, n_pages);
        assertTrue(result.size() >= 28);

        // Print the terms (lemmas) of the article "Green_tea".
        System.out.println("\nPage: \"" + page_title + "\"");
        System.out.println("TF*IDF : lemma : term_freq (term frequency in the article) : doc_freq (number of docs with term)");
        System.out.println("Corpus has " + n_pages + " pages.");

        // Check that the list is sorted by TF-IDF in descending order
        // (see the tfIdfSketch helper below for the weighting this ordering assumes).
        double prev_tf_idf = Double.MAX_VALUE;
        for (TermPage tp : result) {
            double tf_idf = tp.getTF_IDF();
            assertTrue(tf_idf <= prev_tf_idf);
            prev_tf_idf = tf_idf;

            System.out.println(tf_idf + " : " + tp.getTerm().getLemma() +
                " : " + tp.getTermFreq() + " : " + tp.getTerm().getDocFreq());
        }
    }
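    /**
     * Illustrative sketch only, not WikIDFAPI's actual code: the classic
     * TF-IDF weighting, tf * log(N / df), which the descending-order checks
     * in the tests above and below rely on. The real value comes from
     * TermPage.getTF_IDF(); this helper name and formula are assumptions
     * for reference and are not called by the tests.
     */
    private static double tfIdfSketch(int term_freq, int doc_freq, int n_pages) {
        if (0 == doc_freq || 0 == n_pages) {
            return 0.0; // a term absent from the corpus carries no weight
        }
        return term_freq * Math.log((double) n_pages / doc_freq);
    }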
    /**
     * Test of getTermsSortedByTF_IDF method, of class WikIDFAPI
     * (Russian Wikipedia).
     */
    public void testGetTermsSortedByIDF_ru() {
        System.out.println("getTermsSortedByIDF_ru");

        java.sql.Connection conn = idfruwiki_conn.conn;
        int n_pages = wikipedia.sql_idf.Page.countPages(conn);

        // The article "Через_тернии_к_звёздам_(фильм)" ("Per Aspera Ad Astra (film)")
        // should have at least 87 unique words in the wikidf database.
        String page_title = "Через_тернии_к_звёздам_(фильм)";
        Page p = wikipedia.sql_idf.Page.get(conn, page_title);
        if (null == p || 0 == p.getPageID()) {
            System.out.println("\nSkipped. WikIDF DB is empty. The test is valid only for a parsed Russian Wikipedia.");
            return;
        }

        List<TermPage> result = WikIDFAPI.getTermsSortedByTF_IDF(conn, page_title, n_pages);
        assertTrue(result.size() >= 87);

        // Print the terms (lemmas) of the article.
        System.out.println("\nPage: \"" + page_title + "\"");
        System.out.println("TF*IDF : lemma : term_freq (term frequency in the article) : doc_freq (number of docs with term)");
        System.out.println("Corpus has " + n_pages + " pages.");

        // Check that the list is sorted by TF-IDF in descending order.
        double prev_tf_idf = Double.MAX_VALUE;
        for (TermPage tp : result) {
            double tf_idf = tp.getTF_IDF();
            assertTrue(tf_idf <= prev_tf_idf);
            prev_tf_idf = tf_idf;

            System.out.println(new PrintfFormat("%.2lg").sprintf(tf_idf) + "\t" +
                tp.getTerm().getLemma() + "\t" +
                tp.getTermFreq() + "\t" +
                tp.getTerm().getDocFreq());
        }
    }

    /**
     * Test of getPages method, of class WikIDFAPI (Simple English Wikipedia):
     * find pages which contain both lemmas "GREEN" and "TEA".
     */
    public void testGetPages_simple() {
        System.out.println("getPages_simple");

        java.sql.Connection conn = idfsimplewiki_conn.conn;

        String lemma1 = "GREEN";
        String lemma2 = "TEA";
        List<TermPage> list1 = WikIDFAPI.getPages(conn, lemma1);
        List<TermPage> list2 = WikIDFAPI.getPages(conn, lemma2);

        // Keep only pages that occur in both lists, then sort by term frequency
        // (see the intersectByTitleSketch helper below for the idea behind the intersection).
        List<TermPage> intersection = TermPage.intersectPageTitles(list1, list2);
        Collections.sort(intersection, TermPage.TF_ORDER);

        System.out.println("\nPages which contain: " + lemma1 + " AND " + lemma2 + ":");
        System.out.println("term_freq (two terms frequency in the page) : page title : number of words (page length)");
        for (TermPage tp : intersection) {
            System.out.println(tp.getTermFreq() + " : " +
                tp.getPageTitle() + " : " + tp.getPageWordCount());
        }
    }

    /**
     * Test of getPages method, of class WikIDFAPI (Russian Wikipedia):
     * find pages which contain both lemmas "чародей" (sorcerer) and
     * "ВОЛШЕБНИК" (wizard).
     */
    public void testGetPages_ru() {
        System.out.println("getPages_ru");

        java.sql.Connection conn = idfruwiki_conn.conn;

        String lemma1 = "чародей";    // "sorcerer"
        String lemma2 = "ВОЛШЕБНИК";  // "wizard"
        List<TermPage> list1 = WikIDFAPI.getPages(conn, lemma1);
        List<TermPage> list2 = WikIDFAPI.getPages(conn, lemma2);

        // Keep only pages that occur in both lists, then sort by term frequency.
        List<TermPage> intersection = TermPage.intersectPageTitles(list1, list2);
        Collections.sort(intersection, TermPage.TF_ORDER);

        System.out.println("\nPages which contain: " + lemma1 + " AND " + lemma2 + ":");
        System.out.println("term_freq (two terms frequency in the page) : page title : number of words (page length)");
        for (TermPage tp : intersection) {
            System.out.println(new PrintfFormat("%d").sprintf(tp.getTermFreq()) + "\t" +
                tp.getPageTitle() + "\t" + tp.getPageWordCount());
        }
    }
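    /**
     * Illustrative sketch only, not called by the tests above: one way to
     * intersect two getPages() result lists by page title, similar to what
     * TermPage.intersectPageTitles appears to do. The helper name and this
     * implementation are assumptions for illustration; the tests use the
     * real TermPage.intersectPageTitles.
     */
    private static List<TermPage> intersectByTitleSketch(List<TermPage> a, List<TermPage> b) {
        // Collect the titles present in the second list.
        Set<String> titles = new HashSet<String>();
        for (TermPage tp : b) {
            titles.add(tp.getPageTitle());
        }
        // Keep entries of the first list whose title also occurs in the second.
        List<TermPage> result = new ArrayList<TermPage>();
        for (TermPage tp : a) {
            if (titles.contains(tp.getPageTitle())) {
                result.add(tp);
            }
        }
        return result;
    }
}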