/*
 * FreqLink.java - Calculates the frequency of l_from and l_to for each article.
 * It will be used to find the most frequently used words (stop-words),
 * which should be skipped while searching for synonyms.
 *
 * Copyright (c) 2005 Andrew Krizhanovsky /aka at mail.iias.spb.su/
 * Distributed under GNU Public License.
 */

/*
package wikipedia;

import java.sql.*;

public class FreqLink {

    public FreqLink() {
    }
*/

    /** Goal: calculate the number of links_in and links_out for each article.
     *
     * Algorithm
     * ? 1. Get the next unique n_pack elements from the table links
     *      SQL:
     *   2. Take only links which are articles, i.e. type=???
     *      SQL:
     * ? 3.
     *
     * SELECT link_in, COUNT(link_in) FROM links WHERE links_in=749 GROUP BY article;
     * NOTE(review): the column names in the query above (link_in, links_in, article)
     * do not match the l_from / l_to columns used by the queries below - confirm.
     *
     * SELECT COUNT(*) FROM links;                      // total links
     * SELECT COUNT(*) FROM links GROUP BY l_from;      // unique links l_from
     * SELECT l_from, COUNT(l_from) AS size FROM links GROUP BY l_from ORDER BY size; // l_from detail statistics
     * SELECT l_to, COUNT(l_to) AS size FROM links GROUP BY l_to ORDER BY size;       // l_to detail statistics
     *
     * Analysis of the links table (Russian):
     *   total links: 306471; unique links l_from: 24530; unique l_to: 18446
     *   l_to field
     *      10 articles > 750 times;
     *     145 > 200 times
     *     520 > 100 times
     *   l_from field
     *      33 articles > 200 times
     *     380 > 100 times
     *
     * English links table:
     *   total links: 18 380 035; unique links l_from: 1 355 280 (8 min); unique l_to: ???
     *
     */
/*
    public void CreateFreqLinkTable() {
    }
*/

    /** Get an article's frequency in the table links: count(l_from) and count(l_to).
     * Example: Russian 'domra' (id=749):
     *   SELECT COUNT(*) FROM links WHERE l_from=749;    6
     *   SELECT COUNT(*) FROM links WHERE l_to=749;      3
     */
/*public void CountLFrom(int l_from) {
    }*/

/*
}
*/