/*
* TermPage.java
*
* Copyright (c) 2005-2008 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.sql_idf;
import java.sql.Statement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
/** Routines to work with the table term_page in wiki idf database.
*/
public class TermPage {
/** term identifier */
private Term term;
//public int term_id;
/** document identifier */
private Page page;
//public int page_id;
/** frequency of term (with term_id) in the document (with page_id) */
private int term_freq;
/** term frequency * inverse document frequency */
private double tf_idf;
private void init() {
term_freq = 0;
tf_idf = 0;
}
/** Creates a new instance of TermPage */
public TermPage() {
term = new Term();
page = new Page();
init();
}
/** To share one page for all terms within the same page */
public TermPage(Page _page) {
term = new Term();
page = _page;
init();
}
/** To share one term for all pages which contain the same term */
public TermPage(Term _term) {
term = _term;
page = new Page();
init();
}
private final static List<TermPage> NULL_TERMPAGE_LIST = new ArrayList<TermPage>(0);
private final static Map<Integer, TermPage> NULL_INTEGER_TERMPAGE_MAP = new HashMap<Integer, TermPage>(0);
private final static Map<Integer, Term> NULL_INTEGERTERM_MAP = new HashMap<Integer, Term> (0);
private final static Map<Integer, Page> NULL_INTEGERPAGE_MAP = new HashMap<Integer, Page> (0);
/** Gets term */
public Term getTerm() { return term; }
/** Sets term */
public void setTerm(Term _term)
{ term = _term; }
/** Gets term identifier */
public int getTermID() { return term.getTermID(); }
/** Sets term identifier */
public void setTermID(int _term_id)
{ term.setTermID(_term_id); }
/** Gets frequency of the term (with term_id) in the document (with page_id) */
public int getTermFreq(){ return term_freq; }
/** Sets frequency of the term (with term_id) in the document (with page_id) */
public void setTermFreq(int _term_freq)
{ term_freq = _term_freq; }
/** Gets value of term frequency * inverse document frequency */
public double getTF_IDF(){ return tf_idf; }
/** Sets value of term frequency * inverse document frequency */
public void setTF_IDF(double _tf_idf)
{ tf_idf = _tf_idf; }
/** Gets page */
public Page getPage() { return page; }
/** Gets page identifier */
public int getPageID() { return page.getPageID(); }
/** Gets the title of the page */
public String getPageTitle(){ return page.getPageTitle(); }
/** Gets a number of words in the article */
public int getPageWordCount(){ return page.getWordCount(); }
/** Gets term frequency by page ID and term ID from the table term_page.
*
* SQL:
* SELECT term_freq FROM term_page WHERE term_id=1 AND page_id=1;
*
* @return term frequency, 0 if it is absent in the table
*/
public static int getTermFreqInDocument (java.sql.Connection conn,int term_id,int page_id) {
int term_freq = 0;
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try {
s = conn.createStatement ();
// term_id, lemma, doc_freq
str_sql.append("SELECT term_freq FROM term_page WHERE term_id=");
str_sql.append(term_id);
str_sql.append(" AND page_id=");
str_sql.append(page_id);
s.executeQuery (str_sql.toString());
rs = s.getResultSet ();
if (rs.next ())
{
term_freq = rs.getInt("term_freq");
}
} catch(SQLException ex) {
System.err.println("SQLException (wikidf TermPage.java get()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return term_freq;
}
/** Gets list of term ID by page ID from the table term_page.
* Only one field of Term.term_id is filled here,
* others (lemma, doc_freq) are empty.
*
* SQL:
* SELECT term_id, term_freq FROM term_page WHERE page_id=29243
*
* @return terms, or empty array if they are absent
*/
public static List<TermPage> getTermsByPageID (java.sql.Connection conn,int page_id) {
List<TermPage> result = NULL_TERMPAGE_LIST;
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try {
s = conn.createStatement ();
// term_id, term_freq
//SELECT term_freq FROM term_page WHERE term_id
str_sql.append("SELECT term_id, term_freq FROM term_page WHERE page_id=");
str_sql.append(page_id);
s.executeQuery (str_sql.toString());
rs = s.getResultSet ();
if (rs.next ())
{
result = new ArrayList<TermPage>();
wikipedia.sql_idf.Page p = new Page();
p.setPageID(page_id);
// p.setPageTitle(_page_title);
do {
TermPage tp = new TermPage(p);
tp.term.setTermID( rs.getInt("term_id") );
tp.term_freq = rs.getInt("term_freq");
result.add(tp);
} while (rs.next ());
}
} catch(SQLException ex) {
System.err.println("SQLException (wikidf TermPage.java getTerms()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return result;
}
/** Gets list of pages which contain the term t.<br><br>
* <PRE>
* A. SELECT * FROM term_page WHERE term_id=67657
* page_id, term_freq (sort by)
*
* B. SELECT * FROM page WHERE page_id IN (8772, ... )
* page_title, word_count
*</PRE>
* @return pages, or empty array if they are absent
*/
public static List<TermPage> getPagesByTermID (java.sql.Connection conn,Term t) {
if(null == t) {
return NULL_TERMPAGE_LIST;
}
int doc_freq = t.getDocFreq();
int term_id = t.getTermID();
if(doc_freq <= 0 || term_id <= 0) {
return NULL_TERMPAGE_LIST;
}
List<TermPage> result = NULL_TERMPAGE_LIST;
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try {
s = conn.createStatement ();
str_sql.append("SELECT page_id, term_freq FROM term_page WHERE term_id=");
str_sql.append(term_id);
s.executeQuery (str_sql.toString());
rs = s.getResultSet ();
if (rs.next ())
{
result = new ArrayList<TermPage>(doc_freq); // fine, the number of docs is known in advance
do {
TermPage tp = new TermPage(t);
tp.page.setPageID( rs.getInt("page_id") );
tp.term_freq = rs.getInt("term_freq");
result.add(tp);
} while (rs.next ());
}
} catch(SQLException ex) {
System.err.println("SQLException (wikidf TermPage.java getPagesByTermID()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return result;
}
/** Counts number of pages which contain the term with term_id,
* i.e. number of rows in the table term_page with term_id.<br>
*
* SQL:
* SELECT COUNT(*) FROM term_page WHERE term_id=22;<br><br>
*
* Attention: this value <= doc_freq_max, see Term.incLemmaDocFreq.
* So if it's bigger then see value in table term.doc_freq
*/
public static int countPagesWithTerm (java.sql.Connection conn,int term_id) {
int doc_freq = 0;
Statement s = null;
ResultSet rs= null;
int size = 0;
StringBuffer str_sql = new StringBuffer();
try {
s = conn.createStatement ();
str_sql.append("SELECT COUNT(*) AS doc_freq FROM term_page WHERE term_id=");
str_sql.append(term_id);
s.executeQuery (str_sql.toString());
rs = s.getResultSet ();
if (rs.next ())
{
doc_freq = rs.getInt("doc_freq");
}
} catch(SQLException ex) {
System.err.println("SQLException (wikidf TermPage.java countPagesWithTerm()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return doc_freq;
}
/** Deletes records with the term_id and page_id from the table term_page.
*
* SQL example:
* DELETE FROM term_page WHERE term_id=1 AND page_id=1;
*/
public static void delete (java.sql.Connection conn,int term_id,int page_id) {
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try
{
s = conn.createStatement ();
str_sql.append("DELETE FROM term_page WHERE term_id=");
str_sql.append(term_id);
str_sql.append(" AND page_id=");
str_sql.append(page_id);
s.executeUpdate (str_sql.toString());
}catch(SQLException ex) {
System.err.println("SQLException (wikidf TermPage.java delete()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
/** Deletes records with the term_id from the table term_page.
*
* SQL example:
* DELETE FROM term_page WHERE term_id=1;
*/
public static void deleteByTermID (java.sql.Connection conn,int term_id) {
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try
{
s = conn.createStatement ();
str_sql.append("DELETE FROM term_page WHERE term_id=");
str_sql.append(term_id);
s.executeUpdate (str_sql.toString());
}catch(SQLException ex) {
System.err.println("SQLException (wikidf TermPage.java deleteByTermID()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
/** Inserts/updates term (term_id) frequency in document (page_id) in
* the table term_page.<br><br>
*
* SQL example:
* INSERT INTO term_page (term_id,page_id,term_freq) VALUES (1,2,7);
* UPDATE term_page SET term_freq=7 WHERE term_id=1 AND page_id=2;
*
* @param new_term_freq new value of the term frequency in the document
*/
public static void updateTermFreq ( java.sql.Connection conn,
int term_id,
int page_id,
int new_term_freq) {
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
int term_freq_cur = -1;
try
{
s = conn.createStatement ();
term_freq_cur = getTermFreqInDocument(conn, term_id, page_id);
if (term_freq_cur == 0)
{
str_sql.append("INSERT INTO term_page (term_id,page_id,term_freq) VALUES (");
str_sql.append(term_id);
str_sql.append(",");
str_sql.append(page_id);
str_sql.append(",");
str_sql.append(new_term_freq);
str_sql.append(")");
} else // UPDATE term_page SET term_freq=7 WHERE term_id=1 AND page_id=2;
{
str_sql.append("UPDATE term_page SET term_freq=");
str_sql.append(new_term_freq);
str_sql.append(" WHERE term_id=");
str_sql.append(term_id);
str_sql.append(" AND page_id=");
str_sql.append(page_id);
}
s.executeUpdate (str_sql.toString());
}catch(SQLException ex) {
System.err.println("SQLException (wikidf Term.java incDocFreq()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
/** Intersects page titles of the two lists, term frequencies are summarized.
* @return common items of two lists
*/
public static List<TermPage> intersectPageTitles (List<TermPage> l1, List<TermPage> l2) {
List<TermPage> result = new ArrayList<TermPage>();
Map<Integer,TermPage> m_page_id_to_l1 = TermPage.createMapPageIdToTermPage(l1);
for(TermPage tp2:l2) {
int id2 = tp2.getPageID();
if(m_page_id_to_l1.containsKey(id2)) {
int term_freq1 = m_page_id_to_l1.get(id2).getTermFreq();
int term_freq2 = tp2. getTermFreq();
tp2.setTermFreq( term_freq1 + term_freq2 );
result.add(tp2);
}
}
return result;
}
/** Creates map from terms' identifiers to terms */
public static Map<Integer, Term> createMapIdTerm (List<TermPage> tp_list) {
if(null == tp_list || 0 == tp_list.size())
return NULL_INTEGERTERM_MAP;
Map<Integer, Term> m = new HashMap<Integer, Term> ();
for(TermPage tp:tp_list)
m.put(tp.getTermID(), tp.getTerm());
return m;
}
/** Creates map from pages' identifiers to these pages */
public static Map<Integer, Page> createMapIdPage (List<TermPage> tp_list) {
if(null == tp_list || 0 == tp_list.size())
return NULL_INTEGERPAGE_MAP;
Map<Integer, Page> m = new HashMap<Integer, Page> ();
for(TermPage tp:tp_list)
m.put(tp.getPageID(), tp.getPage());
return m;
}
/** Creates map from pages' identifiers to these TermPages */
public static Map<Integer, TermPage> createMapPageIdToTermPage (List<TermPage> tp_list) {
if(null == tp_list || 0 == tp_list.size())
return NULL_INTEGER_TERMPAGE_MAP;
Map<Integer, TermPage> m = new HashMap<Integer, TermPage> ();
for(TermPage tp:tp_list)
m.put(tp.getPageID(), tp);
return m;
}
/** Calculates TF*IDF for the terms of this page, writes to ->tf_idf. */
protected static void calcTF_IDF (List<TermPage> tp_list, int n_total_pages) {
for(TermPage tp:tp_list) {
double idf = Math.log((double)(n_total_pages) / tp.getTerm().getDocFreq());
tp.tf_idf = tp.term_freq * idf;
}
}
/** Ordering: the items with highest value of TF*IDF will be in the begin of list */
static final Comparator<TermPage> TF_IDF_ORDER = new Comparator<TermPage>() {
public int compare(TermPage n1, TermPage n2) {
if (n1.tf_idf > n2.tf_idf)
return -1;
return 1;
}
};
/** Ordering: the items with highest value of TF will be in the begin of list */
public static final Comparator<TermPage> TF_ORDER = new Comparator<TermPage>() {
public int compare(TermPage n1, TermPage n2) {
if (n1.term_freq > n2.term_freq)
return -1;
return 1;
}
};
}