/*
* Term.java
*
* Copyright (c) 2005-2008 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.sql_idf;
//import wikipedia.sql.Connect;
import java.util.*;
import java.sql.*;
/** Routines to work with the table term in wiki idf database.
*/
public class Term {
/** term identifier */
private int term_id;
/** lemma (term) */
private String lemma;
/** documents frequency - number of documents which contain the term */
private int doc_freq;
/** corpus frequency - the frequency of the term in the corpus */
private int corpus_freq;
private final static int LEMMA_LEN_MAX = 253;
private final static int LEMMA_LEN_WARNING = 100;
/** Gets term identifier */
public int getTermID() { return term_id; }
/** Sets term identifier */
public void setTermID(int _term_id)
{ term_id = _term_id; }
/** Gets lemma */
public String getLemma(){ return lemma; }
/** Sets lemma */
public void setLemma( String _lemma)
{ lemma = _lemma; }
/** Gets the document frequency - number of documents which contain the term */
public int getDocFreq() { return doc_freq;}
/** Sets the document frequency - number of documents which contain the term */
public void setDocFreq(int _doc_freq) {
doc_freq = _doc_freq;
}
/** Stores (updates) the fields 'doc_freq' and 'corpus_freq' to the table 'term',
* but the row with the 'lemma' should exist in the 'term' table already.
*/
public void storeToDatabase(java.sql.Connection conn) {
update (conn, lemma, doc_freq, corpus_freq);
}
/** Gets the frequency of the term in the corpus*/
public int getCorpusFreq() { return corpus_freq;}
/** Sets the frequency of the term in the corpus */
public void setCorpusFreq(int _corpus_freq) { corpus_freq = _corpus_freq;}
/** Selects data from the table term by the lemma.
*
* SQL:
* SELECT term_id,doc_freq,corpus_freq FROM term WHERE lemma="test";
*
* @param lemma lemma
* @return Term object with initialized fields: term_id, doc_freq, corpus_freq
*/
public static Term get (java.sql.Connection conn,String lemma) {
Term result = null;
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try {
s = conn.createStatement ();
str_sql.append("SELECT term_id,doc_freq,corpus_freq FROM term WHERE lemma=\"");
str_sql.append(lemma);
str_sql.append("\"");
s.executeQuery (str_sql.toString());
rs = s.getResultSet ();
if (rs.next ())
{
result = new Term();
result.term_id = rs.getInt("term_id");
result.doc_freq = rs.getInt("doc_freq");
result.corpus_freq = rs.getInt("corpus_freq");
result.lemma = lemma;
}
} catch(SQLException ex) {
System.err.println("SQLException (wikidf Term.java get()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
return result;
}
/** Selects data into tp_list (lemma of term and number of documents with term)
* from the table term by the term_id.
*<PRE>
* SQL:
* SELECT term_id, lemma, doc_freq, corpus_freq FROM term
* WHERE term.term_id IN (559092, 607182, 515136)
*</PRE>
* @param lemma lemma
* @return Term object with initialized fields: term_id, doc_freq, corpus_freq
* @see Page.fillPages similar function (the contrary)
*/
public static void fillTerms (java.sql.Connection conn,List<TermPage> tp_list) {
if(null == tp_list || 0 == tp_list.size())
return;
Statement s = null;
ResultSet rs= null;
StringBuffer sb = new StringBuffer();
try {
s = conn.createStatement ();
sb.append("SELECT term_id,lemma,doc_freq,corpus_freq FROM term WHERE term_id IN (");
// Prepare SQL IN(...) via tp_list[].term.term_id
int len = tp_list.size()-1;
for (int i=0; i<len; i++) {
sb.append(tp_list.get(i).getTermID());
sb.append(",");
}
sb.append(tp_list.get(len).getTermID()); // skip last comma
sb.append(")");
s.executeQuery (sb.toString());
rs = s.getResultSet ();
if (rs.next ())
{
Map<Integer,Term> m_id_to_term = TermPage.createMapIdTerm(tp_list);
do {
Term t = m_id_to_term.get(rs.getInt("term_id"));
t.setDocFreq (rs.getInt("doc_freq"));
t.setCorpusFreq (rs.getInt("corpus_freq"));
t.setLemma (rs.getString("lemma"));
} while (rs.next());
}
} catch(SQLException ex) {
System.err.println("SQLException (wikidf Term.java fillTerms()):: sql='" + sb.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
/** Increments lemma document frequency, since lemmas were found in the
* wiki-document titled page_title.
* (pl. 'lemmata', since the map lemma_tf contains several lemmas)
*
* @param lemma_tf map from lemma to term frequency (in document)
*
* @param doc_freq_max the database limit for the table term_page.
* If number of documents which contain a lemma (e.g. lemma "website")
* > doc_freq_max,
* then
* (1) table term_page.(term_id and page_id) contains only ID of first
* doc_freq_max documents which contain the term;
* (2) term.doc_freq increments in any case,
* i.e. term.doc_freq can be > doc_freq_max
*/
public static void incLemmataDocFreq( java.sql.Connection connect_idf,
//List<String> list_lemma,
Map<String, Integer> lemma_tf,
String page_title,
int doc_freq_max)
{
if(null == lemma_tf || 0 == lemma_tf.keySet().size())
return;
// count number of words in the article
int word_count = 0;
for(Integer n:lemma_tf.values()) {
word_count += n;
}
Page p = Page.getOrInsert(connect_idf, page_title, word_count);
p.storeWordCount(connect_idf, word_count);
for(String l:lemma_tf.keySet()) {
incLemmaDocFreq(connect_idf, l, lemma_tf.get(l), p, doc_freq_max);
}
}
/** Increments lemma document frequency (+1) and sets term frequency for
* the document (=inc_corpus_freq), since lemma was found the same number in
* the wiki-document titled page_title, corpus_freq += inc_corpus_freq.<br><br>
*
* 1. Adds page_title to the table page (if it's not exist).<br>
* 2. Adds lemma to the table term (if it's not exist).<br>
* 3. Adds (term_id of lemma, page_id of page_title) to the table term_page,
* if number of (term_id, *) < doc_freq_max.
*
* @param inc_corpus_freq the frequency of the term in the corpus should be
* incremented by this value
*/
public static void incLemmaDocFreq( java.sql.Connection connect_idf,
String lemma, int inc_corpus_freq,
Page page,
int doc_freq_max)
{
if (lemma.length() > LEMMA_LEN_WARNING) {
System.out.println("Warning: wikidf.Term.incLemmaDocFreq() page_title="+
page.getPageTitle() + " has very long ("+lemma.length()+" characters) lemma="+lemma);
if (lemma.length() > LEMMA_LEN_MAX) {
return;
}
}
Term t = get(connect_idf, lemma);
int doc_freq = incDocFreq(connect_idf, t, lemma, 1, inc_corpus_freq); // inc_doc_freq=1, +1 document
if( doc_freq <= doc_freq_max && null != page) {
if(null == t) {
t = get(connect_idf, lemma);
}
//??? int inc_corpus_freq_prev = TermPage.getTermFreqInDocument(connect_idf, t.term_id, page_id);
//??? TermPage.updateTermFreq( connect_idf, t.term_id, page_id, inc_corpus_freq_prev + inc_corpus_freq);
TermPage.updateTermFreq(connect_idf, t.term_id, page.getPageID(), inc_corpus_freq);
}
}
/** Iserts/updates term record (=lemma, +=inc_doc_freq++, +=inc_corpus_freq). <br><br>
* 1) Increments number of documents with term (+inc_doc_freq). <br>
* 2) Increments number of frequency of the term in corpus (+inc_corpus_freq).<br><br>
*
* SQL example:
* INSERT INTO term (lemma, doc_freq, corpus_freq) VALUES ("apple", 1, 20);
* UPDATE term SET doc_freq=5,corpus_freq=23 WHERE lemma="apple";
*
* Test database content by
* select * from term,page,term_page WHERE page.page_id=term_page.page_id AND term.term_id=term_page.term_id;
*
* @param lemma the lemma of term (word normalized form)
*
* @param inc_doc_freq increment the number of documents which contain the term
*
* @param inc_corpus_freq increment the frequency of the term in the corpus
*
* @return incremented value of document frequency for the lemma
*/
public static int incDocFreq (java.sql.Connection conn, Term term_cur,
String lemma,
int inc_doc_freq, int inc_corpus_freq) {
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
int _doc_freq = 0;
if (null != term_cur)
{
// UPDATE term SET doc_freq=5,corpus_freq=23 WHERE lemma="apple";
_doc_freq = term_cur.doc_freq + inc_doc_freq;
int cf = term_cur.corpus_freq + inc_corpus_freq;
update (conn, lemma, _doc_freq, cf);
} else
{
try // null == term_cur
{
s = conn.createStatement ();
str_sql.append("INSERT INTO term (lemma,doc_freq,corpus_freq) ");
str_sql.append("VALUES (\"");
str_sql.append(lemma);
str_sql.append("\",");
str_sql.append(inc_doc_freq);
str_sql.append(",");
str_sql.append(inc_corpus_freq);
str_sql.append(")");
_doc_freq = 1;
s.executeUpdate (str_sql.toString());
}catch(SQLException ex) {
System.err.println("SQLException (wikidf Term.java incDocFreq()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
return _doc_freq;
}
/** Updates fields 'doc_freq' and 'corpus_freq' of the term identified by lemma.<br><br>
* SQL example:
* UPDATE term SET doc_freq=5,corpus_freq=23 WHERE lemma="apple";
*
* @param lemma the lemma of the term which fields will be updated
* @param doc_freq new value of the number of documents which contain the term
* @param corpus_freq new value of frequency of the term in the corpus
*/
private static void update (java.sql.Connection conn, String lemma,
int doc_freq, int corpus_freq)
{
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try
{
s = conn.createStatement ();
str_sql.append("UPDATE term SET doc_freq=");
str_sql.append(doc_freq);
str_sql.append( ",corpus_freq=");
str_sql.append( corpus_freq);
str_sql.append(" WHERE lemma=\"");
str_sql.append(lemma);
str_sql.append("\"");
s.executeUpdate (str_sql.toString());
}catch(SQLException ex) {
System.err.println("SQLException (wikidf Term.java update()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
/** Deletes records with the lemma from the table term.
*
* SQL example:
* DELETE FROM term WHERE lemma="apple";
*
* @param lemma the lemma of term (word normalized form)
*/
public static void delete (java.sql.Connection conn,String lemma) {
Statement s = null;
ResultSet rs= null;
StringBuffer str_sql = new StringBuffer();
try
{
s = conn.createStatement ();
// DELETE FROM term WHERE lemma="apple";
str_sql.append("DELETE FROM term WHERE lemma=\"");
str_sql.append(lemma);
str_sql.append("\"");
s.executeUpdate (str_sql.toString());
}catch(SQLException ex) {
System.err.println("SQLException (wikidf Term.java delete()):: sql='" + str_sql.toString() + "' " + ex.getMessage());
} finally {
if (rs != null) { try { rs.close(); } catch (SQLException sqlEx) { } rs = null; }
if (s != null) { try { s.close(); } catch (SQLException sqlEx) { } s = null; }
}
}
}