package maui.vocab;
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Vector;
import org.apache.commons.dbcp.ConnectionFactory;
import org.apache.commons.dbcp.DriverManagerConnectionFactory;
import org.apache.commons.dbcp.PoolableConnectionFactory;
import org.apache.commons.dbcp.PoolingDriver;
import org.apache.commons.pool.ObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import maui.stemmers.Stemmer;
import maui.stopwords.Stopwords;
/**
 * {@link Vocabulary} implementation that answers lookups from an embedded H2
 * database through a commons-dbcp connection pool.
 *
 * <p>This class only reads from the database. It queries the tables
 * {@code vocabulary_en}, {@code vocabulary_enrev}, {@code vocabulary_use} and
 * {@code vocabulary_rel}, each through an {@code id} / {@code value} column
 * pair ({@code vocabulary_rel} additionally has a {@code relation} column).
 * The tables must already exist; nothing in this class creates or fills them.
 *
 * <p>SQL errors are reported with {@code printStackTrace()} and the affected
 * lookup then returns whatever it had collected so far (typically
 * {@code null} or an empty vector) instead of propagating the exception.
 *
 * @author craig.willis
 */
public class VocabularyH2 implements Vocabulary
{
    /** URL prefix under which commons-dbcp's PoolingDriver exposes registered pools. */
    private static final String POOL_URL_PREFIX = "jdbc:apache:commons:dbcp:";

    private static final String SQL_EN_VALUE_BY_ID =
            "select value from vocabulary_en where id = ?";
    private static final String SQL_USE_VALUE_BY_ID =
            "select value from vocabulary_use where id = ?";
    private static final String SQL_ENREV_ID_BY_VALUE =
            "select id from vocabulary_enrev where value = ?";
    private static final String SQL_ENREV_VALUE_BY_ID =
            "select value from vocabulary_enrev where id = ?";
    private static final String SQL_REL_VALUE_BY_ID_AND_RELATION =
            "select value from vocabulary_rel where id = ? and relation = ?";

    /** Document language. Stored by {@link #setLanguage(String)}; not read inside this class. */
    private String language = "en";

    /** Document encoding. Stored by {@link #setEncoding(String)}; not read inside this class. */
    private String encoding = "UTF-8";

    /** Stemmer applied by {@link #pseudoPhrase(String)}; {@code null} disables stemming. */
    private Stemmer stemmer;

    /** Stopwords dropped by {@link #pseudoPhrase(String)}; {@code null} disables filtering. */
    private Stopwords stopwords;

    /** Normalization to lower case - default true. */
    private boolean toLowerCase = true;

    /** Normalization via alphabetic reordering of the words - default true. */
    private boolean reorder = true;

    /** Debug flag. Stored by {@link #setDebug(boolean)}; not read inside this class. */
    private boolean debugMode = false;

    /** Name of the vocabulary; doubles as H2 database name and as connection pool name. */
    private String vocabularyName = "";

    /**
     * Sets up a connection pool for the H2 database of the given vocabulary.
     *
     * <p>The database URL is
     * {@code "jdbc:h2:" + h2Path + File.separator + vocabularyName}, opened with
     * an empty user name and password. The pool is registered with the
     * commons-dbcp {@code PoolingDriver} under {@code vocabularyName}, which is
     * where {@link #getConnection()} obtains its connections from.
     *
     * @param vocabularyName      name of the vocabulary; used as database name and pool name
     * @param vocabularyDirectory not used by this constructor
     * @param h2Path              directory part of the H2 database path
     * @throws Exception if a driver class cannot be loaded or the pooling driver
     *                   cannot be obtained
     */
    public VocabularyH2(String vocabularyName, String vocabularyDirectory, String h2Path)
            throws Exception {
        this.vocabularyName = vocabularyName;

        // Initialize an H2 connection pool
        String uri = "jdbc:h2:" + h2Path + File.separator + vocabularyName;
        Class.forName("org.h2.Driver");

        ObjectPool connectionPool = new GenericObjectPool(null);
        ConnectionFactory connectionFactory = new DriverManagerConnectionFactory(uri, "", "");
        // The instance is intentionally discarded: in commons-dbcp 1.x the
        // constructor installs itself as the object factory of connectionPool.
        new PoolableConnectionFactory(connectionFactory, connectionPool, null, null, false, true);

        Class.forName("org.apache.commons.dbcp.PoolingDriver");
        PoolingDriver driver = (PoolingDriver) DriverManager.getDriver(POOL_URL_PREFIX);
        driver.registerPool(vocabularyName, connectionPool);
    }

    /** Sets the document language. */
    public void setLanguage(String language) {
        this.language = language;
    }

    /** Sets the document encoding. */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /** Enables or disables lower-casing in {@link #normalizePhrase(String)}. */
    public void setLowerCase(boolean toLowerCase) {
        this.toLowerCase = toLowerCase;
    }

    /** Enables or disables alphabetic word reordering in {@link #pseudoPhrase(String)}. */
    public void setReorder(boolean reorder) {
        this.reorder = reorder;
    }

    /** Sets the stemmer used by {@link #pseudoPhrase(String)}; {@code null} disables stemming. */
    public void setStemmer(Stemmer stemmer) {
        this.stemmer = stemmer;
    }

    /** Sets the debug flag. */
    public void setDebug(boolean debugMode) {
        this.debugMode = debugMode;
    }

    /**
     * Starts initialization of the vocabulary. This implementation does
     * nothing; the connection pool is already set up by the constructor.
     *
     * @throws Exception never thrown by this implementation
     */
    public void initialize() throws Exception {
    }

    /**
     * Set the stopwords class.
     *
     * @param stopwords stopwords used by {@link #pseudoPhrase(String)};
     *                  {@code null} disables stopword filtering
     */
    public void setStopwords(Stopwords stopwords) {
        this.stopwords = stopwords;
    }

    /**
     * Obtains a connection from the pool registered for this vocabulary.
     * The caller is responsible for closing it.
     *
     * @return a pooled connection
     * @throws SQLException if no connection can be obtained
     */
    public Connection getConnection() throws SQLException {
        return DriverManager.getConnection(POOL_URL_PREFIX + vocabularyName);
    }

    /**
     * Returns the id of the given term.
     *
     * <p>First looks up the lower-cased phrase as {@code id} in
     * {@code vocabulary_en}. If that yields no non-null value, looks up the
     * phrase as {@code id} in {@code vocabulary_use}. When a query returns
     * several rows, the last row wins.
     *
     * @param phrase the phrase to look up; may be {@code null}
     * @return term id, or {@code null} if the phrase is {@code null}, nothing
     *         was found, or an SQL error occurred
     */
    public String getID(String phrase)
    {
        if (phrase == null) {
            return null;
        }

        String id = null;
        Connection con = null;
        try {
            con = getConnection();
            id = lastOrNull(queryFirstColumn(con, SQL_EN_VALUE_BY_ID, phrase.toLowerCase()));
            if (id == null) {
                // NOTE(review): unlike the query above, this one uses the phrase
                // without lower-casing it. Kept as-is; confirm whether intended.
                id = lastOrNull(queryFirstColumn(con, SQL_USE_VALUE_BY_ID, phrase));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            close(null, null, con);
        }
        return id;
    }

    /**
     * Returns the id stored for the given label.
     *
     * <p>Looks up the {@code id} whose {@code value} in {@code vocabulary_enrev}
     * equals the label, then replaces it with the {@code value} that
     * {@code vocabulary_use} holds for that id, if there is one. When a query
     * returns several rows, the last row wins.
     *
     * @param prefLabel the label to look up; may be {@code null}
     * @return the resolved id, or {@code null} if the label is {@code null},
     *         unknown, or an SQL error occurred before an id was found
     */
    public String getIDFromPrefLabel(String prefLabel)
    {
        if (prefLabel == null) {
            return null;
        }

        String id = null;
        Connection con = null;
        try {
            con = getConnection();
            id = lastOrNull(queryFirstColumn(con, SQL_ENREV_ID_BY_VALUE, prefLabel));
            // Without an id the second lookup would compare against SQL NULL,
            // which never matches, so skip it.
            if (id != null) {
                List<String> replacements = queryFirstColumn(con, SQL_USE_VALUE_BY_ID, id);
                if (!replacements.isEmpty()) {
                    id = replacements.get(replacements.size() - 1);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            close(null, null, con);
        }
        return id;
    }

    /**
     * Returns all ids stored for an already normalized phrase.
     *
     * <p>Every {@code value} found in {@code vocabulary_en} for the phrase is
     * replaced by the first {@code value} that {@code vocabulary_use} holds
     * for it; values without such an entry are returned unchanged.
     *
     * @param normalized phrase as produced by {@link #normalizePhrase(String)};
     *                   may be {@code null}
     * @return the ids, in query order; empty if the phrase is {@code null} or
     *         unknown; never {@code null}
     */
    public Vector<String> getIDs(String normalized)
    {
        Vector<String> ids = new Vector<String>();
        if (normalized == null) {
            return ids;
        }

        Connection con = null;
        PreparedStatement useLookup = null;
        try {
            con = getConnection();
            List<String> candidates = queryFirstColumn(con, SQL_EN_VALUE_BY_ID, normalized);

            // One prepared statement is reused for all candidates.
            useLookup = con.prepareStatement(SQL_USE_VALUE_BY_ID);
            for (String candidate : candidates) {
                useLookup.setString(1, candidate);
                ResultSet rs = useLookup.executeQuery();
                try {
                    if (rs.next()) {
                        ids.add(rs.getString(1));
                    } else {
                        ids.add(candidate);
                    }
                } finally {
                    close(rs, null, null);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            close(null, useLookup, con);
        }
        return ids;
    }

    /**
     * Checks whether a phrase, once normalized, is a valid vocabulary term.
     *
     * @param phrase the phrase to check; must not be {@code null}
     * @return true if the normalized phrase is in the vocabulary
     */
    public boolean containsNormalizedEntry(String phrase) {
        return getIDs(normalizePhrase(phrase)).size() > 0;
    }

    /**
     * Returns true if a phrase has more than one sense.
     *
     * @param phrase the phrase to check; must not be {@code null}
     * @return false if the phrase has at most one sense
     */
    public boolean isAmbiguous(String phrase) {
        return getIDs(normalizePhrase(phrase)).size() > 1;
    }

    /**
     * Retrieves all possible descriptors for a given phrase.
     *
     * @param phrase the phrase to look up; must not be {@code null}
     * @return a vector of the ids of all senses of the given phrase
     */
    public Vector<String> getSenses(String phrase)
    {
        return getIDs(normalizePhrase(phrase));
    }

    /**
     * Normalizes a phrase into the form used for vocabulary lookups.
     *
     * <p>Steps: optional lower-casing; truncation at the first {@code '/'} or
     * {@code '('}; {@code '-'}, {@code '&'} and {@code '.'} become spaces;
     * {@code '*'} and {@code ':'} are dropped; a space that directly follows
     * another space is dropped; the result is trimmed. If reordering is
     * enabled or a stopword list or stemmer is set, the result is then passed
     * through {@link #pseudoPhrase(String)}.
     *
     * @param phrase the phrase to normalize; must not be {@code null}
     * @return the normalized phrase; if normalization leaves nothing (e.g. the
     *         phrase is a stopword), the cleaned but untrimmed phrase instead
     */
    public String normalizePhrase(String phrase) {
        if (toLowerCase) {
            phrase = phrase.toLowerCase();
        }

        StringBuilder cleaned = new StringBuilder();
        char prev = ' ';
        for (int i = 0; i < phrase.length(); i++) {
            char c = phrase.charAt(i);

            // we ignore everything after the "/" symbol and everything in brackets
            // e.g. Monocytes/*immunology/microbiology -> monocytes
            // e.g. Vanilla (Spice) -> vanilla
            if (c == '/' || c == '(') {
                break;
            }

            if (c == '-' || c == '&' || c == '.') {
                c = ' ';
            }

            if (c == '*' || c == ':') {
                // Dropped, but still remembered as the previous character, so
                // a space right after it is kept.
                prev = c;
                continue;
            }

            if (c != ' ' || prev != ' ') {
                cleaned.append(c);
            }
            prev = c;
        }

        String normalized = cleaned.toString().trim();
        if (reorder || stopwords != null || stemmer != null) {
            normalized = pseudoPhrase(normalized);
        }

        if (normalized.equals("")) {
            // to prevent cases where the term is a stop word (e.g. Back).
            return cleaned.toString();
        }
        return normalized;
    }

    /**
     * Generates the pseudo phrase from a string.
     *
     * <p>The string is split on single spaces. If reordering is enabled the
     * words are sorted first. Then, for each word in turn: it is skipped if it
     * is a stopword, cut at its first apostrophe, and stemmed if a stemmer is
     * set. The surviving words are joined with single spaces.
     *
     * @param str the phrase to convert; must not be {@code null}
     * @return the pseudo phrase, trimmed; empty if no word survives
     */
    public String pseudoPhrase(String str) {
        String[] words = str.split(" ");
        if (reorder) {
            Arrays.sort(words);
        }

        StringBuilder joined = new StringBuilder();
        for (String word : words) {
            if (stopwords != null && stopwords.isStopword(word)) {
                continue;
            }

            int apostr = word.indexOf('\'');
            if (apostr != -1) {
                word = word.substring(0, apostr);
            }

            if (stemmer != null) {
                word = stemmer.stem(word);
            }
            joined.append(word).append(' ');
        }
        return joined.toString().trim();
    }

    /**
     * Returns the term for the given id.
     *
     * @param id id of some phrase in the vocabulary
     * @return the first {@code value} stored in {@code vocabulary_enrev} for
     *         the id, or {@code null} if there is none or an SQL error occurred
     */
    public String getTerm(String id)
    {
        String orig = null;
        Connection con = null;
        try {
            con = getConnection();
            List<String> terms = queryFirstColumn(con, SQL_ENREV_VALUE_BY_ID, id);
            if (!terms.isEmpty()) {
                orig = terms.get(0);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            close(null, null, con);
        }
        return orig;
    }

    /**
     * Given an id of a term, gets the ids of all terms linked to it by the
     * relation {@code "related"}.
     *
     * @param id id of some term in the vocabulary
     * @return a vector with the related ids; empty if there are none or an SQL
     *         error occurred
     */
    public Vector<String> getRelated(String id)
    {
        return fetchRelated(id, "related");
    }

    /**
     * Given an ID of a term gets the list of all IDs of terms
     * that are semantically related to the given term
     * with a specific relation.
     *
     * @param id id of some term in the vocabulary
     * @param relation a given semantic relation
     * @return a vector with ids related to the input id by the specified
     *         relation; empty if there are none or an SQL error occurred
     */
    public Vector<String> getRelated(String id, String relation)
    {
        return fetchRelated(id, relation);
    }

    /** Reads the {@code vocabulary_rel} values stored for an id and a relation. */
    private Vector<String> fetchRelated(String id, String relation)
    {
        Vector<String> related = new Vector<String>();
        Connection con = null;
        try {
            con = getConnection();
            related.addAll(queryFirstColumn(con, SQL_REL_VALUE_BY_ID_AND_RELATION, id, relation));
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            close(null, null, con);
        }
        return related;
    }

    /**
     * Runs a query with string parameters and returns the first column of
     * every row, in result order. The statement and result set are closed
     * before returning; the connection is left open for the caller.
     */
    private static List<String> queryFirstColumn(Connection con, String sql, String... args)
            throws SQLException {
        List<String> values = new ArrayList<String>();
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            ps = con.prepareStatement(sql);
            for (int i = 0; i < args.length; i++) {
                ps.setString(i + 1, args[i]);
            }
            rs = ps.executeQuery();
            while (rs.next()) {
                values.add(rs.getString(1));
            }
        } finally {
            close(rs, ps, null);
        }
        return values;
    }

    /** Returns the last element of the list, or {@code null} if the list is empty. */
    private static String lastOrNull(List<String> values) {
        return values.isEmpty() ? null : values.get(values.size() - 1);
    }

    /**
     * Closes the given JDBC resources in the order result set, statement,
     * connection. Any argument may be {@code null}. A failure to close one
     * resource is printed and does not prevent closing the others.
     */
    private static void close(ResultSet rs, PreparedStatement ps, Connection con) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (con != null) {
            try {
                con.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}