package kea.vocab;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Set;
import java.util.Vector;
import org.apache.commons.dbcp.ConnectionFactory;
import org.apache.commons.dbcp.DriverManagerConnectionFactory;
import org.apache.commons.dbcp.PoolableConnectionFactory;
import org.apache.commons.dbcp.PoolingDriver;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.pool.ObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import org.openrdf.concepts.skos.core.Concept;
import org.openrdf.elmo.ElmoModule;
import org.openrdf.elmo.sesame.SesameManager;
import org.openrdf.elmo.sesame.SesameManagerFactory;
import org.openrdf.repository.Repository;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.nativerdf.NativeStore;
import org.perf4j.StopWatch;
import org.perf4j.log4j.Log4JStopWatch;
import edu.unc.ils.mrc.hive.api.SKOSScheme;
import edu.unc.ils.mrc.hive2.api.HiveVocabulary;
import edu.unc.ils.mrc.hive2.api.impl.HiveH2IndexImpl;
import edu.unc.ils.mrc.hive2.api.impl.HiveVocabularyImpl;
import kea.stemmers.PorterStemmer;
import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;
import kea.stopwords.StopwordsEnglish;
/**
* Vocabulary implementation backed by an embedded H2 database
* http://www.h2database.com/
*
* The database table structure is identical to the default Jena-based
* Vocabulary implementation supplied with the KEA++ distribution.
*
* Like the Sesame-based Vocabulary implementation developed for HIVE,
* the H2 implementation reads concepts from a Sesame database, loaded
* from original SKOS RDF/XML. This is actually likely unnecessary,
* but in place today to replicate existing behavior as closely as possible.
*
* Insert directly into H2 for large vocabularies is slow. Instead,
* temporary delimited files are created and bulk-loaded using CSVREAD.
*
*/
public class VocabularyH2 extends Vocabulary
{
private static final Log logger = LogFactory.getLog(VocabularyH2.class);
private static final long serialVersionUID = 7089304477568443576L;
private SesameManager manager;
String name;
/* OutputStreamWriter used during H2 database initialization */
OutputStreamWriter vocabularyEN;
OutputStreamWriter vocabularyENrev;
OutputStreamWriter vocabularyUSE;
OutputStreamWriter vocabularyREL;
File h2;
HiveH2IndexImpl h2Index = null;
/**
* Constructs a VocabularyH2 instance
* @param h2path Path to the H2 database for the current vocabulary
* @param documentLanguage Language (not currently used)
* @param manager SesameManager for the current vocabulary
* @throws ClassNotFoundException
* @throws SQLException
*/
public VocabularyH2(String name, String basePath, String documentLanguage, SesameManager manager)
throws ClassNotFoundException, SQLException
{
super(documentLanguage);
this.manager = manager;
this.name = name;
HiveVocabularyImpl hv = HiveVocabularyImpl.getInstance(basePath, name);
h2Index = (HiveH2IndexImpl)hv.getH2Index();
}
public VocabularyH2(SKOSScheme scheme, String documentLanguage)
throws ClassNotFoundException, SQLException
{
super(documentLanguage);
this.manager = scheme.getManager();
this.name = scheme.getName();
HiveVocabularyImpl hv = (HiveVocabularyImpl)scheme.getHiveVocabulary();
if (hv != null)
h2Index = (HiveH2IndexImpl)hv.getH2Index();
setStopwords(new StopwordsEnglish(scheme.getStopwordsPath()));
try
{
Class cls = Class.forName(scheme.getKeaStemmerClass());
Stemmer stemmer = (Stemmer)cls.newInstance();
setStemmer(stemmer);
} catch (Exception e) {
logger.error("Error instantiating stemmer: " + e);
setStemmer(new PorterStemmer());
}
}
/**
* Returns a connection from the pool
* @return
* @throws Exception
*/
protected Connection getConnection() throws SQLException {
logger.trace("getConnection()");
return h2Index.getConnection();
}
@Override
public void initialize()
{
try
{
if (!exists()) {
buildSKOS();
} else {
logger.info("H2 database exists, skipping H2 initialization");
}
} catch (Exception e) {
logger.error(e);
}
}
protected boolean exists() {
if (!h2Index.exists())
return false;
else
{
Connection con = null;
try {
con = getConnection();
DatabaseMetaData dm = con.getMetaData();
ResultSet rs = dm.getTables(null, null, "VOCABULARY_EN", null);
if (rs.next())
return true;
else
return false;
} catch (SQLException e) {
} finally {
if (con != null) {
try {
con.close();
} catch (Exception e) { }
}
}
}
return false;
}
/**
* Initializes the H2 databases for the current vocabulary from an
* existing Sesame store.
*/
@Override
public void buildSKOS() throws Exception {
StopWatch stopwatch = new Log4JStopWatch();
logger.info("buildSKOS");
// Temporary files used to store KEA++ maps prior to import into H2
File fileEN = File.createTempFile("vocabularyEN", null);
File fileENrev = File.createTempFile("vocabularyENrev", null);
File fileUSE = File.createTempFile("vocabularyUSE", null);
File fileREL = File.createTempFile("vocabularyREL", null);
boolean hasRelated = false;
boolean hasAltLabels = false;
try {
logger.info("Creating temp file " + fileEN.getAbsolutePath());
logger.info("Creating temp file " + fileENrev.getAbsolutePath());
logger.info("Creating temp file " + fileUSE.getAbsolutePath());
logger.info("Creating temp file " + fileREL.getAbsolutePath());
vocabularyEN = new OutputStreamWriter(new FileOutputStream(fileEN), "UTF-8");
vocabularyENrev = new OutputStreamWriter(new FileOutputStream(fileENrev), "UTF-8");
vocabularyUSE = new OutputStreamWriter(new FileOutputStream(fileUSE), "UTF-8");
vocabularyREL = new OutputStreamWriter(new FileOutputStream(fileREL), "UTF-8");
int count = 1;
// Iterate through all concepts in the vocabulary and write
// to temporary file
for (Concept concept : manager.findAll(Concept.class)) {
String uri = concept.getQName().getNamespaceURI()
+ concept.getQName().getLocalPart();
logger.debug("Adding concept " + uri);
String preferredLabel = concept.getSkosPrefLabel();
String pseudoPhrase = pseudoPhrase(preferredLabel);
if (pseudoPhrase == null) {
pseudoPhrase = preferredLabel;
}
if (pseudoPhrase.length() > 1) {
addConcept(uri, pseudoPhrase, preferredLabel);
}
Set<String> altLabels = concept.getSkosAltLabels();
for (String altLabel : altLabels) {
addNonDescriptor(count, uri, altLabel);
count++;
}
if (altLabels.size() > 0)
hasAltLabels = true;
String uriBroader = "";
Set<Concept> broaders = concept.getSkosBroaders();
try
{
for (Concept b : broaders) {
uriBroader = b.getQName().getNamespaceURI()
+ b.getQName().getLocalPart();
addBroader(uri, uriBroader);
}
} catch (Exception e) {
logger.error("Failed to get broader:" + uriBroader + "for (" + concept.getQName() + ")", e);
}
String uriNarrower = "";
Set<Concept> narrowers = concept.getSkosNarrowers();
try
{
for (Concept n : narrowers) {
uriNarrower = n.getQName().getNamespaceURI()
+ n.getQName().getLocalPart();
addNarrower(uri, uriNarrower);
}
} catch (Exception e) {
logger.error("Failed to get narrower:" + uriNarrower, e);
}
String uriRelated = "";
Set<Concept> related = concept.getSkosRelated();
try
{
for (Concept r: related) {
uriRelated = r.getQName().getNamespaceURI()
+ r.getQName().getLocalPart();
addRelated(uri, uriRelated);
}
} catch (Exception e) {
logger.error("Failed to get related:" + uriRelated + "," + uri, e);
}
if (related.size() > 0)
hasRelated = true;
}
} catch (Exception e) {
logger.error(e);
e.printStackTrace();
}
// Close the writers
vocabularyEN.close();
vocabularyENrev.close();
vocabularyREL.close();
vocabularyUSE.close();
logger.info("Importing to H2");
Connection con = null;
Statement s = null;
try {
con = getConnection();
s = con.createStatement();
StopWatch stopWatch = new Log4JStopWatch();
// Bulk load KEA++ relations from temporary files
s.execute("CREATE TABLE vocabulary_en (id varchar(512) , value varchar(1024)) AS SELECT * FROM CSVREAD('" + fileEN.getAbsolutePath() + "',null, 'UTF-8', '|');");
s.execute("CREATE INDEX idx_kea_1 on vocabulary_en(id);");
s.execute("CREATE TABLE vocabulary_enrev (id varchar(512) , value varchar(1024)) AS SELECT * FROM CSVREAD('" + fileENrev.getAbsolutePath() + "',null, 'UTF-8', '|');");
s.execute("CREATE INDEX idx_kea_2 on vocabulary_enrev(id);");
if (hasRelated)
s.execute("CREATE TABLE vocabulary_rel (id varchar(512), value varchar(1024), relation varchar(20)) AS SELECT * FROM CSVREAD('" + fileREL.getAbsolutePath() + "',null, 'UTF-8', '|');");
else
s.execute("CREATE TABLE vocabulary_rel (id varchar(512), value varchar(1024), relation varchar(20));");
s.execute("CREATE INDEX idx_kea_3 on vocabulary_rel(id);");
if (hasAltLabels)
s.execute("CREATE TABLE vocabulary_use ( id varchar(512) , value varchar(1024)) AS SELECT * FROM CSVREAD('" + fileUSE.getAbsolutePath() + "',null, 'UTF-8', '|');");
else
s.execute("CREATE TABLE vocabulary_use ( id varchar(512) , value varchar(1024));");
s.execute("CREATE INDEX idx_kea_4 on vocabulary_use(id);");
stopWatch.lap("H2 Created");
} catch (Exception e) {
logger.error(e);
} finally {
if (con != null) {
try {
if (con != null)
con.close();
if (s != null)
s.close();
} catch (Exception e) { }
}
}
// Delete the temporary files
fileEN.delete();
fileENrev.delete();
fileREL.delete();
fileUSE.delete();
stopwatch.lap("BuildSKOS");
}
private void addConcept(String uri, String pseudoPhrase, String preferredLabel)
{
logger.trace("addConcept: " + uri + "," + pseudoPhrase + "," + preferredLabel);
try {
vocabularyEN.write(pseudoPhrase + "|" + uri + "\n");
vocabularyENrev.write(uri + "|" + preferredLabel + "\n");
} catch (IOException e) {
logger.error(e);
}
}
private void addBroader (String uri, String uriBroader)
{
logger.trace("addBroader: " + uri + "," + uriBroader );
try {
vocabularyREL.write(uri + "|" + uriBroader + "|broader\n");
} catch (IOException e) {
logger.error(e);
}
}
private void addNarrower (String uri, String uriNarrower)
{
logger.trace("addNarrower: " + uri + "," + uriNarrower );
try {
vocabularyREL.write(uri + "|" + uriNarrower + "|narrower\n");
} catch (IOException e) {
logger.error(e);
}
}
private void addRelated (String uri, String uriRelated)
{
logger.trace("addRelated: " + uri + "," + uriRelated );
try {
vocabularyREL.write(uri + "|" + uriRelated + "|related\n");
vocabularyREL.write(uriRelated + "|" + uri + "|related\n");
} catch (IOException e) {
logger.error(e);
}
}
private void addNonDescriptor(int count, String uri, String altLabel)
{
logger.trace("addNonDescriptor: " + count + "," + uri + "," + altLabel );
String id_non_descriptor = "d_" + count;
String avterm = pseudoPhrase(altLabel);
try {
if (avterm.length() > 2) {
vocabularyEN.write(avterm + "|" + id_non_descriptor + "\n");
vocabularyENrev.write(id_non_descriptor + "|" + altLabel + "\n");
}
vocabularyUSE.write(id_non_descriptor + "|" + uri + "\n");
} catch (IOException e) {
logger.error(e);
}
}
@Override
public String getID(String phrase)
{
logger.trace("getID: " + phrase );
String pseudo = pseudoPhrase(phrase);
String id = null;
if (pseudo != null) {
Connection con = null;
PreparedStatement ps = null;
PreparedStatement ps2 = null;
try {
String sql = "select value from vocabulary_en where id = ?";
con = getConnection();
ps = con.prepareStatement(sql);
ps.setString(1, pseudo);
ResultSet rs = ps.executeQuery();
while (rs.next())
id = rs.getString(1);
String sql2 = "select value from vocabulary_use where id = ?";
ps2 = con.prepareStatement(sql2);
ps2.setString(1, id);
ResultSet rs2 = ps2.executeQuery();
// TODO: To replicate KEA behavior, use the last entry, sadly
while (rs2.next())
id = rs2.getString(1);
} catch (SQLException e) {
e.printStackTrace();
logger.error(e);
} finally {
try
{
if (con != null)
con.close();
if (ps != null)
ps.close();
if (ps2 != null)
ps2.close();
} catch (SQLException e) {
logger.error(e);
}
}
}
return id;
}
@Override
public String getOrig(String id)
{
logger.trace("getOrig: " + id );
String orig = null;
Connection con = null;
PreparedStatement ps = null;
try {
String sql = "select value from vocabulary_enrev where id = ?";
con = getConnection();
ps = con.prepareStatement(sql);
ps.setString(1, id);
ResultSet rs = ps.executeQuery();
if (rs.next())
orig = rs.getString(1);
} catch (SQLException e) {
logger.error(e);
} finally {
try
{
if (con != null)
con.close();
if (ps != null)
ps.close();
} catch (SQLException e) {
logger.error(e);
}
}
return orig;
}
@Override
public Vector<String> getRelated(String id)
{
logger.trace("getRelated: " + id );
Vector<String> related = new Vector<String>();
Connection con = null;
PreparedStatement ps = null;
try {
String sql = "select value from vocabulary_rel where id = ? and relation = 'related'";
con = getConnection();
ps = con.prepareStatement(sql);
ps.setString(1, id);
ResultSet rs = ps.executeQuery();
while (rs.next()) {
related.add(rs.getString(1));
}
} catch (SQLException e) {
logger.error(e);
} finally {
try
{
if (con != null)
con.close();
if (ps != null)
ps.close();
} catch (SQLException e) {
logger.error(e);
}
}
return related;
}
@Override
public Vector<String> getRelated(String id, String relation)
{
logger.trace("getRelated: " + id + "," + relation);
Vector<String> related = new Vector<String>();
Connection con = null;
PreparedStatement ps = null;
try {
String sql = "select value from vocabulary_rel where id = ? and relation = ?";
con = getConnection();
ps = con.prepareStatement(sql);
ps.setString(1, id);
ps.setString(2, relation);
ResultSet rs = ps.executeQuery();
while (rs.next()) {
related.add(rs.getString(1));
}
} catch (SQLException e) {
logger.error(e);
} finally {
try
{
if (con != null)
con.close();
if (ps != null)
ps.close();
} catch (SQLException e) {
logger.error(e);
}
}
return related;
}
public static void main(String[] args) throws Exception {
//NativeStore store = new NativeStore(new File("/usr/local/hive/hive-data/agrovoc/agrovocStore"));
NativeStore store = new NativeStore(new File("/usr/local/hive/hive-data/mesh/meshStore"));
//NativeStore store = new NativeStore(new File("/usr/local/hive/hive-data/nbii/nbiiStore"));
//NativeStore store = new NativeStore(new File("/usr/local/hive/hive-data/tgn/tgnStore"));
//String h2path = "/usr/local/hive/hive-data/agrovoc/agrovocH2/agrovoc";
String h2path = "/usr/local/hive/hive-data/mesh/meshH2/mesh";
//String h2path = "/usr/local/hive/hive-data/tgn/tgnH2/tgn";
//String h2path = "/usr/local/hive/hive-data/nbii/nbiiH2/nbii";
Repository repository = new SailRepository(store);
repository.initialize();
ElmoModule module = new ElmoModule();
SesameManagerFactory factory = new SesameManagerFactory(module, repository);
// Create a new ElmoManager with default locale
SesameManager manager = factory.createElmoManager();
VocabularyH2 voc = new VocabularyH2("hive", h2path, "en", manager);
Stopwords sw = new StopwordsEnglish("/usr/local/hive/hive-data/agrovoc//agrovocKEA/data/stopwords/stopwords_en.txt");
voc.setStopwords(sw);
voc.setStemmer(new PorterStemmer());
long start = System.currentTimeMillis();
voc.buildSKOS();
long end = System.currentTimeMillis();
long dur = end - start;
System.out.println("Total Time: " + dur);
voc.getID("Universities");
}
@Override
public void buildUSE() throws Exception {
// Not implemented
}
@Override
public void buildREL() throws Exception {
// Not supported
}
}