VocabularyH2.java example

Explorer

hive-mrc-master
- doc
  - sampleCode
- hive-core
  - src
  - test
    - edu
      - unc
        ils
        mrc
        hive
        api
        SKOSSchemeTest.java
        SearcherTest.java
        TaggerTest.java
        ir
        lucene
        search
        AutocompleteTest
        AutocompleteTest.java
        tagging
        KEATaggerTest.java
        util
        SimpleCrawlerTest.java
        hive2
        api
        impl
        test
        HiveH2IndexImplTest.java
        HiveLuceneIndexImplTest.java
        HiveVocabularyImplTest.java
- hive-rs
  - src
    - org
      - unc
        hive
        services
        rs
        ConceptsResource.java
        ConfigurationListener.java
        SchemesResource.java
  - test
    - org
      - unc
        hive
        services
        rs
        ConceptsResourceTest.java
        FileIO.java
        SchemesResourceTest.java
- hive-web
  - src
    - org
      - unc
        hive
        client
        ClosablePanel.java
        ConceptBrowser.java
        ConceptBrowserService.java
        ConceptBrowserServiceAsync.java
        ConceptLink.java
        ConceptProxy.java
        HIVEMessages.java
        HomePage.java
        Indexer.java
        IndexerService.java
        IndexerServiceAsync.java
        RecordFormatter.java
        TestVis.java
        server
        ConceptBrowserServiceImpl.java
        FileUpload.java
        IndexerServiceImpl.java
        VocabularyService.java
        services
        ConceptListResource.java
        Main.java
        SKOSResourceApplication.java
        servlet
        AutocompleteServlet.java
        TermSuggestionServlet.java
        sync
        SyncJob.java

package maui.vocab;

import java.io.File;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Vector;

import org.apache.commons.dbcp.ConnectionFactory;
import org.apache.commons.dbcp.DriverManagerConnectionFactory;
import org.apache.commons.dbcp.PoolableConnectionFactory;
import org.apache.commons.dbcp.PoolingDriver;
import org.apache.commons.pool.ObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;

import maui.stemmers.Stemmer;
import maui.stopwords.Stopwords;

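/**
 * Indexes the content of the controlled vocabulary into an embedded H2
 * database. Accepts vocabularies only as rdf files (SKOS format).
 *
 * Note: the methods of this class only query existing tables
 * (vocabulary_en, vocabulary_enrev, vocabulary_use, vocabulary_rel); the
 * code that loads a vocabulary into H2 is not part of this class.
 *
 * @author craig.willis
 */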

public class VocabularyH2 implements Vocabulary
{
	
	/** Document language */
	private String language = "en";

	/** Document encoding */
	private String encoding = "UTF-8";

	/** Default stemmer to be used */
	private Stemmer stemmer;

	/** List of stopwords to be used */
	private Stopwords stopwords;

	/** Normalization to lower case - default true */
	private boolean toLowerCase = true;

	/** Normalization via alphabetic reordering - default true*/
	private boolean reorder = true;
	
	private boolean debugMode = false;

	private String vocabularyName = "";

	/**
	 * Creates a vocabulary backed by an embedded H2 database and registers a
	 * DBCP connection pool for it.
	 *
	 * The H2 JDBC URL is built as <code>"jdbc:h2:" + h2Path + File.separator
	 * + vocabularyName</code>. The pool is registered with the DBCP
	 * PoolingDriver under the name <code>vocabularyName</code>, which is the
	 * name {@link #getConnection()} later uses to look it up. No vocabulary
	 * file is read and no table is created here.
	 *
	 * @param vocabularyName name of the vocabulary; used both as the last
	 *        component of the H2 URL and as the name of the registered pool
	 * @param vocabularyDirectory not used by this constructor
	 * @param h2Path path prefix placed before the vocabulary name in the H2 URL
	 * @throws Exception if a driver class cannot be loaded or the DBCP
	 *         pooling driver cannot be obtained from the DriverManager
	 */
	public VocabularyH2(String vocabularyName, String vocabularyDirectory, String h2Path)
			throws Exception {

		this.vocabularyName = vocabularyName;

		// Load the H2 driver and describe where this vocabulary's database lives.
		Class.forName("org.h2.Driver");
		String h2Url = "jdbc:h2:" + h2Path + File.separator + vocabularyName;
		ConnectionFactory h2Connections = new DriverManagerConnectionFactory(h2Url, "", "");

		// The PoolableConnectionFactory instance is deliberately not kept:
		// presumably (DBCP 1.x) its constructor attaches it to the pool - confirm
		// against the DBCP version in use.
		ObjectPool pool = new GenericObjectPool(null);
		new PoolableConnectionFactory(h2Connections, pool, null, null, false, true);

		// Expose the pool through the DBCP pooling driver under the vocabulary name.
		Class.forName("org.apache.commons.dbcp.PoolingDriver");
		PoolingDriver poolingDriver =
				(PoolingDriver) DriverManager.getDriver("jdbc:apache:commons:dbcp:");
		poolingDriver.registerPool(vocabularyName, pool);
	}

	/**
	 * Sets the document language (the field defaults to "en").
	 * No method in this class currently reads the stored value.
	 *
	 * @param language the language value to store
	 */
	public void setLanguage(String language) {
		this.language = language;
	}

	/**
	 * Sets the document encoding (the field defaults to "UTF-8").
	 * No method in this class currently reads the stored value.
	 *
	 * @param encoding the encoding name to store
	 */
	public void setEncoding(String encoding) {
		this.encoding = encoding;
	}

	/**
	 * Switches lower-casing in {@link #normalizePhrase(String)} on or off
	 * (the field defaults to true).
	 *
	 * @param toLowerCase true to lower-case phrases before normalizing them
	 */
	public void setLowerCase(boolean toLowerCase) {
		this.toLowerCase = toLowerCase;
	}

	/**
	 * Switches alphabetical reordering of words on or off (the field defaults
	 * to true). When true, {@link #pseudoPhrase(String)} sorts the words of a
	 * phrase, and {@link #normalizePhrase(String)} always runs pseudoPhrase.
	 *
	 * @param reorder true to sort the words of a phrase alphabetically
	 */
	public void setReorder(boolean reorder) {
		this.reorder = reorder;
	}

	/**
	 * Sets the stemmer applied to each word in {@link #pseudoPhrase(String)}.
	 *
	 * @param stemmer the stemmer to use; when null, words are not stemmed
	 */
	public void setStemmer(Stemmer stemmer) {
		this.stemmer = stemmer;
	}
	
	/**
	 * Stores the debug flag (the field defaults to false).
	 * No method in this class currently reads the stored value.
	 *
	 * @param debugMode the debug flag to store
	 */
	public void setDebug(boolean debugMode) {
		this.debugMode = debugMode;
	}

	/**
	 * Starts initialization of the vocabulary.
	 *
	 * In this implementation the body is empty, so calling it has no effect;
	 * the connection pool is registered by the constructor instead.
	 *
	 * @throws Exception declared in the signature; the empty body never throws
	 */
	public void initialize() throws Exception {

	}

	/**
	 * Sets the stopwords used by {@link #pseudoPhrase(String)} to drop words
	 * from a phrase.
	 *
	 * @param stopwords the stopword list to use; when null, no word is
	 *        treated as a stopword
	 */
	public void setStopwords(Stopwords stopwords) {
		this.stopwords = stopwords;
	}


    /**
     * Obtains a connection through the DBCP pooling driver, using the pool
     * name this vocabulary was constructed with.
     *
     * The lookup methods of this class close the returned connection when
     * they are done with it.
     *
     * @return a connection for the URL "jdbc:apache:commons:dbcp:" followed
     *         by the vocabulary name
     * @throws SQLException if the DriverManager cannot supply a connection
     */
    public Connection getConnection() throws SQLException {
        final String pooledUrl = "jdbc:apache:commons:dbcp:" + vocabularyName;
        return DriverManager.getConnection(pooledUrl);
    }
    
	/**
	 * Returns the id of the given term.
	 *
	 * The lower-cased phrase is first looked up in the <code>id</code> column
	 * of vocabulary_en. Only if that yields nothing is the phrase (as given,
	 * not lower-cased) looked up in the <code>id</code> column of
	 * vocabulary_use. When a query returns several rows, the value of the
	 * last row wins.
	 *
	 * A SQLException is printed to stderr and not propagated; the method then
	 * returns whatever had been found before the failure (possibly null).
	 *
	 * @param phrase the term to look up; null yields null
	 * @return term id, or null if the phrase is null or no row matches
	 */
    public String getID(String phrase)
    {
        if (phrase == null) {
            return null;
        }

        String id = null;
        Connection con = null;
        PreparedStatement ps = null;
        PreparedStatement ps2 = null;
        ResultSet rs = null;
        ResultSet rs2 = null;
        try {
            con = getConnection();

            ps = con.prepareStatement("select value from vocabulary_en where id = ?");
            // NOTE(review): toLowerCase() uses the default locale; confirm this
            // matches how the vocabulary_en keys were written.
            ps.setString(1, phrase.toLowerCase());
            rs = ps.executeQuery();
            while (rs.next()) {
                id = rs.getString(1);
            }

            if (id == null) {
                ps2 = con.prepareStatement("select value from vocabulary_use where id = ?");
                ps2.setString(1, phrase);
                rs2 = ps2.executeQuery();
                while (rs2.next()) {
                    id = rs2.getString(1);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition (result sets, statements,
            // then the connection), each in its own try so that one failing
            // close() cannot leave the remaining resources open. The connection
            // goes last because closing it hands it back to the pool.
            if (rs2 != null) {
                try {
                    rs2.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps2 != null) {
                try {
                    ps2.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return id;
    }
    
    /**
     * Returns the id associated with a preferred label.
     *
     * The label is matched against the <code>value</code> column of
     * vocabulary_enrev to obtain an id. If vocabulary_use has a row whose
     * <code>id</code> column equals that id, the <code>value</code> of that
     * row is returned instead (presumably the descriptor the entry points
     * to - confirm against the code that fills vocabulary_use). When a query
     * returns several rows, the last row wins.
     *
     * A SQLException is printed to stderr and not propagated; the method then
     * returns whatever had been found before the failure (possibly null).
     *
     * @param prefLabel the preferred label to look up; null yields null
     * @return the resolved id, or null if the label is null or unknown
     */
    public String getIDFromPrefLabel(String prefLabel)
    {
        if (prefLabel == null) {
            return null;
        }

        String id = null;
        Connection con = null;
        PreparedStatement ps = null;
        PreparedStatement ps2 = null;
        ResultSet rs = null;
        ResultSet rs2 = null;
        try {
            con = getConnection();

            ps = con.prepareStatement("select id from vocabulary_enrev where value = ?");
            ps.setString(1, prefLabel);
            rs = ps.executeQuery();
            while (rs.next()) {
                id = rs.getString(1);
            }

            // Without an id there is nothing to resolve: "id = NULL" can never
            // match a row, so the second query is skipped.
            if (id != null) {
                ps2 = con.prepareStatement("select value from vocabulary_use where id = ?");
                ps2.setString(1, id);
                rs2 = ps2.executeQuery();
                while (rs2.next()) {
                    id = rs2.getString(1);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition (result sets, statements,
            // then the connection), each in its own try so that one failing
            // close() cannot leave the remaining resources open. The connection
            // goes last because closing it hands it back to the pool.
            if (rs2 != null) {
                try {
                    rs2.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps2 != null) {
                try {
                    ps2.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return id;
    }

    /**
     * Returns all ids stored for a normalized phrase.
     *
     * Every row of vocabulary_en whose <code>id</code> column equals the
     * normalized phrase contributes one candidate id (its <code>value</code>).
     * Each candidate is then looked up in vocabulary_use: if a row exists,
     * the <code>value</code> of its first row replaces the candidate,
     * otherwise the candidate itself is kept.
     *
     * A SQLException is printed to stderr and not propagated; the ids
     * collected up to that point are returned.
     *
     * @param normalized the normalized phrase, e.g. the result of
     *        {@link #normalizePhrase(String)}; null yields an empty vector
     * @return the resolved ids in query order; never null
     */
    public Vector<String> getIDs(String normalized)
    {
        Vector<String> ids = new Vector<String>();
        if (normalized == null) {
            return ids;
        }

        Connection con = null;
        PreparedStatement ps = null;
        PreparedStatement ps2 = null;
        ResultSet rs = null;
        try {
            con = getConnection();

            ps = con.prepareStatement("select value from vocabulary_en where id = ?");
            ps.setString(1, normalized);
            rs = ps.executeQuery();
            List<String> candidates = new ArrayList<String>();
            while (rs.next()) {
                candidates.add(rs.getString(1));
            }

            ps2 = con.prepareStatement("select value from vocabulary_use where id = ?");
            for (String candidate : candidates) {
                ps2.setString(1, candidate);
                ResultSet rs2 = ps2.executeQuery();
                try {
                    if (rs2.next()) {
                        ids.add(rs2.getString(1));
                    } else {
                        ids.add(candidate);
                    }
                } finally {
                    // One result set per candidate: close it before the next
                    // iteration instead of relying on statement re-execution.
                    rs2.close();
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition (result set, statements,
            // then the connection), each in its own try so that one failing
            // close() cannot leave the remaining resources open. The connection
            // goes last because closing it hands it back to the pool.
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps2 != null) {
                try {
                    ps2.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return ids;
    }


	/**
	 * Tells whether a phrase, once normalized, is a vocabulary term.
	 * The phrase is passed through normalizePhrase and the result is looked
	 * up with getIDs.
	 * @param phrase the phrase to check
	 * @return true if at least one id is found for the normalized phrase
	 */
	public boolean containsNormalizedEntry(String phrase) {
		Vector<String> matches = getIDs(normalizePhrase(phrase));
		return !matches.isEmpty();
	}

	/**
	 * Tells whether a phrase has more than one sense, i.e. whether getIDs
	 * finds more than one id for the normalized phrase.
	 * @param phrase the phrase to check
	 * @return true if two or more ids are found, false otherwise
	 */
	public boolean isAmbiguous(String phrase) {
		String key = normalizePhrase(phrase);
		int senseCount = getIDs(key).size();
		return senseCount >= 2;
	}

	/**
	 * Retrieves the ids of all senses of a phrase: the phrase is normalized
	 * with normalizePhrase and the result is handed to getIDs.
	 * @param phrase the phrase to look up
	 * @return a vector with every id found for the normalized phrase
	 */
	public Vector<String> getSenses(String phrase)
	{
		return getIDs(normalizePhrase(phrase));
	}
	
 
	
	/**
	 * Normalizes a phrase into the form used for vocabulary lookups.
	 *
	 * Steps, in order:
	 * <ol>
	 * <li>lower-case the phrase if lower-casing is enabled;</li>
	 * <li>scan the characters: stop at the first '/' or '(', turn '-', '&amp;'
	 *     and '.' into spaces, drop '*' and ':', and collapse runs of
	 *     spaces;</li>
	 * <li>trim the result;</li>
	 * <li>if reordering is enabled or a stopword list or stemmer is set, run
	 *     the trimmed text through {@link #pseudoPhrase(String)};</li>
	 * <li>if that leaves nothing (e.g. the term is itself a stopword), fall
	 *     back to the trimmed text from step 3.</li>
	 * </ol>
	 *
	 * @param phrase the phrase to normalize; may be null
	 * @return the normalized phrase, or null if phrase is null (getIDs
	 *         treats a null key as "no match")
	 */
	public String normalizePhrase(String phrase) {

		if (phrase == null) {
			return null;
		}

		if (toLowerCase) {
			phrase = phrase.toLowerCase();
		}

		StringBuilder result = new StringBuilder();
		char prev = ' ';
		for (int i = 0; i < phrase.length(); i++) {
			char c = phrase.charAt(i);

			// we ignore everything after the "/" symbol and everything in brackets
			// e.g. Monocytes/*immunology/microbiology -> monocytes
			// e.g. Vanilla (Spice) -> vanilla
			if (c == '/' || c == '(') {
				break;
			}

			if (c == '-' || c == '&' || c == '.') {
				c = ' ';
			}

			if (c == '*' || c == ':') {
				// Dropped from the output but still remembered as the previous
				// character, so a space right after it is kept.
				// NOTE(review): "a * b" therefore keeps two spaces - confirm
				// whether that is intended before changing it.
				prev = c;
				continue;
			}

			// Collapse runs of spaces (and drop leading ones).
			if (c != ' ' || prev != ' ') {
				result.append(c);
			}
			prev = c;
		}

		String cleaned = result.toString().trim();

		String normalized = cleaned;
		if (reorder || stopwords != null || stemmer != null) {
			normalized = pseudoPhrase(cleaned);
		}

		// to prevent cases where the term is a stop word (e.g. Back):
		// fall back to the cleaned, trimmed text rather than the raw buffer,
		// which could still carry a trailing space.
		if (normalized.equals("")) {
			return cleaned;
		}
		return normalized;
	}

	/**
	 * Generates the pseudo phrase from a string.
	 * A pseudo phrase is a version of a phrase that only contains
	 * non-stopwords, which are stemmed and sorted into alphabetical order.
	 *
	 * The string is split on single spaces. If reordering is enabled the
	 * words are sorted first (before stopword removal and stemming). Each
	 * word is then skipped if it is a stopword, cut at its first apostrophe,
	 * stemmed if a stemmer is set, and appended followed by a space.
	 *
	 * @param str the phrase to convert; must not be null
	 * @return the processed words joined by spaces, trimmed; empty if every
	 *         word was dropped
	 */
	public String pseudoPhrase(String str) {
		String[] words = str.split(" ");
		if (reorder) {
			Arrays.sort(words);
		}

		// StringBuilder instead of repeated String concatenation in the loop.
		StringBuilder result = new StringBuilder();
		for (String word : words) {

			if (stopwords != null && stopwords.isStopword(word)) {
				continue;
			}

			// Keep only the part before an apostrophe (e.g. drops "'s").
			int apostr = word.indexOf('\'');
			if (apostr != -1) {
				word = word.substring(0, apostr);
			}

			if (stemmer != null) {
				word = stemmer.stem(word);
			}
			result.append(word).append(' ');
		}
		return result.toString().trim();
	}
    
    /**
     * Returns the term for the given id.
     *
     * Looks the id up in the <code>id</code> column of vocabulary_enrev and
     * returns the <code>value</code> of the first matching row.
     *
     * A SQLException is printed to stderr and not propagated; null is
     * returned in that case.
     *
     * @param id - id of some phrase in the vocabulary
     * @return phrase, i.e. the full form listed in the vocabulary, or null
     *         if no row matches
     */
    public String getTerm(String id)
    {
        String orig = null;

        Connection con = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            con = getConnection();
            ps = con.prepareStatement("select value from vocabulary_enrev where id = ?");
            ps.setString(1, id);
            rs = ps.executeQuery();
            if (rs.next()) {
                orig = rs.getString(1);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition (result set, statement,
            // then the connection), each in its own try so that one failing
            // close() cannot leave the remaining resources open. The connection
            // goes last because closing it hands it back to the pool.
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }

        return orig;
    }


    /**
     * Given an ID of a term gets the list of all IDs of terms linked to it
     * in vocabulary_rel by the relation named "related".
     *
     * Delegates to {@link #getRelated(String, String)} with the relation
     * "related" rather than repeating the same query with the relation
     * hard-coded in the SQL text.
     *
     * @param id - id of some term in the vocabulary
     * @return a vector with the ids related to the input id; empty if none
     */
    public Vector<String> getRelated(String id)
    {
        return getRelated(id, "related");
    }

    /**
     * Given an ID of a term gets the list of all IDs of terms
     * that are semantically related to the given term
     * with a specific relation.
     *
     * Collects the <code>value</code> of every vocabulary_rel row whose
     * <code>id</code> and <code>relation</code> columns equal the arguments.
     *
     * A SQLException is printed to stderr and not propagated; the ids
     * collected up to that point are returned.
     *
     * @param id - id of some term in the vocabulary
     * @param relation - a given semantic relation
     * @return a vector with ids related to the input id by a specified
     *         relation; empty if none, never null
     */
    public Vector<String> getRelated(String id, String relation)
    {
        Vector<String> related = new Vector<String>();

        Connection con = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            con = getConnection();
            ps = con.prepareStatement(
                    "select value from vocabulary_rel where id = ? and relation = ?");
            ps.setString(1, id);
            ps.setString(2, relation);
            rs = ps.executeQuery();
            while (rs.next()) {
                related.add(rs.getString(1));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition (result set, statement,
            // then the connection), each in its own try so that one failing
            // close() cannot leave the remaining resources open. The connection
            // goes last because closing it hands it back to the pool.
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return related;
    }
 
}