package maui.vocab;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Vector;
import java.util.zip.GZIPInputStream;

import maui.stemmers.Stemmer;
import maui.stopwords.Stopwords;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;

/**
 * Builds an index with the content of the controlled vocabulary.
 * Accepts vocabularies as rdf files (SKOS format) and in plain text format:
 * vocabulary_name.en (with "ID TERM" per line) - descriptors &amp; non-descriptors
 * vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)
 * vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ... " per line)
 *
 * @author Olena Medelyan
 */
public class VocabularyJena implements Serializable, Vocabulary {

	private static final long serialVersionUID = 1L;

	/**
	 * Location of the rdf version of the controlled vocabulary;
	 * it needs to be in the SKOS format!
	 * (Instance field, not static: each vocabulary instance owns its
	 * own file locations — the former static fields were shared across
	 * all instances and clobbered each other.)
	 */
	private File SKOS;

	/** Location of the vocabulary's *.en file
	 * containing all terms of the vocabularies and their ids. */
	private File EN;

	/** Location of the vocabulary's *.use file
	 * containing ids of non-descriptor with the corresponding ids of descriptors. */
	private File USE;

	/** Location of the vocabulary's *.rel file
	 * containing semantically related terms for each descriptor in the vocabulary. */
	private File REL;

	/** Either "skos" or "text"; selects which build path {@link #initialize()} runs. */
	private String vocabularyFormat;

	/** index : descriptor (lower-cased) --> id */
	private HashMap<String, String> termIdIndex;

	/** reverse index : id --> descriptor */
	private HashMap<String, String> idTermIndex;

	/** normalized descriptor --> list of all possible meanings (ids) */
	private HashMap<String, Vector<String>> listsOfSenses;

	/** non-descriptor id --> descriptor id */
	private HashMap<String, String> nonDescriptorIndex = null;

	/** id --> list of related ids */
	private HashMap<String, Vector<String>> listsOfRelatedTerms = null;

	/** "id-relatedId" --> relation name (e.g. "broader", "related") */
	private HashMap<String, String> relationIndex = null;

	/** Document language (used to filter @-tagged SKOS labels) */
	private String language = "en";

	/** Document encoding, applied to every vocabulary file read */
	private String encoding = "UTF-8";

	/** Default stemmer to be used */
	private Stemmer stemmer;

	/** List of stopwords to be used */
	private Stopwords stopwords;

	/** Normalization to lower case - default true */
	private boolean toLowerCase = true;

	/** Normalization via alphabetic reordering - default true */
	private boolean reorder = true;

	private boolean debugMode = false;

	// NOTE(review): incremented once per prefLabel statement in ANY language,
	// not only this.language — confirm this is the intended concept count.
	int numConcepts = 0;

	/** @return number of prefLabel statements seen while building the SKOS index */
	public int getSize() {
		return numConcepts;
	}

	/**
	 * Vocabulary constructor.
	 *
	 * Given the name of the vocabulary and the format, it first checks whether
	 * the data/vocabularies directory contains the specified files:<br>
	 * - vocabularyName.rdf if skos format is selected<br>
	 * - or a set of 3 flat txt files starting with vocabularyName and with extensions<br>
	 * <li>.en (id term)
	 * <li>.use (non-descriptor \t descriptor)
	 * <li>.rel (id \t related_id1 related_id2 ...)
	 * If the required files exist, the vocabulary index is built.
	 *
	 * @param vocabularyName The name of the vocabulary file (before extension).
	 * @param vocabularyFormat The format of the vocabulary (skos or text).
	 * @param vocabularyDirectory The directory holding the vocabulary files.
	 * @throws Exception if the format is unsupported or a required file is missing
	 */
	public VocabularyJena(String vocabularyName, String vocabularyFormat,
			String vocabularyDirectory) throws Exception {

		this.vocabularyFormat = vocabularyFormat;

		if (vocabularyFormat.equals("skos")) {
			SKOS = new File(vocabularyDirectory + "/" + vocabularyName + ".rdf.gz");
			if (!SKOS.exists())
				throw new Exception("File " + SKOS.getAbsolutePath() + " not found!");
		} else if (vocabularyFormat.equals("text")) {
			EN = new File(vocabularyDirectory + "/" + vocabularyName + ".en");
			USE = new File(vocabularyDirectory + "/" + vocabularyName + ".use");
			REL = new File(vocabularyDirectory + "/" + vocabularyName + ".rel");
			if (!EN.exists())
				throw new Exception("File " + EN.getAbsolutePath() + " does not exist.");
			if (!USE.exists())
				throw new Exception("File " + USE.getAbsolutePath() + " does not exist.");
			if (!REL.exists())
				throw new Exception("File " + REL.getAbsolutePath() + " does not exist.");
		} else {
			// (fixed: original message was missing the space after the format name)
			throw new Exception(vocabularyFormat
					+ " is an unsupported vocabulary format! Use skos or text");
		}
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setLanguage(java.lang.String)
	 */
	@Override
	public void setLanguage(String language) {
		this.language = language;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setEncoding(java.lang.String)
	 */
	@Override
	public void setEncoding(String encoding) {
		this.encoding = encoding;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setLowerCase(boolean)
	 */
	@Override
	public void setLowerCase(boolean toLowerCase) {
		this.toLowerCase = toLowerCase;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setReorder(boolean)
	 */
	@Override
	public void setReorder(boolean reorder) {
		this.reorder = reorder;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setStemmer(maui.stemmers.Stemmer)
	 */
	@Override
	public void setStemmer(Stemmer stemmer) {
		this.stemmer = stemmer;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setDebug(boolean)
	 */
	@Override
	public void setDebug(boolean debugMode) {
		this.debugMode = debugMode;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#initialize()
	 */
	@Override
	public void initialize() throws Exception {
		if (vocabularyFormat.equals("skos")) {
			buildSKOS();
		} else {
			buildTEXT();
			buildUSE();
			buildREL();
		}
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#setStopwords(maui.stopwords.Stopwords)
	 */
	@Override
	public void setStopwords(Stopwords stopwords) {
		this.stopwords = stopwords;
	}

	/**
	 * Builds the vocabulary indexes from the gzipped SKOS file.
	 * Iterates over every RDF statement, dispatching on the local name of the
	 * predicate: prefLabel fills the descriptor indexes, altLabel/hiddenLabel
	 * fill the non-descriptor indexes, and the hierarchy/association
	 * predicates fill the related-terms indexes.
	 *
	 * @throws Exception on I/O or parsing failure
	 */
	public void buildSKOS() throws Exception {
		if (debugMode) {
			System.err.println("--- Building the Vocabulary index from the SKOS file...");
		}

		termIdIndex = new HashMap<String, String>();
		idTermIndex = new HashMap<String, String>();
		listsOfSenses = new HashMap<String, Vector<String>>();
		nonDescriptorIndex = new HashMap<String, String>();
		listsOfRelatedTerms = new HashMap<String, Vector<String>>();
		relationIndex = new HashMap<String, String>();

		Model model = ModelFactory.createDefaultModel();
		// close the stream after parsing (was leaked in the original)
		InputStream gzipStream = new GZIPInputStream(new FileInputStream(SKOS));
		try {
			model.read(new InputStreamReader(gzipStream, encoding), "");
		} finally {
			gzipStream.close();
		}

		// running counter used to mint ids ("d_<count>") for non-descriptors
		int count = 0;

		// Iterating over all statements in the SKOS file
		StmtIterator iter = model.listStatements();
		while (iter.hasNext()) {
			Statement stmt = iter.nextStatement();

			// id of the concept (Resource), e.g. "c_4828"
			Resource concept = stmt.getSubject();
			String id = concept.getURI();

			// relation or Property of the concept, e.g. "narrower"
			Property property = stmt.getPredicate();
			String relation = property.getLocalName();

			// value of the property, e.g. c_4828 has narrower term "c_4829"
			RDFNode value = stmt.getObject();
			String name = value.toString();

			if (relation.equals("prefLabel")) {

				numConcepts++;
				String descriptor = labelInDocumentLanguage(name);
				if (descriptor == null)
					continue; // label tagged with a different language

				String descriptorNormalized = normalizePhrase(descriptor);
				if (descriptorNormalized.length() >= 1) {
					Vector<String> ids = listsOfSenses.get(descriptorNormalized);
					if (ids == null)
						ids = new Vector<String>();
					ids.add(id);
					listsOfSenses.put(descriptorNormalized, ids);
					termIdIndex.put(descriptor.toLowerCase(), id);
					idTermIndex.put(id, descriptor);
				}

			} else if (relation.equals("altLabel")
					|| relation.equals("hiddenLabel")) {

				String nonDescriptor = labelInDocumentLanguage(name);
				if (nonDescriptor == null)
					continue; // label tagged with a different language

				addNonDescriptor(count, id, nonDescriptor);
				count++;

			} else if (relation.equals("broader")
					|| relation.equals("narrower")
					|| relation.equals("composite")
					|| relation.equals("compositeOf")
					|| relation.equals("hasTopConcept")
					|| relation.equals("related")) {

				String relatedId = name;

				Vector<String> relatedIds = listsOfRelatedTerms.get(id);
				if (relatedIds == null)
					relatedIds = new Vector<String>();
				relatedIds.add(relatedId);
				listsOfRelatedTerms.put(id, relatedIds);

				relationIndex.put(id + "-" + relatedId, relation);
				// "related" is symmetric; the hierarchy relations are not
				if (relation.equals("related")) {
					relationIndex.put(relatedId + "-" + id, relation);
				}
			}
		}

		if (debugMode) {
			System.err.println("--- Statistics about the vocabulary: ");
			System.err.println("\t" + termIdIndex.size() + " terms in total");
			System.err.println("\t" + nonDescriptorIndex.size()
					+ " non-descriptive terms");
			System.err.println("\t" + listsOfRelatedTerms.size()
					+ " terms have related terms");
		}
	}

	/**
	 * Strips an RDF language tag ("label@lang") from a label.
	 *
	 * @param label the raw label, possibly suffixed with "@lang"
	 * @return the bare label if it is untagged or tagged with this
	 *         vocabulary's language; null if it belongs to another language
	 */
	private String labelInDocumentLanguage(String label) {
		int atPosition = label.indexOf('@');
		if (atPosition == -1) {
			return label; // no language tag: accept as-is
		}
		String tag = label.substring(atPosition + 1);
		if (tag.equals(this.language)) {
			return label.substring(0, atPosition);
		}
		return null;
	}

	/**
	 * Registers a non-descriptor under a freshly minted id "d_&lt;count&gt;"
	 * and links it to its descriptor.
	 * (The original also incremented the {@code count} parameter locally,
	 * which had no effect on the caller — removed as dead code; the caller
	 * advances the counter itself.)
	 *
	 * @param count sequence number used to build the non-descriptor id
	 * @param idDescriptor id of the descriptor this non-descriptor maps to
	 * @param nonDescriptor surface form of the non-descriptor
	 */
	private void addNonDescriptor(int count, String idDescriptor,
			String nonDescriptor) {
		String idNonDescriptor = "d_" + count;

		String normalizedNonDescriptor = normalizePhrase(nonDescriptor);
		if (normalizedNonDescriptor.length() >= 1) {
			Vector<String> ids = listsOfSenses.get(normalizedNonDescriptor);
			if (ids == null)
				ids = new Vector<String>();
			ids.add(idNonDescriptor);
			listsOfSenses.put(normalizedNonDescriptor, ids);
		}

		termIdIndex.put(nonDescriptor.toLowerCase(), idNonDescriptor);
		idTermIndex.put(idNonDescriptor, nonDescriptor);
		nonDescriptorIndex.put(idNonDescriptor, idDescriptor);
	}

	/**
	 * Builds the vocabulary index from the text files.
	 * Each line of the *.en file is "ID TERM".
	 *
	 * @throws Exception on I/O failure
	 */
	public void buildTEXT() throws Exception {
		System.err.println("-- Building the Vocabulary index");

		termIdIndex = new HashMap<String, String>();
		idTermIndex = new HashMap<String, String>();

		// use the configured encoding (the original fell back to the
		// platform charset here, inconsistently with buildSKOS)
		BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(EN), encoding));
		try {
			String readline;
			while ((readline = br.readLine()) != null) {
				int i = readline.indexOf(' ');
				String term = readline.substring(i + 1);
				String avterm = normalizePhrase(term);
				// skip terms that normalize to nothing (e.g. pure stopwords)
				if (avterm.length() >= 1) {
					String id = readline.substring(0, i);
					termIdIndex.put(avterm, id);
					idTermIndex.put(id, term);
				}
			}
		} finally {
			br.close();
		}
	}

	/**
	 * Builds the vocabulary index with descriptors/non-descriptors relations.
	 * Each line of the *.use file is "NON_DESCRIPTOR_ID \t DESCRIPTOR_ID".
	 *
	 * @throws Exception on I/O failure
	 */
	public void buildUSE() throws Exception {
		nonDescriptorIndex = new HashMap<String, String>();

		BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(USE), encoding));
		try {
			String readline;
			while ((readline = br.readLine()) != null) {
				String[] entry = readline.split("\t");
				// if more than one descriptor for one non-descriptor is
				// used, ignore it! probably just related terms
				// (cf. latest edition of Agrovoc)
				if (entry[1].indexOf(" ") == -1) {
					nonDescriptorIndex.put(entry[0], entry[1]);
				}
			}
		} finally {
			br.close();
		}
	}

	/**
	 * Builds the vocabulary index with semantically related terms.
	 * Each line of the *.rel file is "ID \t RELATED_ID1 RELATED_ID2 ...".
	 *
	 * @throws Exception on I/O failure
	 */
	public void buildREL() throws Exception {
		System.err.println("-- Building the Vocabulary index with related pairs");

		listsOfRelatedTerms = new HashMap<String, Vector<String>>();

		BufferedReader br = new BufferedReader(new InputStreamReader(
				new FileInputStream(REL), encoding));
		try {
			String readline;
			while ((readline = br.readLine()) != null) {
				String[] entry = readline.split("\t");
				String[] temp = entry[1].split(" ");
				Vector<String> relatedTerms = new Vector<String>();
				for (int i = 0; i < temp.length; i++) {
					relatedTerms.add(temp[i]);
				}
				listsOfRelatedTerms.put(entry[0], relatedTerms);
			}
		} finally {
			br.close();
		}
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#getID(java.lang.String)
	 */
	@Override
	public String getID(String phrase) {
		String id = termIdIndex.get(phrase.toLowerCase());
		if (id != null) {
			// resolve a non-descriptor id to its descriptor id
			if (nonDescriptorIndex.containsKey(id))
				id = nonDescriptorIndex.get(id);
		}
		return id;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#getTerm(java.lang.String)
	 */
	@Override
	public String getTerm(String id) {
		return idTermIndex.get(id);
	}

	/**
	 * Checks whether a normalized phrase
	 * is a valid vocabulary term.
	 *
	 * @param phrase the phrase to check (normalized internally)
	 * @return true if phrase is in the vocabulary
	 */
	public boolean containsNormalizedEntry(String phrase) {
		return listsOfSenses.containsKey(normalizePhrase(phrase));
	}

	/**
	 * Returns true if a phrase has more than one sense.
	 *
	 * @param phrase the phrase to check (normalized internally)
	 * @return false if a phrase has only one sense (or none)
	 */
	public boolean isAmbiguous(String phrase) {
		Vector<String> meanings = listsOfSenses.get(normalizePhrase(phrase));
		if (meanings == null || meanings.size() == 1) {
			return false;
		}
		return true;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#getSenses(java.lang.String)
	 */
	@Override
	public Vector<String> getSenses(String phrase) {
		String normalized = normalizePhrase(phrase);
		Vector<String> senses = new Vector<String>();
		if (listsOfSenses.containsKey(normalized)) {
			for (String senseId : listsOfSenses.get(normalized)) {
				// 1. retrieve a descriptor if this sense is a non-descriptor
				if (nonDescriptorIndex.containsKey(senseId))
					senseId = nonDescriptorIndex.get(senseId);
				senses.add(senseId);
			}
		}
		return senses;
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#getRelated(java.lang.String)
	 */
	@Override
	public Vector<String> getRelated(String id) {
		return listsOfRelatedTerms.get(id);
	}

	/* (non-Javadoc)
	 * @see maui.vocab.Vocabulary#getRelated(java.lang.String, java.lang.String)
	 */
	@Override
	public Vector<String> getRelated(String id, String relation) {
		Vector<String> related = new Vector<String>();
		Vector<String> all_related = listsOfRelatedTerms.get(id);
		if (all_related != null) {
			for (String rel_id : all_related) {
				String rel = relationIndex.get(id + "-" + rel_id);
				if (rel != null) {
					if (rel.equals(relation))
						related.add(rel_id);
				}
			}
		}
		return related;
	}

	/**
	 * Generates the pseudo phrase from a string.
	 * A pseudo phrase is a version of a phrase
	 * that only contains non-stopwords,
	 * which are stemmed and sorted into alphabetical order.
	 *
	 * @param phrase the raw phrase
	 * @return the normalized form; falls back to the pre-pseudo-phrase form
	 *         if normalization would yield an empty string
	 */
	public String normalizePhrase(String phrase) {
		if (toLowerCase) {
			phrase = phrase.toLowerCase();
		}

		StringBuffer result = new StringBuffer();
		char prev = ' ';
		int i = 0;
		while (i < phrase.length()) {
			char c = phrase.charAt(i);

			// we ignore everything after the "/" symbol and everything in brackets
			// e.g. Monocytes/*immunology/microbiology -> monocytes
			// e.g. Vanilla (Spice) -> vanilla
			if (c == '/' || c == '(')
				break;

			if (c == '-' || c == '&' || c == '.')
				c = ' ';

			// drop these characters entirely, but remember them as "prev"
			// so that a following space is still collapsed
			if (c == '*' || c == ':') {
				prev = c;
				i++;
				continue;
			}

			// collapse runs of spaces
			if (c != ' ' || prev != ' ')
				result.append(c);

			prev = c;
			i++;
		}

		phrase = result.toString().trim();

		if (reorder || stopwords != null || stemmer != null) {
			phrase = pseudoPhrase(phrase);
		}
		if (phrase.equals("")) {
			// to prevent cases where the term is a stop word (e.g. Back).
			return result.toString();
		} else {
			return phrase;
		}
	}

	/**
	 * Generates the pseudo phrase from a string.
	 * A pseudo phrase is a version of a phrase
	 * that only contains non-stopwords,
	 * which are stemmed and sorted into alphabetical order.
	 *
	 * @param str the phrase to transform
	 * @return space-joined, optionally sorted/stemmed, stopword-free words
	 */
	public String pseudoPhrase(String str) {
		StringBuffer result = new StringBuffer();

		String[] words = str.split(" ");
		if (reorder) {
			Arrays.sort(words);
		}

		for (String word : words) {
			if (stopwords != null && stopwords.isStopword(word)) {
				continue;
			}

			// cut off possessive endings, e.g. "maori's" -> "maori"
			int apostr = word.indexOf('\'');
			if (apostr != -1) {
				word = word.substring(0, apostr);
			}

			if (stemmer != null) {
				word = stemmer.stem(word);
			}
			result.append(word).append(' ');
		}
		return result.toString().trim();
	}

	@Override
	public String getIDFromPrefLabel(String prefLabel) {
		// NOTE(review): termIdIndex keys are lower-cased when inserted, but
		// this lookup does not lower-case prefLabel — confirm callers always
		// pass lower-cased labels, otherwise this misses mixed-case entries.
		return termIdIndex.get(prefLabel);
	}
}