package kea.vocab;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.Vector;

import org.openrdf.concepts.skos.core.Concept;
import org.openrdf.elmo.sesame.SesameManager;

import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;

/**
 * Builds an index with the content of the controlled vocabulary. Accepts
 * vocabularies as rdf files (SKOS format) and in plain text format:
 * vocabulary_name.en (with "ID TERM" per line) - descriptors &amp; non-descriptors
 * vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)
 * vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ... " per line) See
 * KEA's homepage for more details.
 *
 * @author Olena Medelyan
 * Modifications by Jose R. Perez-Aguera
 */
public class VocabularySesame extends Vocabulary {

	/**
	 * Location of the rdf version of the controlled vocabulary it needs to be
	 * in the SKOS format!
	 */
	public static File SKOS;

	/**
	 * Location of the vocabulary's *.en file containing all terms of the
	 * vocabularies and their ids.
	 */
	public static File EN;

	/**
	 * Location of the vocabulary's *.use file containing ids of non-descriptor
	 * with the corresponding ids of descriptors.
	 */
	public static File USE;

	/**
	 * Location of the vocabulary's *.rel file containing semantically related
	 * terms for each descriptor in the vocabulary.
	 */
	public static File REL;

	// If the type of the semantic relation is required later, a further file
	// (e.g. RT) could carry that information; see the removed buildRT() stub.

	/**
	 * Boolean describing which vocabulary format has been chosen: true if
	 * SKOS, false if text.
	 */
	private boolean useSkos;

	/** <i>Vocabulary</i> index: pseudo phrase -&gt; term id. */
	private HashMap<String, String> VocabularyEN = null;

	/** <i>Vocabulary</i> reverse index: term id -&gt; original term. */
	private HashMap<String, String> VocabularyENrev = null;

	/** <i>Vocabulary</i> non-descriptor id -&gt; descriptor id. */
	private HashMap<String, String> VocabularyUSE = null;

	/** <i>Vocabulary</i> related terms: id -&gt; vector of related ids. */
	private HashMap<String, Vector<String>> VocabularyREL = null;

	/** Relation types: "id1-id2" -&gt; "broader" | "narrower" | "related". */
	private HashMap<String, String> VocabularyRT = null;

	/** Sesame store access used when building from SKOS. */
	private SesameManager manager;

	/**
	 * The document language.
	 * NOTE(review): currently unused — the language filtering of labels is
	 * disabled (it was commented out in the SKOS build); kept for a future
	 * re-activation of that filter.
	 */
	private String m_language;

	/** The default stemmer to be used. */
	private Stemmer m_Stemmer;

	/** The list of stop words to be used. */
	private Stopwords m_Stopwords;

	/**
	 * Vocabulary constructor.
	 *
	 * Given the name of the vocabulary and the format it first checks whether
	 * the VOCABULARIES directory contains the specified files: -
	 * vocabularyName.rdf if skos format is selected - or a set of 3 flat files
	 * starting with vocabularyName and with extensions .en (id term) .use
	 * (non-descriptor \t descriptor) .rel (id \t related_id1 related_id2 ...)
	 * If the required files exist, the vocabulary index is built.
	 *
	 * NOTE(review): the rdf-file existence check is currently disabled — this
	 * subclass reads SKOS data from the Sesame store instead of a file.
	 *
	 * @param vocabularyName
	 *            The name of the vocabulary file (before extension).
	 * @param vocabularyFormat
	 *            The format of the vocabulary (skos or text).
	 */
	public VocabularySesame(String vocabularyName, String vocabularyFormat,
			String documentLanguage, SesameManager manager) {
		super(documentLanguage);
		this.manager = manager;
		if (vocabularyFormat.equals("skos")) {
			useSkos = true;
		}
	}

	/**
	 * Starts initialization of the vocabulary: builds the index either from
	 * the SKOS Sesame store or from the flat text files. Exits the JVM on
	 * failure (kept from the original command-line-tool behaviour).
	 */
	public void initialize() {
		System.out.println("-- Loading the Index...");
		try {
			if (useSkos) {
				buildSKOS();
			} else {
				build();
			}
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
	}

	/**
	 * Set the Stemmer value.
	 *
	 * @param newStemmer
	 *            The new Stemmer value.
	 */
	public void setStemmer(Stemmer newStemmer) {
		this.m_Stemmer = newStemmer;
	}

	/**
	 * Set the M_Stopwords value.
	 *
	 * @param newM_Stopwords
	 *            The new M_Stopwords value.
	 */
	public void setStopwords(Stopwords newM_Stopwords) {
		this.m_Stopwords = newM_Stopwords;
	}

	/**
	 * Builds the vocabulary indexes from the SKOS data held in the Sesame
	 * store: prefLabels become descriptors, altLabels become non-descriptors,
	 * and broader/narrower/related links fill the relation indexes.
	 */
	public void buildSKOS() throws Exception {
		System.out.println("-- Building the Vocabulary index from SKOS Sesame Store");
		VocabularyEN = new HashMap<String, String>();
		VocabularyENrev = new HashMap<String, String>();
		VocabularyUSE = new HashMap<String, String>();
		VocabularyREL = new HashMap<String, Vector<String>>();
		VocabularyRT = new HashMap<String, String>();
		try {
			// counter for synthetic ids ("d_1", "d_2", ...) of non-descriptors
			int count = 1;
			for (Concept concept : manager.findAll(Concept.class)) {
				// id of the concept (Resource), e.g. "c_4828"
				String id = concept.getQName().getNamespaceURI()
						+ concept.getQName().getLocalPart();

				// The prefLabel is the descriptor (preferred term).
				// NOTE(review): language filtering on "label@lang" values was
				// disabled in the original code and remains so.
				String descriptor = concept.getSkosPrefLabel();
				String avterm = pseudoPhrase(descriptor);
				if (avterm == null) {
					avterm = descriptor;
				}
				if (avterm.length() > 1) {
					VocabularyEN.put(avterm, id);
					VocabularyENrev.put(id, descriptor);
				}

				// altLabels are non-descriptors pointing at this descriptor
				Set<String> altLabels = concept.getSkosAltLabels();
				for (String non_descriptor : altLabels) {
					addNonDescriptor(count, id, non_descriptor);
					count++;
				}

				// broader terms: recorded in REL and typed in RT
				Set<Concept> broaders = concept.getSkosBroaders();
				for (Concept b : broaders) {
					String id_broader = b.getQName().getNamespaceURI()
							+ b.getQName().getLocalPart();
					addTypedRelation(id, id_broader, "broader");
				}

				// narrower terms: recorded in REL and typed in RT
				Set<Concept> narrowers = concept.getSkosNarrowers();
				for (Concept n : narrowers) {
					String id_narrower = n.getQName().getNamespaceURI()
							+ n.getQName().getLocalPart();
					addTypedRelation(id, id_narrower, "narrower");
				}

				// related terms: only typed in RT (both directions), not
				// added to REL — kept exactly as in the original
				Set<Concept> related = concept.getSkosRelated();
				for (Concept r : related) {
					String id_related = r.getQName().getNamespaceURI()
							+ r.getQName().getLocalPart();
					VocabularyRT.put(id + "-" + id_related, "related");
					VocabularyRT.put(id_related + "-" + id, "related");
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Appends relatedId to the REL vector of id and records the relation type
	 * in RT under the key "id-relatedId".
	 */
	private void addTypedRelation(String id, String relatedId, String relation) {
		Vector<String> rt = VocabularyREL.get(id);
		if (rt == null) {
			rt = new Vector<String>();
			VocabularyREL.put(id, rt);
		}
		rt.add(relatedId);
		VocabularyRT.put(id + "-" + relatedId, relation);
	}

	/**
	 * Adds a non-descriptor under a synthetic id "d_&lt;count&gt;" and maps
	 * that id to the id of its descriptor.
	 *
	 * NOTE(review): the original also incremented the count parameter locally;
	 * Java passes int by value so that had no effect — the caller manages the
	 * counter. The dead increment has been removed.
	 */
	private void addNonDescriptor(int count, String id_descriptor,
			String non_descriptor) {
		String id_non_descriptor = "d_" + count;
		String avterm = pseudoPhrase(non_descriptor);
		if (avterm.length() > 2) {
			VocabularyEN.put(avterm, id_non_descriptor);
			VocabularyENrev.put(id_non_descriptor, non_descriptor);
		}
		VocabularyUSE.put(id_non_descriptor, id_descriptor);
	}

	/**
	 * Returns the phrase rebuilt without the i-th word and without stopwords.
	 * NOTE(review): a separator space is appended after every kept word except
	 * the one at the last array index — quirky (can leave no space if the last
	 * word is kept after skipped ones) but preserved as-is.
	 */
	public String remove(String[] words, int i) {
		String result = "";
		for (int j = 0; j < words.length; j++) {
			if ((j != i) && (!m_Stopwords.isStopword(words[j]))) {
				result = result + words[j];
				if ((j + 1) != words.length) {
					result = result + " ";
				}
			}
		}
		return result;
	}

	/**
	 * Builds the vocabulary index from the text files ("ID TERM" per line in
	 * the .en file).
	 */
	public void build() throws Exception {
		System.out.println("-- Building the Vocabulary index");
		VocabularyEN = new HashMap<String, String>();
		VocabularyENrev = new HashMap<String, String>();
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(EN)));
			String readline;
			while ((readline = br.readLine()) != null) {
				int i = readline.indexOf(' ');
				String term = readline.substring(i + 1);
				String avterm = pseudoPhrase(term);
				if (avterm.length() > 2) {
					String id = readline.substring(0, i);
					VocabularyEN.put(avterm, id);
					VocabularyENrev.put(id, term);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(br); // fix: the reader was never closed
		}
	}

	/**
	 * Builds the vocabulary index with descriptors/non-descriptors relations
	 * from the .use file (only relevant for the text format).
	 */
	public void buildUSE() throws Exception {
		if (!useSkos) {
			VocabularyUSE = new HashMap<String, String>();
			BufferedReader br = null;
			try {
				br = new BufferedReader(new InputStreamReader(new FileInputStream(USE)));
				String readline;
				while ((readline = br.readLine()) != null) {
					String[] entry = split(readline, "\t");
					// if more than one descriptors for
					// one non-descriptors are used, ignore it!
					// probably just related terms (cf. latest edition of
					// Agrovoc)
					if ((entry[1].indexOf(" ")) == -1) {
						VocabularyUSE.put(entry[0], entry[1]);
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				closeQuietly(br); // fix: the reader was never closed
			}
		}
	}

	/**
	 * Builds the vocabulary index with semantically related terms from the
	 * .rel file (only relevant for the text format).
	 */
	public void buildREL() throws Exception {
		if (!useSkos) {
			System.err
					.println("-- Building the Vocabulary index with related pairs");
			VocabularyREL = new HashMap<String, Vector<String>>();
			BufferedReader br = null;
			try {
				br = new BufferedReader(new InputStreamReader(new FileInputStream(REL)));
				String readline;
				while ((readline = br.readLine()) != null) {
					String[] entry = split(readline, "\t");
					String[] temp = split(entry[1], " ");
					Vector<String> rt = new Vector<String>();
					for (int i = 0; i < temp.length; i++) {
						rt.add(temp[i]);
					}
					VocabularyREL.put(entry[0], rt);
				}
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				closeQuietly(br); // fix: the reader was never closed
			}
		}
	}

	/** Closes a reader, swallowing close-time errors (best effort). */
	private static void closeQuietly(BufferedReader br) {
		if (br != null) {
			try {
				br.close();
			} catch (Exception ignored) {
				// nothing sensible to do if close itself fails
			}
		}
	}

	// A buildRT() method reading a ".pairs" file into VocabularyRT existed
	// here as commented-out code; it might be useful later when the kind of
	// relation (or whether two terms are related at all) becomes important.

	/**
	 * Checks whether a normalized version of a phrase (pseudo phrase) is a
	 * valid vocabulary term.
	 *
	 * @param phrase
	 * @return true if phrase is in the vocabulary
	 */
	public boolean containsEntry(String phrase) {
		return VocabularyEN.containsKey(phrase);
	}

	/**
	 * Given a phrase returns its id in the vocabulary; a non-descriptor id is
	 * resolved to the id of its descriptor.
	 *
	 * @param phrase
	 * @return id of the phrase in the vocabulary index, or null
	 */
	public String getID(String phrase) {
		String pseudo = pseudoPhrase(phrase);
		String id = null;
		if (pseudo != null) {
			id = VocabularyEN.get(pseudo);
			// guard: USE index may not have been built for the text format
			if (VocabularyUSE != null && VocabularyUSE.containsKey(id)) {
				id = VocabularyUSE.get(id);
			}
		}
		return id;
	}

	/**
	 * Given id, gets the original version of vocabulary term.
	 *
	 * @param id
	 * @return original version of the vocabulary term
	 */
	public String getOrig(String id) {
		return VocabularyENrev.get(id);
	}

	/**
	 * Given id of the non-descriptor returns the id of the corresponding
	 * descriptor.
	 *
	 * @param id
	 *            of the non-descriptor
	 * @return id of the descriptor
	 */
	public String getDescriptor(String id) {
		return VocabularyUSE.get(id);
	}

	/**
	 * Given id of a term returns the list with ids of terms related to this
	 * term.
	 *
	 * @param id
	 * @return a vector with ids related to the input id, or null
	 */
	public Vector getRelated(String id) {
		return VocabularyREL.get(id);
	}

	/**
	 * Given an ID of a term gets the list of all IDs of terms that are
	 * semantically related to the given term with a specific relation.
	 *
	 * @param id
	 *            , relation
	 * @return a vector with ids related to the input id by a specified relation
	 */
	public Vector getRelated(String id, String relation) {
		Vector<String> related = new Vector<String>();
		Vector<String> all_related = VocabularyREL.get(id);
		if (all_related != null) {
			for (int d = 0; d < all_related.size(); d++) {
				String rel_id = all_related.elementAt(d);
				String rel = VocabularyRT.get(id + "-" + rel_id);
				if (rel != null) {
					if (rel.equals(relation)) {
						related.add(rel_id);
					}
				} else {
					System.err.println("Problem with " + getOrig(id) + " and "
							+ getOrig(rel_id));
				}
			}
		}
		return related;
	}

	/**
	 * Splits a string str at given character sequence (separator) into an
	 * array.
	 *
	 * NOTE(review): the comparison is per single character, so only
	 * single-character separators work — which is how it is used ("\t", " ").
	 *
	 * @param str
	 *            , separator
	 * @return String array with string parts separated by the separator string
	 */
	public String[] split(String str, String separator) {
		ArrayList<String> lst = new ArrayList<String>();
		String word = "";
		for (int i = 0; i < str.length(); i++) {
			String letter = str.substring(i, i + 1);
			if (!letter.equalsIgnoreCase(separator)) {
				word = word + str.charAt(i);
			} else {
				lst.add(word);
				word = "";
			}
		}
		// fix: original used reference comparison (word != "")
		if (word.length() > 0) {
			lst.add(word);
		}
		return lst.toArray(new String[lst.size()]);
	}

	/**
	 * Generates the pseudo phrase from a string. A pseudo phrase is a version
	 * of a phrase that only contains non-stopwords, which are stemmed and
	 * sorted into alphabetical order.
	 */
	public String pseudoPhrase(String str) {
		str = str.toLowerCase();

		// This is often the case with Mesh Terms,
		// where a term is accompanied by another specifying term
		// e.g. Monocytes/*immunology/microbiology
		// we ignore everything after the "/" symbol.
		if (str.matches(".+?/.+?")) {
			String[] elements = str.split("/");
			str = elements[0];
		}

		// removes scope notes in brackets
		// should be replaced with a cleaner solution !!
		if (str.matches(".+?\\(.+?")) {
			String[] elements = str.split("\\(");
			str = elements[0];
		}

		// Remove some non-alphanumeric characters
		str = str.replace('-', ' ');
		str = str.replace('&', ' ');
		str = str.replaceAll("\\*", "");
		str = str.replaceAll("\\, ", " ");
		str = str.replaceAll("\\. ", " ");
		str = str.replaceAll("\\:", "");
		str = str.trim();

		// Remove stopwords, keeping only the part after an apostrophe
		String[] words = str.split(" ");
		String str_nostop = "";
		for (int i = 0; i < words.length; i++) {
			String word = words[i];
			if (!m_Stopwords.isStopword(word)) {
				if (word.matches(".+?\\'.+?")) {
					String[] elements = word.split("\\'");
					// fix: the original tested elements.length < 1, which is
					// never true (and would throw on elements[1] if it were);
					// the intent is to use the part after the apostrophe
					if (elements.length > 1)
						word = elements[1];
				}
				if (str_nostop.equals("")) {
					str_nostop = word;
				} else {
					str_nostop = str_nostop + " " + word;
				}
			}
		}

		// Stem and alphabetize
		String stemmed = m_Stemmer.stemString(str_nostop);
		String[] pseudophrase = sort(stemmed.split(" "));
		return join(pseudophrase);
	}

	/**
	 * overloaded swap method: exchange 2 locations in an array of Strings.
	 */
	public static void swap(int loc1, int loc2, String[] a) {
		String temp = a[loc1];
		a[loc1] = a[loc2];
		a[loc2] = temp;
	} // end swap

	/**
	 * Sorts an array of Strings into alphabetic order (in place, selection
	 * sort). A String that starts with upper case precedes the otherwise
	 * identical String that has only lower case letters.
	 */
	public static String[] sort(String[] a) {
		int i, j, firstAt;
		for (i = 0; i < a.length - 1; i++) {
			firstAt = i;
			for (j = i + 1; j < a.length; j++) {
				// case-insensitive primary ordering
				if (a[j].toUpperCase().compareTo(a[firstAt].toUpperCase()) < 0) {
					firstAt = j;
				}
				// if identical when converted to all same case
				if (a[j].toUpperCase().compareTo(a[firstAt].toUpperCase()) == 0) {
					// but a[j] precedes when not converted
					if (a[j].compareTo(a[firstAt]) < 0) {
						firstAt = j;
					}
				}
			}
			if (firstAt != i) {
				swap(i, firstAt, a);
			}
		}
		return a;
	} // end method selectionSort
}