Vocabulary.java example

Explorer

hive-mrc-master
- doc
  - sampleCode
- hive-core
  - src
  - test
    - edu
      - unc
        ils
        mrc
        hive
        api
        SKOSSchemeTest.java
        SearcherTest.java
        TaggerTest.java
        ir
        lucene
        search
        AutocompleteTest
        AutocompleteTest.java
        tagging
        KEATaggerTest.java
        util
        SimpleCrawlerTest.java
        hive2
        api
        impl
        test
        HiveH2IndexImplTest.java
        HiveLuceneIndexImplTest.java
        HiveVocabularyImplTest.java
- hive-rs
  - src
    - org
      - unc
        hive
        services
        rs
        ConceptsResource.java
        ConfigurationListener.java
        SchemesResource.java
  - test
    - org
      - unc
        hive
        services
        rs
        ConceptsResourceTest.java
        FileIO.java
        SchemesResourceTest.java
- hive-web
  - src
    - org
      - unc
        hive
        client
        ClosablePanel.java
        ConceptBrowser.java
        ConceptBrowserService.java
        ConceptBrowserServiceAsync.java
        ConceptLink.java
        ConceptProxy.java
        HIVEMessages.java
        HomePage.java
        Indexer.java
        IndexerService.java
        IndexerServiceAsync.java
        RecordFormatter.java
        TestVis.java
        server
        ConceptBrowserServiceImpl.java
        FileUpload.java
        IndexerServiceImpl.java
        VocabularyService.java
        services
        ConceptListResource.java
        Main.java
        SKOSResourceApplication.java
        servlet
        AutocompleteServlet.java
        TermSuggestionServlet.java
        sync
        SyncJob.java

package kea.vocab;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;

import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;

/**
 * Base class for a controlled vocabulary.
 * <p>
 * It stores the document language together with the stemmer and the stopword
 * list used for term normalisation, declares the index-building and lookup
 * operations that concrete vocabularies must provide, and offers helpers for
 * turning a phrase into its "pseudo phrase" (stopwords removed, stemmed,
 * words sorted alphabetically).
 */
public abstract class Vocabulary implements Serializable 
{
	private static final long serialVersionUID = -7157202718619833534L;

	/** Matches a term that has at least one character on each side of a "/". */
	private static final Pattern SLASH_QUALIFIED = Pattern.compile(".+?/.+?");

	/** Matches a term that has at least one character on each side of a "(". */
	private static final Pattern BRACKETED = Pattern.compile(".+?\\(.+?");

	/** Matches a word that has at least one character on each side of an apostrophe. */
	private static final Pattern APOSTROPHE = Pattern.compile(".+?\\'.+?");

	/**
	 * Ordering used by {@link #sort(String[])}: strings are compared by their
	 * upper-cased form first; strings that are equal in that form are ordered
	 * by their plain {@link String#compareTo(String)} result.
	 */
	private static final Comparator<String> ALPHABETIC_ORDER = new Comparator<String>() {
		@Override
		public int compare(String left, String right) {
			int byFoldedCase = left.toUpperCase().compareTo(right.toUpperCase());
			if (byFoldedCase != 0) {
				return byFoldedCase;
			}
			return left.compareTo(right);
		}
	};

	/** The document language */
	private String m_language;
	
	/** The default stemmer to be used */
	private Stemmer m_Stemmer;
	
	/** The list of stop words to be used */
	private Stopwords m_Stopwords;
	
	/**
	 * Creates a vocabulary for the given document language.
	 * The stemmer and the stopword list are not set here; use
	 * {@link #setStemmer(Stemmer)} and {@link #setStopwords(Stopwords)}.
	 * 
	 * @param documentLanguage the document language
	 */
	public Vocabulary(String documentLanguage) 
	{
		m_language = documentLanguage;
	}
	
	/**
	 * Initializes the vocabulary.
	 */
	public abstract void initialize();
	
	/**
	 * Builds the vocabulary index from SKOS RDF/XML files.
	 * @throws Exception
	 */
	public abstract void buildSKOS() throws Exception;

	// Building the vocabulary index from text files is currently disabled:
	// public abstract void build() throws Exception;
	
	/**
	 * Builds the vocabulary index with descriptors/non-descriptors relations.
	 * @throws Exception
	 */
	public abstract void buildUSE() throws Exception;
	
	/**
	 * Builds the vocabulary index with semantically related terms.
	 * @throws Exception
	 */
	public abstract void buildREL() throws Exception;
	
	/**
	 * Given a phrase returns its id in the vocabulary.
	 * @param phrase the phrase to look up
	 * @return id of the phrase in the vocabulary index
	 */
	public abstract String getID(String phrase);
	
	/**
	 * Given id, gets the original version of vocabulary term.
	 * @param id id of the term
	 * @return original version of the vocabulary term
	 */
	public abstract String getOrig(String id);
	
	/**
	 * Given id of a term returns the list with ids of terms related to this term.
	 * @param id id of the term
	 * @return a list with ids related to the input id
	 */
	public abstract List<String> getRelated(String id);
	
	/**
	 * Given an ID of a term gets the list of all IDs of terms
	 * that are semantically related to the given term
	 * with a specific relation.
	 * @param id id of the term
	 * @param relation the relation the returned terms must have to the input term
	 * @return a list with ids related to the input id by the specified relation
	 */
	public abstract List<String> getRelated (String id, String relation);
	
	/**
	 * Sets the stemmer used by {@link #pseudoPhrase(String)}.
	 * @param newStemmer the stemmer to use
	 */
	public void setStemmer(Stemmer newStemmer) {	
		this.m_Stemmer = newStemmer;
	}
	
	/**
	 * @return the stemmer currently set, or null if none has been set
	 */
	public Stemmer getStemmer() {
		return m_Stemmer;
	}

	/**
	 * Sets the stopword list used by {@link #remove(String[], int)}
	 * and {@link #pseudoPhrase(String)}.
	 * @param newM_Stopwords the stopword list to use
	 */
	public void setStopwords(Stopwords newM_Stopwords) {	
		this.m_Stopwords = newM_Stopwords;
	}
	
	/**
	 * @return the document language given at construction time
	 */
	public String getLanguage() {
		return m_language;
	}
	
	/**
	 * Joins the given words with single spaces, leaving out the word at
	 * position {@code i} and every stopword.
	 * <p>
	 * When no stopword list has been set, no word is treated as a stopword.
	 * The result never carries a trailing separator, even when the last
	 * words of the array are left out.
	 * 
	 * @param words the words of a phrase
	 * @param i index of the word to leave out; an index outside the array
	 *          leaves out nothing
	 * @return the remaining words separated by single spaces
	 */
	public String remove (String[] words, int i) {
		StringBuilder kept = new StringBuilder();
		boolean first = true;
		for (int j = 0; j < words.length; j++) {
			if (j == i || isStopword(words[j])) {
				continue;
			}
			// Separator goes between kept words only, so dropping the
			// last word cannot leave a dangling space.
			if (!first) {
				kept.append(' ');
			}
			kept.append(words[j]);
			first = false;
		}
		return kept.toString();
	}
	
	/** 
	 * Generates the pseudo phrase from a string.
	 * A pseudo phrase is a version of a phrase
	 * that only contains non-stopwords,
	 * which are stemmed and sorted into alphabetical order.
	 * <p>
	 * When no stopword list has been set, all words are kept.
	 * 
	 * @param str the phrase to convert; may be null
	 * @return the pseudo phrase, or null if {@code str} is null
	 * @throws NullPointerException if no stemmer has been set
	 */
	public String pseudoPhrase(String str) {
		if (str == null) {
			return null;
		}

		String cleaned = cleanTerm(str);

		// Collect the non-stopwords, separated by single spaces.
		StringBuilder content = new StringBuilder();
		for (String token : cleaned.split(" ")) {
			// Runs of spaces (e.g. left behind by replaced punctuation)
			// produce empty tokens; they carry no content.
			if (token.isEmpty() || isStopword(token)) {
				continue;
			}
			String word = token;
			if (APOSTROPHE.matcher(word).matches()) {
				// Keep the segment following the first apostrophe,
				// e.g. "l'eau" becomes "eau".
				String[] parts = word.split("\\'");
				if (parts.length > 1) {
					word = parts[1];
				}
			}
			if (content.length() > 0) {
				content.append(' ');
			}
			content.append(word);
		}

		String stemmed = m_Stemmer.stemString(content.toString());
		String[] pseudophrase = stemmed.split(" ");
		Arrays.sort(pseudophrase);
		return join(pseudophrase);
	}

	/**
	 * Lower-cases a term, cuts off qualifiers and removes punctuation
	 * before the term is split into words.
	 */
	private static String cleanTerm(String term) {
		String cleaned = term.toLowerCase();

		// This is often the case with Mesh Terms,
		// where a term is accompanied by another specifying term
		// e.g. Monocytes/*immunology/microbiology
		// we ignore everything after the "/" symbol.
		if (SLASH_QUALIFIED.matcher(cleaned).matches()) {
			cleaned = cleaned.substring(0, cleaned.indexOf('/'));
		}

		// removes scope notes in brackets
		// should be replaced with a cleaner solution !!
		if (BRACKETED.matcher(cleaned).matches()) {
			cleaned = cleaned.substring(0, cleaned.indexOf('('));
		}

		// Remove some non-alphanumeric characters
		cleaned = cleaned.replace('-', ' ');
		cleaned = cleaned.replace('&', ' ');
		cleaned = cleaned.replace("*", "");
		cleaned = cleaned.replace(", ", " ");
		cleaned = cleaned.replace(". ", " ");
		cleaned = cleaned.replace(":", "");

		return cleaned.trim();
	}

	/**
	 * Tells whether a word is a stopword; without a stopword list no word is.
	 */
	private boolean isStopword(String word) {
		return m_Stopwords != null && m_Stopwords.isStopword(word);
	}
	
	/** 
	 * Joins an array of strings to a single string, separating the
	 * elements with single spaces. No separator is written while the
	 * result is still empty, so leading empty elements do not produce
	 * leading spaces.
	 * 
	 * @param str the strings to join
	 * @return the joined string
	 */
	protected static String join(String[] str) {
		StringBuilder joined = new StringBuilder();
		for (String part : str) {
			// Compare by content (length), never by reference.
			if (joined.length() > 0) {
				joined.append(' ');
			}
			joined.append(part);
		}
		return joined.toString();
	}
	
	/**
	 * overloaded swap method: exchange 2 locations in an array of Strings.
	 * 
	 * @param loc1 index of the first element
	 * @param loc2 index of the second element
	 * @param a the array whose elements are exchanged in place
	 */
	public static void swap(int loc1, int loc2, String[] a) {
		String temp = a[loc1];
		a[loc1] = a[loc2];
		a[loc2] = temp;
	} // end swap

	/**
	 * Sorts an array of Strings into alphabetic order, in place.
	 * Strings are ordered by their upper-cased form; strings that differ
	 * only in case are ordered by their plain comparison, so a String that
	 * starts with upper case precedes the otherwise identical lower case one.
	 * 
	 * @param a the array to sort
	 * @return the same array instance, now sorted
	 */
	public static String[] sort(String[] a) {
		Arrays.sort(a, ALPHABETIC_ORDER);
		return a;
	}
	
}