package kea.vocab;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Vector;

import kea.stemmers.SpanishStemmerSB;
import kea.stemmers.Stemmer;
import kea.stopwords.Stopwords;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;

/**
 * Builds an index with the content of the controlled vocabulary.
 * Accepts vocabularies as RDF files (SKOS format) and in plain text format:
 * vocabulary_name.en (with "ID TERM" per line) - descriptors & non-descriptors
 * vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)
 * vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ..." per line)
 * See KEA's homepage for more details.
 *
 * @author Olena Medelyan
 */
public class VocabularyJena extends Vocabulary {

  private static final long serialVersionUID = 1L;

  /** Location of the RDF version of the controlled vocabulary;
   * it needs to be in SKOS format! */
  public static File SKOS;

  /** Location of the vocabulary's *.en file,
   * containing all terms of the vocabulary and their ids. */
  public static File EN;

  /** Location of the vocabulary's *.use file,
   * containing ids of non-descriptors with the corresponding ids of descriptors. */
  public static File USE;

  /** Location of the vocabulary's *.rel file,
   * containing semantically related terms for each descriptor in the vocabulary. */
  public static File REL;

  // if the type of the semantic relation is required later,
  // this could be a file containing this information
  // public static File RT;

  /**
   * Boolean describing which vocabulary format has been chosen:
   * true if SKOS, false if text.
   */
  private boolean useSkos;

  /** <i>Vocabulary</i> index */
  private HashMap<String,String> VocabularyEN = null;

  /** <i>Vocabulary</i> reverse index */
  private HashMap<String,String> VocabularyENrev = null;

  /** <i>Vocabulary</i> non-descriptors - descriptors list */
  private HashMap<String,String> VocabularyUSE = null;

  /** <i>Vocabulary</i> related terms */
  private HashMap<String,Vector<String>> VocabularyREL = null;

  /** <i>Vocabulary</i> relation types, keyed by "id-id_related" */
  private HashMap<String,String> VocabularyRT = null;

  /** The document language */
  private String m_language;

  /** The default stemmer to be used */
  private Stemmer m_Stemmer;

  /** The list of stop words to be used */
  private Stopwords m_Stopwords;
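  // Typical usage (a minimal sketch, not part of the class): the vocabulary name
  // "agrovoc" and the language "es" are hypothetical; SpanishStemmerSB is used only
  // because it is already imported, and "someStopwords" stands for any concrete
  // Stopwords implementation for that language.
  //
  //   VocabularyJena vocabulary = new VocabularyJena("agrovoc", "skos", "es");
  //   vocabulary.setStemmer(new SpanishStemmerSB());
  //   vocabulary.setStopwords(someStopwords);
  //   vocabulary.initialize();
  //   String id = vocabulary.getID("some phrase");   // null if the phrase is not in the vocabulary
  //   String term = vocabulary.getOrig(id);          // original descriptor for that id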
  /** Vocabulary constructor.
   *
   * Given the name of the vocabulary and the format, it first checks whether
   * the VOCABULARIES directory contains the specified files:
   * - vocabularyName.rdf if the skos format is selected
   * - or a set of 3 flat files starting with vocabularyName and with extensions
   *   .en (id term)
   *   .use (non-descriptor \t descriptor)
   *   .rel (id \t related_id1 related_id2 ...)
   * If the required files exist, the vocabulary index is built.
   *
   * @param vocabularyName The name of the vocabulary file (before extension).
   * @param vocabularyFormat The format of the vocabulary (skos or text).
   * @param documentLanguage The language of the documents, used to select
   *        vocabulary labels with a matching language tag.
   */
  public VocabularyJena(String vocabularyName, String vocabularyFormat, String documentLanguage) {
    super(documentLanguage);
    // remember the language locally; buildSKOS() uses it to filter prefLabel/altLabel values
    m_language = documentLanguage;
    if (vocabularyFormat.equals("skos")) {
      SKOS = new File("VOCABULARIES/" + vocabularyName + ".rdf");
      if (!SKOS.exists()) {
        System.err.println("File VOCABULARIES/" + vocabularyName + ".rdf does not exist.");
        System.exit(1);
      }
      useSkos = true;
    } else if (vocabularyFormat.equals("text")) {
      EN = new File("VOCABULARIES/" + vocabularyName + ".en");
      USE = new File("VOCABULARIES/" + vocabularyName + ".use");
      REL = new File("VOCABULARIES/" + vocabularyName + ".rel");
      // RT = new File("VOCABULARIES/" + vocabularyName + ".pairs.p1");
      if (!EN.exists()) {
        System.err.println("File VOCABULARIES/" + vocabularyName + ".en does not exist.");
        System.exit(1);
      }
      if (!USE.exists()) {
        System.err.println("File VOCABULARIES/" + vocabularyName + ".use does not exist.");
        System.exit(1);
      }
      if (!REL.exists()) {
        System.err.println("File VOCABULARIES/" + vocabularyName + ".rel does not exist.");
        System.exit(1);
      }
      // if (!RT.exists()) {
      //   System.err.println("File VOCABULARIES/" + vocabularyName + ".pairs.p1 does not exist.");
      //   System.exit(1);
      // }
    }
  }

  /**
   * Starts initialization of the vocabulary.
   */
  public void initialize() {
    System.err.println("-- Loading the Index...");
    if (useSkos) {
      try {
        buildSKOS();
      } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
      }
    } else {
      try {
        build();
      } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
      }
    }
  }

  /**
   * Set the Stemmer value.
   * @param newStemmer The new Stemmer value.
   */
  public void setStemmer(Stemmer newStemmer) {
    this.m_Stemmer = newStemmer;
  }

  /**
   * Set the Stopwords value.
   * @param newM_Stopwords The new Stopwords value.
   */
  public void setStopwords(Stopwords newM_Stopwords) {
    this.m_Stopwords = newM_Stopwords;
  }
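  // The SKOS input read by buildSKOS() is simply a set of RDF statements. A hypothetical
  // concept (labels invented for illustration; the ids echo the examples in the comments
  // below) might look like this in Turtle notation:
  //
  //   <http://www.fao.org/aos/agrovoc#c_4828>
  //       skos:prefLabel "Rice"@en ;
  //       skos:altLabel  "Paddy"@en ;
  //       skos:narrower  <http://www.fao.org/aos/agrovoc#c_4829> .
  //
  // prefLabel statements become descriptors in VocabularyEN/VocabularyENrev,
  // altLabel/hiddenLabel statements become non-descriptors mapped to their descriptor
  // via VocabularyUSE, and broader/narrower/related etc. fill VocabularyREL and VocabularyRT.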
  /**
   * Builds the vocabulary indexes from the SKOS file.
   */
  public void buildSKOS() throws Exception {
    System.err.println("-- Building the Vocabulary index from SKOS file");
    VocabularyEN = new HashMap<String,String>();
    VocabularyENrev = new HashMap<String,String>();
    VocabularyUSE = new HashMap<String,String>();
    VocabularyREL = new HashMap<String,Vector<String>>();
    VocabularyRT = new HashMap<String,String>();

    // create an empty model
    Model model = ModelFactory.createDefaultModel();
    try {
      model.read(new InputStreamReader(new FileInputStream(SKOS), "UTF-8"), "");

      StmtIterator iter;
      Statement stmt;
      Property relation;
      Resource concept;
      RDFNode value;
      int count = 1;

      // Iterating over all statements in the SKOS file
      iter = model.listStatements();
      while (iter.hasNext()) {
        stmt = iter.nextStatement();

        // id of the concept (Resource), e.g. "c_4828"
        concept = stmt.getSubject();
        String id = concept.getURI();

        // relation or Property of the concept, e.g. "narrower"
        relation = stmt.getPredicate();
        String rel = relation.getLocalName();

        // value of the property, e.g. c_4828 has narrower term "c_4829"
        value = stmt.getObject();
        String val = value.toString();

        // System.out.println("Concept " + concept);
        // System.out.println("Relation " + rel);
        // System.out.println("Value " + val);

        if (rel.equals("prefLabel")) {
          String descriptor;
          if (val.contains("@")) {
            String[] val_components = val.split("@");
            // System.err.println(val_components[1] + " " + m_language);
            if (val_components[1].equals(m_language)) {
              // System.err.println("Yes");
              descriptor = val_components[0];
            } else {
              continue;
            }
          } else {
            descriptor = val;
          }

          String avterm = pseudoPhrase(descriptor);
          if (avterm.equals("")) {
            avterm = descriptor;
          }
          // System.out.println(descriptor + " ==> " + avterm);

          // fill the index hash (pseudo phrase => id) and the reverse index (id => descriptor)
          if (avterm.length() > 1) {
            VocabularyEN.put(avterm, id);
            VocabularyENrev.put(id, descriptor);
          }
          // if (id.equals("http://www.fao.org/aos/agrovoc#c_4314")) {
          //   System.out.println("Descriptor " + descriptor + " (" + id + ")");
          // }

        } else if (rel.equals("altLabel") || rel.equals("hiddenLabel")) {
          String non_descriptor;
          if (val.contains("@")) {
            String[] val_components = val.split("@");
            // System.err.println(val_components[1] + " " + m_language);
            if (val_components[1].equals(m_language)) {
              // System.err.println("Yes");
              non_descriptor = val_components[0];
            } else {
              continue;
            }
          } else {
            non_descriptor = val;
          }
          // System.out.println("Descriptor " + non_descriptor);

          // first add the non-descriptor to the index hash,
          // then fill the non-descriptor hash: id_non_descriptor => id
          addNonDescriptor(count, id, non_descriptor);
          count++;
          // System.out.println("Descriptor " + VocabularyENrev.get(id) + " with id (" + id + ")" +
          //   " has a non-descriptor " + non_descriptor + " (" + id_non_descriptor + ")");

        } else if (rel.equals("broader") || rel.equals("narrower")
            || rel.equals("composite") || rel.equals("compositeOf")
            || rel.equals("hasTopConcept") || rel.equals("related")) {

          String id_related = val;
          // System.out.println("Descriptor " + VocabularyENrev.get(id) + " with id " + id +
          //   " has a " + rel + " term " + VocabularyENrev.get(id_related) + " with id (" + id_related + ")");

          // fill the semantic relations hash: id => ids of related concepts
          if (VocabularyREL.get(id) == null) {
            Vector<String> rt = new Vector<String>();
            rt.add(id_related);
            VocabularyREL.put(id, rt);
          } else {
            Vector<String> rt = VocabularyREL.get(id);
            rt.add(id_related);
          }

          // store the type of the relation; "related" is symmetric
          VocabularyRT.put(id + "-" + id_related, rel);
          if (rel.equals("related")) {
            VocabularyRT.put(id_related + "-" + id, rel);
          }
          // VocabularyRT.put("id-id_related","1");
          // VocabularyRT.put("id_related-id","1");
        }
      }

      // Some statistics:
      // System.out.println(VocabularyEN.size() + " terms in total");
      // System.out.println(VocabularyUSE.size() + " non-descriptors");
      // System.out.println(VocabularyREL.size() + " terms have related terms");
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  private void addNonDescriptor(int count, String id_descriptor, String non_descriptor) {
    // assign an artificial id ("d_" + count) to the non-descriptor
    // and remember its descriptor: id_non_descriptor => id_descriptor
    String id_non_descriptor = "d_" + count;
    String avterm = pseudoPhrase(non_descriptor);
    if (avterm.length() > 2) {
      VocabularyEN.put(avterm, id_non_descriptor);
      VocabularyENrev.put(id_non_descriptor, non_descriptor);
    }
    VocabularyUSE.put(id_non_descriptor, id_descriptor);
  }

  /**
   * Rebuilds a phrase from the given words, leaving out the word at position i
   * and any stop words.
   */
  public String remove(String[] words, int i) {
    String result = "";
    for (int j = 0; j < words.length; j++) {
      if ((j != i) && (!m_Stopwords.isStopword(words[j]))) {
        result = result + words[j];
        if ((j + 1) != words.length) {
          result = result + " ";
        }
      }
    }
    return result;
  }
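  // The flat text files parsed by build(), buildUSE() and buildREL() follow the layout
  // described in the class comment. A hypothetical fragment (ids and terms invented
  // for illustration):
  //
  //   vocabulary_name.en   ->  "128 Rice"        (ID TERM, one entry per line)
  //   vocabulary_name.use  ->  "129\t128"        (non-descriptor 129 points to descriptor 128)
  //   vocabulary_name.rel  ->  "128\t214 305"    (128 is related to 214 and 305)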
  /**
   * Builds the vocabulary index from the text files.
   */
  public void build() throws Exception {
    System.err.println("-- Building the Vocabulary index");
    VocabularyEN = new HashMap<String,String>();
    VocabularyENrev = new HashMap<String,String>();

    String readline;
    String term;
    String avterm;
    String id;
    try {
      InputStreamReader is = new InputStreamReader(new FileInputStream(EN));
      BufferedReader br = new BufferedReader(is);
      while ((readline = br.readLine()) != null) {
        // each line is "ID TERM"; index everything after the first space
        int i = readline.indexOf(' ');
        term = readline.substring(i + 1);
        avterm = pseudoPhrase(term);
        if (avterm.length() > 2) {
          id = readline.substring(0, i);
          VocabularyEN.put(avterm, id);
          VocabularyENrev.put(id, term);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Builds the vocabulary index with descriptors/non-descriptors relations.
   */
  public void buildUSE() throws Exception {
    if (!useSkos) {
      VocabularyUSE = new HashMap<String,String>();
      String readline;
      String[] entry;
      try {
        InputStreamReader is = new InputStreamReader(new FileInputStream(USE));
        BufferedReader br = new BufferedReader(is);
        while ((readline = br.readLine()) != null) {
          entry = readline.split("\t");
          // if more than one descriptor is listed for a non-descriptor, ignore the entry!
          // probably just related terms (cf. latest edition of Agrovoc)
          if ((entry[1].indexOf(" ")) == -1) {
            VocabularyUSE.put(entry[0], entry[1]);
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Builds the vocabulary index with semantically related terms.
   */
  public void buildREL() throws Exception {
    if (!useSkos) {
      System.err.println("-- Building the Vocabulary index with related pairs");
      VocabularyREL = new HashMap<String,Vector<String>>();
      String readline;
      String[] entry;
      try {
        InputStreamReader is = new InputStreamReader(new FileInputStream(REL));
        BufferedReader br = new BufferedReader(is);
        while ((readline = br.readLine()) != null) {
          entry = readline.split("\t");
          String[] temp = entry[1].split(" ");
          Vector<String> rt = new Vector<String>();
          for (int i = 0; i < temp.length; i++) {
            rt.add(temp[i]);
          }
          VocabularyREL.put(entry[0], rt);
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  public void buildRT() throws Exception {
  }

  // Might be useful later, when the kind of relation is important
  // or whether two terms are related or not:
  // public void buildRT() throws Exception {
  //   VocabularyRT = new HashMap();
  //   String[] entry;
  //   String readline;
  //   try {
  //     InputStreamReader is2 = new InputStreamReader(new FileInputStream(RT));
  //     BufferedReader br2 = new BufferedReader(is2);
  //     while ((readline = br2.readLine()) != null) {
  //       entry = split(readline,"\t");
  //       String pair = entry[0] + "-" + entry[1];
  //       VocabularyRT.put(pair,"1");
  //     }
  //   } catch (Exception e) {
  //     System.err.println("You need to put the .pairs file into KEA directory");
  //   }
  // }

  /**
   * Checks whether a normalized version of a phrase (pseudo phrase)
   * is a valid vocabulary term.
   *
   * @param phrase
   * @return true if phrase is in the vocabulary
   */
  public boolean containsEntry(String phrase) {
    return VocabularyEN.containsKey(phrase);
  }

  /**
   * Given a phrase, returns its id in the vocabulary.
   * @param phrase
   * @return id of the phrase in the vocabulary index
   */
  public String getID(String phrase) {
    String pseudo = pseudoPhrase(phrase);
    String id = null;
    if (pseudo != null) {
      id = VocabularyEN.get(pseudo);
      // if the phrase is a non-descriptor, return the id of its descriptor instead
      if (VocabularyUSE.containsKey(id)) {
        id = VocabularyUSE.get(id);
      }
    }
    return id;
  }
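  // For illustration (hypothetical ids): if "Paddy" normalizes to a pseudo phrase stored
  // in VocabularyEN under the non-descriptor id "d_12", and VocabularyUSE maps "d_12" to
  // the descriptor id "c_4828", then getID("Paddy") returns "c_4828".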
  /**
   * Given an id, gets the original version of the vocabulary term.
   * @param id
   * @return original version of the vocabulary term
   */
  public String getOrig(String id) {
    return VocabularyENrev.get(id);
  }

  /**
   * Given the id of a non-descriptor, returns the id of the corresponding descriptor.
   * @param id id of the non-descriptor
   * @return id of the descriptor
   */
  public String getDescriptor(String id) {
    return VocabularyUSE.get(id);
  }

  /**
   * Given the id of a term, returns the list with ids of terms related to this term.
   * @param id
   * @return a vector with ids related to the input id
   */
  public Vector<String> getRelated(String id) {
    return VocabularyREL.get(id);
  }

  /**
   * Given the id of a term, gets the list of all ids of terms
   * that are semantically related to the given term by a specific relation.
   * @param id
   * @param relation
   * @return a vector with ids related to the input id by the specified relation
   */
  public Vector<String> getRelated(String id, String relation) {
    Vector<String> related = new Vector<String>();
    Vector<String> all_related = VocabularyREL.get(id);
    if (all_related != null) {
      for (int d = 0; d < all_related.size(); d++) {
        String rel_id = all_related.elementAt(d);
        String rel = VocabularyRT.get(id + "-" + rel_id);
        if (rel != null) {
          if (rel.equals(relation)) {
            related.add(rel_id);
          }
        } else {
          System.err.println("Problem with " + getOrig(id) + " and " + getOrig(rel_id));
        }
      }
    }
    return related;
  }
}