/**
 * OpenKM, Open Document Management System (http://www.openkm.com)
 * Copyright (c) 2006-2011 Paco Avila & Josep Llort
 *
 * No bytes were intentionally harmed during the development of this application.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.kea.vocab;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Vector;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.openkm.kea.stemmers.Stemmer;
import com.openkm.kea.stopwords.Stopwords;

/**
 * Builds an index with the content of the controlled vocabulary.
 * Accepts vocabularies as rdf files (SKOS format) and in plain text format:
 * <ul>
 * <li>vocabulary_name.en (with "ID TERM" per line) - descriptors &amp; non-descriptors</li>
 * <li>vocabulary_name.use (with "ID_NON-DESCR \t ID_DESCRIPTOR" per line)</li>
 * <li>vocabulary_name.rel (with "ID \t RELATED_ID1 RELATED_ID2 ... " per line)</li>
 * </ul>
 * See KEA's homepage for more details.
 *
 * <p>A stemmer and a stopword list must be supplied through
 * {@link #setStemmer(Stemmer)} and {@link #setStopwords(Stopwords)} before any
 * method that normalizes phrases (the build methods, {@link #getID(String)},
 * {@link #pseudoPhrase(String)}, {@link #remove(String[], int)}) is called;
 * those methods dereference both without a null check.
 *
 * @author Olena Medelyan
 */
public class Vocabulary implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(Vocabulary.class);
    private static final long serialVersionUID = 1L;

    /** Matches a term followed by a "/"-separated qualifier, e.g. "monocytes/immunology". */
    private static final Pattern SLASH_QUALIFIER = Pattern.compile(".+?/.+?");

    /** Matches a term followed by an opening bracket, e.g. a scope note. */
    private static final Pattern SCOPE_NOTE = Pattern.compile(".+?\\(.+?");

    /** Matches a word that has an apostrophe with text on both sides of it. */
    private static final Pattern APOSTROPHE = Pattern.compile(".+?\\'.+?");

    /**
     * Ordering used by {@link #sort(String[])}: compares the upper-cased
     * strings first and falls back to the plain comparison when they are equal.
     */
    private static final Comparator<String> TERM_ORDER = new Comparator<String>() {
        public int compare(String left, String right) {
            int byUpperCase = left.toUpperCase().compareTo(right.toUpperCase());
            return (byUpperCase != 0) ? byUpperCase : left.compareTo(right);
        }
    };

    /**
     * Location of the rdf version of the controlled vocabulary;
     * it needs to be in the SKOS format!
     * Static: reassigned by every constructor call that selects the "skos" format.
     */
    public static File SKOS;

    /**
     * Location of the vocabulary's *.en file
     * containing all terms of the vocabularies and their ids.
     * Static: reassigned by every constructor call that selects the "text" format.
     */
    public static File EN;

    /**
     * Location of the vocabulary's *.use file
     * containing ids of non-descriptor with the corresponding ids of descriptors.
     * Static: reassigned by every constructor call that selects the "text" format.
     */
    public static File USE;

    /**
     * Location of the vocabulary's *.rel file
     * containing semantically related terms for each descriptor in the vocabulary.
     * Static: reassigned by every constructor call that selects the "text" format.
     */
    public static File REL;

    // If the type of the semantic relation is required later for the text
    // format, it could come from an additional file (e.g. "*.pairs.p1").
    // No such file is read at the moment.

    /**
     * Boolean describing which vocabulary format has been chosen:
     * true if SKOS, false if text.
     */
    private boolean useSkos;

    /** <i>Vocabulary</i> index: pseudo phrase -> term id. */
    private HashMap<String, String> VocabularyEN = null;

    /** <i>Vocabulary</i> reverse index: term id -> original term. */
    private HashMap<String, String> VocabularyENrev = null;

    /** <i>Vocabulary</i> non-descriptors - descriptors list: non-descriptor id -> descriptor id. */
    private HashMap<String, String> VocabularyUSE = null;

    /** <i>Vocabulary</i> related terms: term id -> ids of related terms. */
    private HashMap<String, Vector<String>> VocabularyREL = null;

    /**
     * Relation names keyed by "id-relatedId". Only populated by
     * {@link #buildSKOS()}; it stays null for text vocabularies.
     */
    private HashMap<String, String> VocabularyRT = null;

    /** The document language */
    private String m_language;

    /** The default stemmer to be used */
    private Stemmer m_Stemmer;

    /** The list of stop words to be used */
    private Stopwords m_Stopwords;

    /**
     * Vocabulary constructor.
     *
     * Given the name of the vocabulary and the format it checks whether the
     * required files exist:
     * <ul>
     * <li>"skos": the file named by vocabularyName itself (used as a full path)</li>
     * <li>"text": three flat files named vocabularyName plus the extensions
     *     .en (id term), .use (non-descriptor \t descriptor) and
     *     .rel (id \t related_id1 related_id2 ...)</li>
     * </ul>
     * A missing file is logged and terminates the JVM with exit status 1.
     * Any other format value is logged and leaves the instance without files.
     * The index itself is not built here; call {@link #initialize()} for that.
     *
     * @param vocabularyName   The path of the vocabulary (full path for skos,
     *                         path without extension for text).
     * @param vocabularyFormat The format of the vocabulary (skos or text).
     * @param documentLanguage The language code that SKOS labels must carry to be indexed.
     */
    public Vocabulary(String vocabularyName, String vocabularyFormat, String documentLanguage) {
        m_language = documentLanguage;

        if ("skos".equals(vocabularyFormat)) {
            // The vocabulary is loaded from a caller-supplied location, so
            // vocabularyName is taken as the full path of the rdf file.
            SKOS = new File(vocabularyName);
            exitIfMissing(SKOS);
            useSkos = true;
        } else if ("text".equals(vocabularyFormat)) {
            EN = new File(vocabularyName + ".en");
            USE = new File(vocabularyName + ".use");
            REL = new File(vocabularyName + ".rel");
            exitIfMissing(EN);
            exitIfMissing(USE);
            exitIfMissing(REL);
        } else {
            log.warn("Unsupported vocabulary format '{}'; expected \"skos\" or \"text\".", vocabularyFormat);
        }
    }

    /**
     * Logs an error and terminates the JVM when the given vocabulary file is absent.
     *
     * @param file the file that has to exist
     */
    private static void exitIfMissing(File file) {
        if (!file.exists()) {
            log.error("File {} does not exist.", file.getPath());
            System.exit(1);
        }
    }

    /**
     * Starts initialization of the vocabulary: builds the SKOS indexes or,
     * for text vocabularies, the term index from the *.en file.
     * An exception escaping the build terminates the JVM with exit status 1.
     */
    public void initialize() {
        try {
            if (useSkos) {
                buildSKOS();
            } else {
                build();
            }
        } catch (Exception e) {
            log.error("Unable to build the vocabulary index", e);
            System.exit(1);
        }
    }

    /**
     * Set the Stemmer value.
     * @param newStemmer The new Stemmer value.
     */
    public void setStemmer(Stemmer newStemmer) {
        this.m_Stemmer = newStemmer;
    }

    /**
     * Set the M_Stopwords value.
     * @param newM_Stopwords The new M_Stopwords value.
     */
    public void setStopwords(Stopwords newM_Stopwords) {
        this.m_Stopwords = newM_Stopwords;
    }

    /**
     * Builds the vocabulary indexes from SKOS file.
     *
     * All five indexes are reset first. Errors raised while reading or
     * walking the model are logged and end the load early; entries indexed up
     * to that point are kept.
     */
    public void buildSKOS() throws Exception {
        VocabularyEN = new HashMap<String, String>();
        VocabularyENrev = new HashMap<String, String>();
        VocabularyUSE = new HashMap<String, String>();
        VocabularyREL = new HashMap<String, Vector<String>>();
        VocabularyRT = new HashMap<String, String>();

        // create an empty model
        Model model = ModelFactory.createDefaultModel();
        FileInputStream input = null;

        try {
            input = new FileInputStream(SKOS);
            model.read(new InputStreamReader(input, "UTF-8"), "");

            // running number used to mint ids for non-descriptors
            int count = 1;

            // Iterating over all statements in the SKOS file
            StmtIterator iter = model.listStatements();

            while (iter.hasNext()) {
                Statement stmt = iter.nextStatement();

                // id of the concept (Resource), e.g. "c_4828"
                Resource concept = stmt.getSubject();
                String id = concept.getURI();

                // relation or Property of the concept, e.g. "narrower"
                Property relation = stmt.getPredicate();
                String rel = relation.getLocalName();

                // value of the property, e.g. c_4828 has narrower term "c_4829"
                RDFNode value = stmt.getObject();
                String val = value.toString();

                if (rel.equals("prefLabel")) {
                    String descriptor = labelForLanguage(val);

                    if (descriptor == null) {
                        continue;
                    }

                    String avterm = pseudoPhrase(descriptor);

                    if (avterm == null) {
                        avterm = descriptor;
                    }

                    // descriptors are kept from two characters on
                    if (avterm.length() > 1) {
                        VocabularyEN.put(avterm, id);
                        VocabularyENrev.put(id, descriptor);
                    }
                } else if (rel.equals("altLabel") || rel.equals("hiddenLabel")) {
                    String non_descriptor = labelForLanguage(val);

                    if (non_descriptor == null) {
                        continue;
                    }

                    addNonDescriptor(count, id, non_descriptor);
                    count++;
                } else if (isSemanticRelation(rel)) {
                    String id_related = val;
                    Vector<String> rt = VocabularyREL.get(id);

                    if (rt == null) {
                        rt = new Vector<String>();
                        VocabularyREL.put(id, rt);
                    }

                    rt.add(id_related);
                    VocabularyRT.put(id + "-" + id_related, rel);

                    // "related" is recorded in both directions
                    if (rel.equals("related")) {
                        VocabularyRT.put(id_related + "-" + id, rel);
                    }
                }
            }
        } catch (Exception e) {
            log.error("Unable to read the SKOS vocabulary " + SKOS, e);
        } finally {
            closeQuietly(input);
        }
    }

    /**
     * Extracts the label text from the string form of a statement object.
     *
     * A value without "@" is returned unchanged. Otherwise the text after the
     * last "@" is taken as the language tag, so that a label which itself
     * contains "@" is not cut short.
     *
     * @param val string form of the statement object, e.g. "label@en"
     * @return the label text, or null when the language tag differs from the
     *         document language
     */
    private String labelForLanguage(String val) {
        int at = val.lastIndexOf('@');

        if (at < 0) {
            return val;
        }

        String language = val.substring(at + 1);
        return language.equals(m_language) ? val.substring(0, at) : null;
    }

    /**
     * Tells whether a property name is one of the relations stored in the
     * related-terms index.
     *
     * @param rel local name of the property
     * @return true for broader, narrower, composite, compositeOf, hasTopConcept and related
     */
    private static boolean isSemanticRelation(String rel) {
        return rel.equals("broader") || rel.equals("narrower") || rel.equals("composite")
                || rel.equals("compositeOf") || rel.equals("hasTopConcept") || rel.equals("related");
    }

    /**
     * Registers a non-descriptor under a generated id ("d_" + count) and links
     * it to its descriptor. The term itself is only indexed when its pseudo
     * phrase is longer than two characters; the link is always stored.
     *
     * @param count          running number used to build the generated id
     * @param id_descriptor  id of the descriptor the term points to
     * @param non_descriptor the non-descriptor term
     */
    private void addNonDescriptor(int count, String id_descriptor, String non_descriptor) {
        // id => id_non_descriptor
        String id_non_descriptor = "d_" + count;
        String avterm = pseudoPhrase(non_descriptor);

        if (avterm.length() > 2) {
            VocabularyEN.put(avterm, id_non_descriptor);
            VocabularyENrev.put(id_non_descriptor, non_descriptor);
        }

        VocabularyUSE.put(id_non_descriptor, id_descriptor);
    }

    /**
     * Concatenates the given words, leaving out the word at position i and
     * every stopword.
     *
     * A blank follows each kept word unless that word is the last element of
     * the array, so the result ends with a blank when the last element is
     * left out.
     *
     * @param words the words of a phrase
     * @param i     index of the word to drop
     * @return the remaining words separated by blanks
     */
    public String remove(String[] words, int i) {
        StringBuilder result = new StringBuilder();

        for (int j = 0; j < words.length; j++) {
            if ((j != i) && (!m_Stopwords.isStopword(words[j]))) {
                result.append(words[j]);

                if ((j + 1) != words.length) {
                    result.append(' ');
                }
            }
        }

        return result.toString();
    }

    /**
     * Builds the vocabulary index from the text files.
     *
     * Reads the *.en file ("ID TERM" per line) with the platform default
     * charset. Lines without a blank are skipped, as are terms whose pseudo
     * phrase has fewer than three characters. Read errors are logged and end
     * the load early.
     */
    public void build() throws Exception {
        VocabularyEN = new HashMap<String, String>();
        VocabularyENrev = new HashMap<String, String>();
        BufferedReader br = null;

        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(EN)));
            String readline;

            while ((readline = br.readLine()) != null) {
                int i = readline.indexOf(' ');

                if (i < 0) {
                    // not an "ID TERM" line: there is no id to index the term under
                    continue;
                }

                String term = readline.substring(i + 1);
                String avterm = pseudoPhrase(term);

                if (avterm.length() > 2) {
                    String id = readline.substring(0, i);
                    VocabularyEN.put(avterm, id);
                    VocabularyENrev.put(id, term);
                }
            }
        } catch (Exception e) {
            log.error("Unable to read the vocabulary file " + EN, e);
        } finally {
            closeQuietly(br);
        }
    }

    /**
     * Builds the vocabulary index with descriptors/non-descriptors relations.
     *
     * Does nothing for SKOS vocabularies. Reads the *.use file with the
     * platform default charset; lines without a second tab-separated field are
     * skipped, and so are entries whose descriptor field contains a blank.
     * Read errors are logged and end the load early.
     */
    public void buildUSE() throws Exception {
        if (useSkos) {
            return;
        }

        VocabularyUSE = new HashMap<String, String>();
        BufferedReader br = null;

        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(USE)));
            String readline;

            while ((readline = br.readLine()) != null) {
                String[] entry = split(readline, "\t");

                if (entry.length < 2) {
                    continue;
                }

                if (entry[1].indexOf(" ") == -1) {
                    VocabularyUSE.put(entry[0], entry[1]);
                }
            }
        } catch (Exception e) {
            log.error("Unable to read the vocabulary file " + USE, e);
        } finally {
            closeQuietly(br);
        }
    }

    /**
     * Builds the vocabulary index with semantically related terms.
     *
     * Does nothing for SKOS vocabularies. Reads the *.rel file with the
     * platform default charset; lines without a second tab-separated field are
     * skipped. Read errors are logged and end the load early.
     */
    public void buildREL() throws Exception {
        if (useSkos) {
            return;
        }

        VocabularyREL = new HashMap<String, Vector<String>>();
        BufferedReader br = null;

        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(REL)));
            String readline;

            while ((readline = br.readLine()) != null) {
                String[] entry = split(readline, "\t");

                if (entry.length < 2) {
                    continue;
                }

                String[] related = split(entry[1], " ");
                VocabularyREL.put(entry[0], new Vector<String>(Arrays.asList(related)));
            }
        } catch (Exception e) {
            log.error("Unable to read the vocabulary file " + REL, e);
        } finally {
            closeQuietly(br);
        }
    }

    /**
     * Closes a resource, logging instead of propagating a failure to close.
     *
     * @param resource the resource to close; may be null
     */
    private static void closeQuietly(Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException e) {
                log.warn("Unable to close a vocabulary file", e);
            }
        }
    }

    /**
     * Checks whether a normalized version of a phrase (pseudo phrase)
     * is a valid vocabulary term.
     *
     * @param phrase the pseudo phrase to look up
     * @return true if phrase is in the vocabulary
     */
    public boolean containsEntry(String phrase) {
        return VocabularyEN.containsKey(phrase);
    }

    /**
     * Given a phrase returns its id in the vocabulary. When the phrase is a
     * non-descriptor, the id of its descriptor is returned instead.
     *
     * @param phrase the phrase to look up
     * @return id of the phrase in the vocabulary index, or null if it is not indexed
     */
    public String getID(String phrase) {
        String pseudo = pseudoPhrase(phrase);
        String id = null;

        if (pseudo != null) {
            id = VocabularyEN.get(pseudo);

            // the non-descriptor index is built separately for text
            // vocabularies and may not be available yet
            if (id != null && VocabularyUSE != null && VocabularyUSE.containsKey(id)) {
                id = VocabularyUSE.get(id);
            }
        }

        return id;
    }

    /**
     * Given id, gets the original version of vocabulary term.
     *
     * @param id id of the term
     * @return original version of the vocabulary term
     */
    public String getOrig(String id) {
        return VocabularyENrev.get(id);
    }

    /**
     * Given id of the non-descriptor returns the id of the corresponding descriptor
     *
     * @param id of the non-descriptor
     * @return id of the descriptor
     */
    public String getDescriptor(String id) {
        return VocabularyUSE.get(id);
    }

    /**
     * Given id of a term returns the list with ids of terms related to this term.
     *
     * @param id id of the term
     * @return a vector with ids related to the input id, or null when none are
     *         known or the related-terms index has not been built
     */
    public Vector<String> getRelated(String id) {
        if (VocabularyREL == null) {
            return null;
        }

        return VocabularyREL.get(id);
    }

    /**
     * Given an ID of a term gets the list of all IDs of terms
     * that are semantically related to the given term
     * with a specific relation
     *
     * Relation names are only recorded when the vocabulary was built from
     * SKOS; without them the result is empty.
     *
     * @param id       id of the term
     * @param relation name of the relation, e.g. "broader"
     * @return a vector with ids related to the input id by a specified relation
     */
    public Vector<String> getRelated(String id, String relation) {
        Vector<String> related = new Vector<String>();

        if (VocabularyREL == null || VocabularyRT == null) {
            return related;
        }

        Vector<String> all_related = VocabularyREL.get(id);

        if (all_related != null) {
            for (int d = 0; d < all_related.size(); d++) {
                String rel_id = all_related.elementAt(d);
                String rel = VocabularyRT.get(id + "-" + rel_id);

                if (rel != null) {
                    if (rel.equals(relation)) {
                        related.add(rel_id);
                    }
                } else {
                    log.info("Problem with " + getOrig(id) + " and " + getOrig(rel_id));
                }
            }
        }

        return related;
    }

    /**
     * Splits a string str at a separator into an array.
     *
     * The string is compared against the separator one character at a time
     * and ignoring case, so only single-character separators ever match.
     * Empty parts between two separators are kept; an empty part at the very
     * end is dropped.
     *
     * @param str       the string to split
     * @param separator the single-character separator
     * @return String array with string parts separated by the separator string
     */
    public String[] split(String str, String separator) {
        ArrayList<String> lst = new ArrayList<String>();
        StringBuilder word = new StringBuilder();

        for (int i = 0; i < str.length(); i++) {
            String letter = str.substring(i, i + 1);

            if (letter.equalsIgnoreCase(separator)) {
                lst.add(word.toString());
                word.setLength(0);
            } else {
                word.append(letter);
            }
        }

        if (word.length() > 0) {
            lst.add(word.toString());
        }

        return lst.toArray(new String[lst.size()]);
    }

    /**
     * Generates the pseudo phrase from a string.
     * A pseudo phrase is a version of a phrase
     * that only contains non-stopwords,
     * which are stemmed and sorted into alphabetical order.
     *
     * @param str the phrase to normalize
     * @return the pseudo phrase; empty when nothing is left of the phrase
     */
    public String pseudoPhrase(String str) {
        str = str.toLowerCase();

        // This is often the case with Mesh Terms,
        // where a term is accompanied by another specifying term
        // e.g. Monocytes/*immunology/microbiology
        // we ignore everything after the "/" symbol.
        if (SLASH_QUALIFIER.matcher(str).matches()) {
            str = str.substring(0, str.indexOf('/'));
        }

        // removes scope notes in brackets
        // should be replaced with a cleaner solution !!
        if (SCOPE_NOTE.matcher(str).matches()) {
            str = str.substring(0, str.indexOf('('));
        }

        // Remove some non-alphanumeric characters
        str = str.replace('-', ' ');
        str = str.replace('&', ' ');
        str = str.replace("*", "");
        str = str.replace(", ", " ");
        str = str.replace(". ", " ");
        str = str.replace(":", "");
        str = str.trim();

        // Drop stopwords
        String[] words = str.split(" ");
        StringBuilder str_nostop = new StringBuilder();

        for (int i = 0; i < words.length; i++) {
            String word = words[i];

            if (m_Stopwords.isStopword(word)) {
                continue;
            }

            if (APOSTROPHE.matcher(word).matches()) {
                word = afterFirstApostrophe(word);
            }

            if (str_nostop.length() > 0) {
                str_nostop.append(' ');
            }

            str_nostop.append(word);
        }

        // Stem, then sort the stems so that word order does not matter
        String stemmed = m_Stemmer.stemString(str_nostop.toString());
        return join(sort(stemmed.split(" ")));
    }

    /**
     * Returns the part of a word between its first apostrophe and the next
     * one, or up to the end of the word when there is no second apostrophe
     * (e.g. "l'eau" gives "eau").
     *
     * @param word a word containing at least one apostrophe
     * @return the text following the first apostrophe
     */
    private static String afterFirstApostrophe(String word) {
        int first = word.indexOf('\'');
        int second = word.indexOf('\'', first + 1);
        return (second < 0) ? word.substring(first + 1) : word.substring(first + 1, second);
    }

    /**
     * Joins an array of strings to a single string, separated by blanks.
     * Empty strings ahead of the first non-empty element are left out, so the
     * result never starts with a blank.
     */
    private static String join(String[] str) {
        StringBuilder result = new StringBuilder();

        for (int i = 0; i < str.length; i++) {
            if (result.length() > 0) {
                result.append(' ');
            }

            result.append(str[i]);
        }

        return result.toString();
    }

    /**
     * overloaded swap method: exchange 2 locations in an array of Strings.
     *
     * @param loc1 first index
     * @param loc2 second index
     * @param a    the array to modify
     */
    public static void swap(int loc1, int loc2, String[] a) {
        String temp = a[loc1];
        a[loc1] = a[loc2];
        a[loc2] = temp;
    }

    /**
     * Sorts an array of Strings into alphabetic order, in place.
     *
     * Strings are ordered by their upper-cased form; strings that are
     * identical when converted to the same case are ordered by the plain
     * comparison, which places upper case before lower case.
     *
     * @param a the array to sort
     * @return the same array, sorted
     */
    public static String[] sort(String[] a) {
        Arrays.sort(a, TERM_ORDER);
        return a;
    }
}