package plugins.LuceneIndex;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Pattern;

import uk.ac.ebi.ontocat.OntologyService;
import uk.ac.ebi.ontocat.OntologyServiceException;
import uk.ac.ebi.ontocat.file.FileOntologyService;

/**
 * Expands a free-text query by adding expansion terms (synonyms and children,
 * as returned by the ontology index search) after every query phrase that is
 * found in the ontologies. The expanded query is rendered with Boolean OR (not
 * strictly necessary, but easier to read) and expansion terms are weighted
 * less than the terms typed by the user.
 * <ul>
 * <li>{@code query_terms} - chunks: all contiguous word combinations (n-grams)
 * of the query that still have to be looked up;</li>
 * <li>{@code init_query} - the tokenised query, to which the expansion terms
 * are added;</li>
 * <li>{@code boost_factor} - the weight of expansion terms;</li>
 * <li>{@code spec_symbols} - supported Boolean search operators (&amp; = AND,
 * | = OR, both variants can be used).</li>
 * </ul>
 */
public class OntocatQueryExpansion_lucene {
    /** Punctuation that is replaced by a blank before tokenising. */
    private static final Pattern PUNCTUATION = Pattern.compile("[,.\\:\\!\\?;]");
    /** Upper-case keyword OR as a whole word, with surrounding white space. */
    private static final Pattern OR_KEYWORD = Pattern.compile("\\s*\\bOR\\b\\s*");
    /** Upper-case keyword AND as a whole word, with surrounding white space. */
    private static final Pattern AND_KEYWORD = Pattern.compile("\\s*\\bAND\\b\\s*");
    /** The '&amp;' operator with surrounding blanks, as rendered by {@link #output(List)}. */
    private static final Pattern AMPERSAND = Pattern.compile(" *\\& *");
    /** The '|' operator with surrounding blanks, as rendered by {@link #output(List)}. */
    private static final Pattern PIPE = Pattern.compile(" *\\| *");

    /** n-grams of the query that are still to be searched in the ontologies. */
    List<String> query_terms;
    OntologyService os;
    /** Tokenised query (words, quoted phrases, operators) plus expansions. */
    List<String> init_query = new Vector<String>();
    /** Characters treated as operators; note the backslashes are part of the value. */
    static String spec_symbols = "\\(\\)\\&\\|\\+\\-\\~";
    static final float boost_factor = 0.8f;

    /** Creates an expander without an ontology service. */
    public OntocatQueryExpansion_lucene() {
    }

    /**
     * Creates an expander backed by a file ontology service.
     *
     * @param fname
     *            path of the ontology file
     * @throws OntologyServiceException
     *             if the ontology service cannot be created
     */
    public OntocatQueryExpansion_lucene(String fname) throws OntologyServiceException {
        File ontologyFile = new File(fname);
        os = new FileOntologyService(ontologyFile.toURI());
    }

    /**
     * Replaces the token list of the initial query with a copy of
     * {@code query}. Earlier contents are discarded so that tokens of a
     * previously parsed query cannot leak into the next one.
     *
     * @param query
     *            tokens of the query; {@code null} is treated as empty
     */
    public void setInit_query(List<String> query) {
        // Copy first: this keeps the call safe even if the caller hands us
        // the very list we are about to clear.
        List<String> copy = new ArrayList<String>();
        if (query != null) {
            copy.addAll(query);
        }
        init_query.clear();
        init_query.addAll(copy);
    }

    /**
     * Sets the n-grams that {@link #expand(List)} will look up.
     *
     * @param query
     *            the n-grams; the list is stored by reference
     */
    public void setQuery_terms(List<String> query) {
        query_terms = query;
    }

    /** Tells whether {@code ch} occurs in {@code s}. */
    private boolean isIn(String s, char ch) {
        return s.indexOf(ch) >= 0;
    }

    /** Tells whether a token counts as a search operator. */
    private static boolean isOperator(String token) {
        return spec_symbols.contains(token);
    }

    /**
     * Chunks the words into all contiguous n-grams, longest first, and stores
     * the result as {@code query_terms}.
     *
     * @param words
     *            the words of one operator-free part of the query
     * @return the n-grams, each one its words joined by single blanks
     */
    public List<String> chunk(List<String> words) {
        List<String> result = new ArrayList<String>();
        int count = words.size();
        for (int length = count; length > 0; length--) {
            for (int start = 0; start + length <= count; start++) {
                StringBuilder gram = new StringBuilder();
                for (int j = start; j < start + length; j++) {
                    gram.append(words.get(j)).append(' ');
                }
                result.add(gram.toString().trim());
            }
        }
        System.out.println("parsed query: " + result);
        query_terms = result;
        return result;
    }

    /**
     * Tokenises the query, stores the tokens (including Boolean operators) as
     * the initial query and generates the list of all word combinations of
     * different length, which is stored as {@code query_terms}.
     * <p>
     * Punctuation is replaced by blanks, the upper-case keywords {@code OR}
     * and {@code AND} (whole words only) become {@code |} and {@code &}, and
     * the query is lower-cased. Text inside double quotes is kept as a single
     * token.
     *
     * @param query
     *            the query as typed by the user; {@code null} is treated as
     *            an empty query
     * @return the n-grams of every operator-free part of the query
     */
    public List<String> parseQuery(String query) {
        String normalized = query == null ? "" : query;
        normalized = PUNCTUATION.matcher(normalized).replaceAll(" ").trim();
        normalized = OR_KEYWORD.matcher(normalized).replaceAll("|");
        normalized = AND_KEYWORD.matcher(normalized).replaceAll("&");
        // Fixed locale: the default one may map 'I' to a dotless i.
        normalized = normalized.toLowerCase(Locale.ENGLISH);

        List<String> words = tokenize(normalized);
        setInit_query(words);

        // n-grams never span an operator: collect runs of plain words and
        // chunk each run separately.
        List<String> result = new ArrayList<String>();
        List<String> run = new ArrayList<String>();
        for (String word : words) {
            if (isOperator(word)) {
                flushRun(run, result);
            } else {
                run.add(word);
            }
        }
        flushRun(run, result);

        query_terms = result;
        return result;
    }

    /** Chunks a pending run of words into {@code result} and empties it. */
    private void flushRun(List<String> run, List<String> result) {
        if (!run.isEmpty()) {
            result.addAll(chunk(run));
            run.clear();
        }
    }

    /**
     * Splits the normalised query at white space and operator characters.
     * Quoted text becomes one token; a missing closing quote extends the
     * phrase to the end of the query. Empty tokens are never produced.
     */
    private List<String> tokenize(String query) {
        List<String> tokens = new ArrayList<String>();
        StringBuilder word = new StringBuilder();
        int len = query.length();
        int pos = 0;
        while (pos < len) {
            char ch = query.charAt(pos);
            if (ch == '"') {
                flushWord(word, tokens);
                int close = query.indexOf('"', pos + 1);
                if (close < 0) {
                    close = len;
                }
                String phrase = query.substring(pos + 1, close).trim();
                if (phrase.length() > 0) {
                    tokens.add(phrase);
                }
                pos = close + 1;
            } else if (Character.isWhitespace(ch)) {
                flushWord(word, tokens);
                pos++;
            } else if (isIn(spec_symbols, ch)) {
                flushWord(word, tokens);
                tokens.add(String.valueOf(ch));
                pos++;
            } else {
                word.append(ch);
                pos++;
            }
        }
        flushWord(word, tokens);
        return tokens;
    }

    /** Adds the pending word, if any, to the tokens and resets it. */
    private static void flushWord(StringBuilder word, List<String> tokens) {
        if (word.length() > 0) {
            tokens.add(word.toString());
            word.setLength(0);
        }
    }

    /**
     * Converts String[] to List&lt;String&gt;, removing repeated elements
     * while keeping the order of first occurrence.
     *
     * @param arr
     *            the array to convert
     * @return a new list without duplicates
     */
    public List<String> array2listNotDuplicate(String[] arr) {
        Set<String> unique = new LinkedHashSet<String>();
        for (String element : arr) {
            unique.add(element);
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Query expansion. For every phrase of {@code query_terms} that is found
     * in the ontologies, the tokens of that phrase in {@code init_query} are
     * replaced by the phrase followed by its expansion terms.
     * <p>
     * {@code searcher.SearchIndexOntocat(String phrase, List<String>
     * ontologies)} searches the phrase in the index files of the ontologies;
     * according to the original notes of this class it returns
     * {@code term:syn1;syn2;...;child1;child2;...}.
     * <p>
     * After a successful expansion the already tried phrases and the
     * sub-phrases of the expanded phrase are dropped from {@code query_terms}
     * (to avoid duplicate expansions and to reduce the time spent on
     * searching). Does nothing if no query has been parsed yet.
     *
     * @param ontologiesToUse
     *            names of the ontologies to search
     */
    public void expand(List<String> ontologiesToUse) {
        if (query_terms == null) {
            return;
        }
        int index = 0;
        while (index < query_terms.size()) {
            String phrase = query_terms.get(index);
            List<String> found = searchOntologies(phrase, ontologiesToUse);
            if (found.isEmpty()) {
                index++;
                continue;
            }
            System.out.println("found terms: " + found);

            int[] span = locatePhrase(init_query, phrase);
            if (span == null) {
                // The words were already merged into an overlapping phrase
                // that has been expanded; there is nothing left to replace.
                index++;
                continue;
            }

            /** the phrase itself followed by its expansion terms */
            List<String> foundAll = new ArrayList<String>();
            foundAll.add(phrase.toLowerCase(Locale.ENGLISH));
            foundAll.addAll(expansionTerms(found, foundAll));

            /**
             * replacing the tokens of init_query that make up the phrase
             * with the expanded phrase
             */
            init_query.subList(span[0], span[1]).clear();
            init_query.addAll(span[0], foundAll);

            // Assign a NEW list: the old one may be the list handed out by
            // parseQuery, which output() still needs unchanged.
            List<String> remaining = new ArrayList<String>();
            String paddedPhrase = " " + phrase + " ";
            for (int j = index + 1; j < query_terms.size(); j++) {
                String candidate = query_terms.get(j);
                // Word-aligned test, so that e.g. "as" is not dropped
                // because "asthma" was expanded.
                if (!paddedPhrase.contains(" " + candidate + " ")) {
                    remaining.add(candidate);
                }
            }
            query_terms = remaining;
            index = 0;
        }
    }

    /**
     * Searches the phrase in the ontology index and returns the non-empty,
     * colon separated parts of the answer.
     */
    private List<String> searchOntologies(String phrase, List<String> ontologiesToUse) {
        OntoCatIndexPlugin searcher = new OntoCatIndexPlugin("x", null);
        List<String> parts = new ArrayList<String>();
        for (String part : searcher.SearchIndexOntocat(phrase, ontologiesToUse).split(":")) {
            if (part.length() > 0) {
                parts.add(part);
            }
        }
        return parts;
    }

    /**
     * Derives the expansion terms from a search result: the de-duplicated,
     * non-empty entries of the second part if there is one, otherwise the
     * found term itself unless it is already listed.
     */
    private List<String> expansionTerms(List<String> found, List<String> alreadyListed) {
        List<String> expansion = new ArrayList<String>();
        if (found.size() > 1) {
            for (String candidate : array2listNotDuplicate(found.get(1).split(";"))) {
                if (candidate.length() > 0) {
                    expansion.add(candidate);
                }
            }
        } else if (!alreadyListed.contains(found.get(0))) {
            expansion.add(found.get(0));
        }
        return expansion;
    }

    /**
     * Finds the first run of consecutive tokens which, joined by single
     * blanks, equals the phrase.
     *
     * @return {start, end} with end exclusive, or {@code null} if there is
     *         no such run
     */
    private static int[] locatePhrase(List<String> tokens, String phrase) {
        int size = tokens.size();
        for (int start = 0; start < size; start++) {
            StringBuilder joined = new StringBuilder();
            for (int end = start; end < size; end++) {
                if (end > start) {
                    joined.append(' ');
                }
                joined.append(tokens.get(end));
                String candidate = joined.toString();
                if (candidate.equals(phrase)) {
                    return new int[] { start, end + 1 };
                }
                if (!phrase.startsWith(candidate)) {
                    break;
                }
            }
        }
        return null;
    }

    /**
     * Constructs the expanded query from {@code init_query}. Tokens that are
     * part of the parsed query or operators are written as they are (phrases
     * in double quotes); every other token is an expansion term and is
     * written with the boost factor, the expansion terms of one phrase being
     * grouped as {@code OR (a^0.8 OR b^0.8)}.
     *
     * @param parsed
     *            the n-grams returned by {@link #parseQuery(String)}
     * @return the expanded query with operators spelled AND / OR
     */
    // TODO: do it with the help of Lucene. OR isn't necessary, ' ' = OR
    public String output(List<String> parsed) {
        Set<String> originals = new HashSet<String>(parsed);
        StringBuilder res = new StringBuilder();
        int size = init_query.size();
        for (int pos = 0; pos < size; pos++) {
            String token = init_query.get(pos);
            boolean original = originals.contains(token) || isOperator(token);
            if (pos == size - 1) {
                if (size == 1 || original) {
                    res.append(quoteIfPhrase(token));
                } else {
                    res.append(token).append('^').append(boost_factor).append(')');
                }
                break;
            }
            String next = init_query.get(pos + 1);
            boolean nextIsExpansion = !originals.contains(next) && !isOperator(next);
            if (original) {
                res.append(quoteIfPhrase(token)).append(' ');
                if (nextIsExpansion) {
                    res.append("OR (");
                }
            } else {
                res.append(token).append('^').append(boost_factor);
                res.append(nextIsExpansion ? " OR " : ") ");
            }
        }
        String withAnd = AMPERSAND.matcher(res.toString()).replaceAll(" AND ");
        return PIPE.matcher(withAnd).replaceAll(" OR ");
    }

    /** Puts multi-word tokens in double quotes (to avoid having stopwords split off). */
    private static String quoteIfPhrase(String token) {
        if (token.split(" ").length == 1) {
            return token;
        }
        return "\"" + token + "\"";
    }

    public static void main(String[] args) throws OntologyServiceException {
        // Other queries to try:
        // "never asthma AND (\"cystic lung disease\" OR (Parkinson Disease))"
        // "hallux valgus"
        // "\"vldl cholesterol\""
        String query = "Butoconazole Nitrate";
        long start = System.currentTimeMillis();
        OntocatQueryExpansion_lucene q = new OntocatQueryExpansion_lucene();
        List<String> parsed = q.parseQuery(query);
        // Other ontologies: "Human Phenotype Ontology", "Human Disease", "MeSH"
        List<String> ontologiesForExpansion = new ArrayList<String>();
        ontologiesForExpansion.add("NCI Thesaurus");
        System.out.println("Expanding the query...");
        q.expand(ontologiesForExpansion);
        System.out.println("\nThe expanded query: ");
        String res = q.output(parsed);
        System.out.println(res);
        System.out.println("Finished searching ");
        long end = System.currentTimeMillis();
        System.out.println("Execution time was " + (end - start) + " ms.");
    }
}