package com.manning.hsia.dvdstore;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.hibernate.LockMode;
import org.hibernate.Transaction;
import org.hibernate.search.FullTextQuery;
import org.hibernate.search.FullTextSession;

import java.io.*;
import java.util.*;

import com.manning.hsia.ch13.Synonym;

public class SynonymHelper {

   private FullTextSession session;
   private Transaction tx;

   /**
    * Expands the user query into a Lucene query that ORs together the original
    * terms and all of their synonyms, each term boosted by the given factor.
    */
   public Query expandQuery(String query, FullTextSession session, Analyzer a,
                            String field, float boost) throws IOException {
      Set<String> synsList = getSynonyms(query, session, a);
      BooleanQuery bq = new BooleanQuery();
      for (String synonym : synsList) {   // add the unique synonyms
         TermQuery tq = new TermQuery(new Term(field, synonym));
         tq.setBoost(boost);
         bq.add(tq, BooleanClause.Occur.SHOULD);
      }
      return bq;
   }

   /**
    * Analyzes the query, looks each resulting term up in the synonym index,
    * and returns the synonyms plus the original query terms.
    */
   public Set<String> getSynonyms(String query, FullTextSession session, Analyzer a)
         throws IOException {
      Set<String> querySet = new HashSet<String>();   // avoid duplicate terms
      TokenStream ts = a.tokenStream("word", new StringReader(query));
      Token t = new Token();
      String anaQuery;
      while ((t = ts.next(t)) != null) {
         anaQuery = new String(t.termBuffer(), 0, t.termLength());
         querySet.add(anaQuery);
      }

      // search the synonym index for any of the analyzed terms
      BooleanQuery bq = new BooleanQuery();
      for (String str : querySet) {
         TermQuery tq = new TermQuery(new Term("word", str));
         bq.add(tq, BooleanClause.Occur.SHOULD);
      }
      FullTextQuery hibQuery = session.createFullTextQuery(bq, Synonym.class);
      hibQuery.setProjection("syn");
      List<Object[]> results = hibQuery.list();

      // the projected "syn" field is a space-separated list of synonyms: split it up
      Set<String> synonyms = new HashSet<String>();
      for (Object[] obj : results) {
         StringTokenizer st = new StringTokenizer((String) obj[0], " ");
         while (st.hasMoreTokens()) {
            synonyms.add(st.nextToken());
         }
      }
      synonyms.addAll(querySet);   // add the original query terms
      return synonyms;
   }

   /**
    * Builds the synonym index from the WordNet prolog file if it has not been
    * built already.
    */
   public void buildSynonymIndex(FullTextSession session, String synFile) throws IOException {
      // if the index is already there, don't do anything
      final File indexDirectory = new File("synonym_index", Synonym.class.getName());
      if (indexDirectory.exists() && indexDirectory.listFiles().length > 2) return;

      // the prolog file must be there and readable
      if (!(new File(synFile).exists())) {
         throw new IllegalStateException("Place " + synFile
               + " in the root directory (see ch13/readme.html file)");
      }
      this.session = session;
      if (!(new File(synFile)).canRead()) {
         throw new IOException("Prolog file is not readable: " + synFile);
      }

      final FileInputStream fis = new FileInputStream(synFile);
      final BufferedReader br = new BufferedReader(new InputStreamReader(fis));
      String lineIn;
      // maps a word to all the synset "groups" it belongs to
      final Map<String, List<String>> word2Groups = new TreeMap<String, List<String>>();
      // maps a synset "group" to all the words it contains
      final Map<String, List<String>> group2Words = new TreeMap<String, List<String>>();

      while ((lineIn = br.readLine()) != null) {
         if (!lineIn.startsWith("s(")) {   // syntax check
            throw new IOException("Wrong input format " + lineIn);
         }
         lineIn = lineIn.substring(2);     // parse the line: s(<id>,...,'<word>',...)
         String id = lineIn.substring(0, lineIn.indexOf(','));
         int quote1 = lineIn.indexOf('\'');
         int quote2 = lineIn.lastIndexOf('\'');
         String word = lineIn.substring(quote1 + 1, quote2).toLowerCase();

         List<String> list = word2Groups.get(word);
         if (list == null) {
            list = new LinkedList<String>();
            list.add(id);
            word2Groups.put(word, list);
         } else {
            list.add(id);
         }

         list = group2Words.get(id);
         if (list == null) {
            list = new LinkedList<String>();
            list.add(word);
            group2Words.put(id, list);
         } else {
            list.add(word);
         }
      }
      br.close();   // closes the underlying FileInputStream as well

      // create the index
      index(word2Groups, group2Words);
   }

   /** Returns true if the string contains letters only (no spaces, digits, or punctuation). */
   private static boolean isWord(String s) {
      int len = s.length();
      for (int i = 0; i < len; i++) {
         if (!Character.isLetter(s.charAt(i))) {
            return false;
         }
      }
      return true;
   }

   /** Indexes one Synonym entity per word, flushing to the index every 1000 entities. */
   private void index(Map<String, List<String>> word2Groups,
                      Map<String, List<String>> group2Words) throws IOException {
      try {
         tx = session.beginTransaction();
         int counter = 0;
         for (String word2GroupsKey : word2Groups.keySet()) {   // for each word
            counter++;
            Synonym syn = new Synonym();
            syn.setId(counter);
            int n = index(word2Groups, group2Words, word2GroupsKey, syn);
            if (n > 0) {
               syn.setWord(word2GroupsKey);
               session.lock(syn, LockMode.NONE);
               session.index(syn);
            }
            if (counter % 1000 == 0) {   // flush 1000 entities at a time
               session.flushToIndexes();
               session.clear();
            }
         }
         tx.commit();
      } finally {
         session.clear();
         session.getSearchFactory().optimize();
      }
   }

   /**
    * Collects all synonyms of word2GroupsKey into the Synonym's space-separated
    * "syn" property and returns the number of synonyms stored.
    */
   private int index(Map<String, List<String>> word2Groups,
                     Map<String, List<String>> group2Words,
                     String word2GroupsKey, Synonym syn) {
      List<String> keys = word2Groups.get(word2GroupsKey);   // the synset ids the word belongs to
      Set<String> synonymWords = new TreeSet<String>();
      for (String key : keys) {
         // collect every word from every synset this word belongs to
         synonymWords.addAll(group2Words.get(key));
      }
      synonymWords.remove(word2GroupsKey);   // don't list a word as its own synonym

      int num = 0;
      for (String cur : synonymWords) {
         if (!isWord(cur)) {   // don't store entries with spaces or non-letters
            continue;
         }
         num++;
         if (syn.getSyn() != null) {
            syn.setSyn(syn.getSyn() + " " + cur);
         } else {
            syn.setSyn(cur);
         }
      }
      return num;
   }
}
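// The Synonym entity referenced above lives in com.manning.hsia.ch13 and is not part
// of this listing. As a rough sketch only (assuming Hibernate Search 3.x annotations;
// the property names follow the accessors used above, everything else is an
// assumption), it presumably looks something like the commented class below: "word"
// must be indexed because getSynonyms() runs TermQuerys against it, and "syn" must be
// stored because it is retrieved through projection.
//
//    @Entity
//    @Indexed
//    public class Synonym {
//       @Id @DocumentId
//       private Integer id;
//
//       @Field                                    // searched by getSynonyms()
//       private String word;
//
//       @Field(store = Store.YES)                 // projected as "syn"
//       private String syn;
//
//       // plain getters and setters for id, word, and syn
//    }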
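// A minimal usage sketch, not part of the original listing. It assumes a WordNet
// prolog file named wn_s.pl in the working directory, an already opened
// FullTextSession, and a tokenized "title" field in the target index; the class
// name, the file name, the sample query, and the boost value are illustrative only.
class SynonymHelperUsageExample {

   static Query buildExpandedTitleQuery(FullTextSession ftSession) throws IOException {
      SynonymHelper helper = new SynonymHelper();
      // build the synonym index once (a no-op if it already exists)
      helper.buildSynonymIndex(ftSession, "wn_s.pl");
      // expand the user input with its synonyms against the "title" field,
      // giving every expanded term a boost of 0.25; the returned Query can then
      // be passed to ftSession.createFullTextQuery(...)
      Analyzer analyzer = new org.apache.lucene.analysis.standard.StandardAnalyzer();
      return helper.expandQuery("murder mystery", ftSession, analyzer, "title", 0.25f);
   }
}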