package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Loads the <a target="_blank"
 * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
 * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
 * into a thread-safe main-memory hash map that can be used for fast
 * high-frequency lookups of synonyms for any given (lowercase) word string.
 * <p>
 * There holds: If B is a synonym for A (A -&gt; B) then A is also a synonym for B (B -&gt; A).
 * There does not necessarily hold: A -&gt; B, B -&gt; C then A -&gt; C.
 * <p>
 * Loading typically takes some 1.5 secs, so should be done only once per
 * (server) program execution, using a singleton pattern. Once loaded, a
 * synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
 * A loaded default synonym map consumes about 10 MB main memory.
 * An instance is immutable, hence thread-safe.
 * <p>
 * This implementation borrows some ideas from the Lucene Syns2Index demo that
 * Dave Spencer originally contributed to Lucene. Dave's approach
 * involved a persistent Lucene index which is suitable for occasional
 * lookups or very large synonym tables, but considered unsuitable for
 * high-frequency lookups of medium size synonym tables.
 * <p>
 * Example Usage:
 * <pre>
 * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
 * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
 * for (int i = 0; i &lt; words.length; i++) {
 *     String[] synonyms = map.getSynonyms(words[i]);
 *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
 * }
 *
 * Example output:
 * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
 * woods:[forest, wood]
 * forest:[afforest, timber, timberland, wood, woodland, woods]
 * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
 * xxxx:[]
 * </pre>
 *
 * <p>
 * <b>See also:</b><br>
 * <a target="_blank"
 * href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
 * man page </a><br>
 * <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
 */
public class SynonymMap {

  /** the index data; maps a word to its array of synonyms */
  private final HashMap<String,String[]> table;

  /** shared sentinel returned for words with no synonyms (never handed out mutable) */
  private static final String[] EMPTY = new String[0];

  private static final boolean DEBUG = false;

  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   *
   * @param input
   *          the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *           if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
  }

  /**
   * Returns the synonym set for the given word, sorted ascending.
   *
   * @param word
   *          the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    String[] synonyms = table.get(word);
    if (synonyms == null) return EMPTY;
    // return a defensive copy so callers cannot mutate the internal table
    return synonyms.clone();
  }

  /**
   * Returns a String representation of the index data for debugging purposes.
   *
   * @return a String representation
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder();
    // TreeMap gives deterministic (sorted) iteration order for the dump
    Iterator<String> iter = new TreeMap<String,String[]>(table).keySet().iterator();
    int count = 0;
    // f0..f3 count how many words have exactly 0, 1, 2, 3 synonyms
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;

    while (iter.hasNext()) {
      String word = iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");

      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }

    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0
        + ", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }

  /**
   * Analyzes/transforms the given word on input stream loading. This default
   * implementation simply lowercases the word. Override this method with a
   * custom stemming algorithm or similar, if desired.
   *
   * @param word
   *          the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    // Use a fixed locale: the default locale's lowercasing (e.g. Turkish
    // dotted/dotless 'i') would corrupt lookups of the ASCII WordNet data.
    return word.toLowerCase(Locale.ENGLISH);
  }

  /** Returns true if the word consists solely of letters (no spaces, hyphens, digits). */
  private static boolean isValid(String str) {
    for (int i = str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }

  /**
   * Parses the wn_s.pl prolog data and builds the word --&gt; synonyms index.
   * Each input record looks like <code>s(100001740,1,'entity',n,1,11).</code>;
   * only the leading synset-group number and the quoted word are consumed.
   *
   * @param data the raw bytes of the prolog file (UTF-8 or compatible)
   * @return the completed, memory-optimized index
   */
  private HashMap<String,String[]> read(byte[] data) {
    int WORDS  = (int) (76401 / 0.7); // presizing to avoid rehashing
    int GROUPS = (int) (88022 / 0.7); // presizing
    // word --> list of synset-group ids the word appears in
    HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<String,ArrayList<Integer>>(WORDS);
    // synset-group id --> list of words in that group
    HashMap<Integer,ArrayList<String>> group2Words = new HashMap<Integer,ArrayList<String>>(GROUPS);
    // app-level string interning pool; word --> canonical word instance
    HashMap<String,String> internedWords = new HashMap<String,String>(WORDS);

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null; // cache the boxed group id across consecutive records
    int len = data.length;
    int i = 0;

    while (i < len) { // until EOF

      /* Part A: Parse a line */

      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;

      // parse group number (digits up to the first comma)
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10 * num + (data[i] - 48); // 48 == '0'
        i++;
      }
      i++;

      // scan to beginning of word (skips the sense number field)
      while (i < len && data[i] != '\'') i++;
      i++;

      // scan to end of word; the inner loop tolerates embedded quotes,
      // a word only terminates on the two-char sequence "',"
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"

      if (i >= len) break; // EOF
      // i points at the ',' after the closing quote; exclude that quote
      String word = charset.decode(ByteBuffer.wrap(data, start, i - start - 1)).toString();

      /*
       * Part B: ignore phrases (with spaces and hyphens) and non-alphabetic
       * words, and let user customize word (e.g. do some stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore

      /* Part C: Add (group,word) to tables */

      // ensure compact string representation, minimizing memory overhead
      String w = internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }

      // records for the same group arrive consecutively; reuse the boxed id
      Integer group = lastGroup;
      if (num != lastNum) {
        group = Integer.valueOf(num);
        lastGroup = group;
        lastNum = num;
      }

      // add word --> group
      ArrayList<Integer> groups = word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList<Integer>(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);

      // add group --> word
      ArrayList<String> words = group2Words.get(group);
      if (words == null) {
        words = new ArrayList<String>(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }

    /* Part D: compute index data structure */
    HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);

    /* Part E: minimize memory consumption by a factor 3 (or so) */
    word2Groups = null; // help gc
    group2Words = null; // help gc

    return optimize(word2Syns, internedWords);
  }

  /**
   * Joins the two intermediate maps into the final word --&gt; sorted-synonyms
   * index: for each word, the union of all words sharing any of its groups,
   * minus the word itself.
   */
  private HashMap<String,String[]> createIndex(Map<String,ArrayList<Integer>> word2Groups,
      Map<Integer,ArrayList<String>> group2Words) {
    HashMap<String,String[]> word2Syns = new HashMap<String,String[]>();

    for (final Map.Entry<String,ArrayList<Integer>> entry : word2Groups.entrySet()) { // for each word
      ArrayList<Integer> group = entry.getValue();
      String word = entry.getKey();

      // TreeSet deduplicates and yields ascending order in one pass
      TreeSet<String> synonyms = new TreeSet<String>();
      for (int i = group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList<String> words = group2Words.get(group.get(i));
        for (int j = words.size(); --j >= 0; ) { // add all words
          String synonym = words.get(j);
          // identity comparison is safe: all words were interned in read()
          if (synonym != word) { // a word is implicitly its own synonym
            synonyms.add(synonym);
          }
        }
      }

      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = synonyms.first();
        else
          synonyms.toArray(syns);
        word2Syns.put(word, syns);
      }
    }

    return word2Syns;
  }

  /**
   * Rewrites the index so that all keys and synonym entries share character
   * storage: all words are concatenated into one large string and replaced by
   * memory-overlaid substrings of it, cutting memory consumption roughly 3x.
   * NOTE: the substring-sharing trick only pays off on JDKs where
   * String.substring shares the backing char[]; it is harmless otherwise.
   */
  private HashMap<String,String[]> optimize(HashMap<String,String[]> word2Syns,
      HashMap<String,String> internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i = 0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }

    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator<Map.Entry<String,String[]>> iter = word2Syns.entrySet().iterator();
    for (int j = 0; j < size; j++) {
      Map.Entry<String,String[]> entry = iter.next();
      allSynonyms[j] = entry.getValue();
      words[j] = entry.getKey();
      len += words[j].length();
    }

    // assemble large string containing all words
    StringBuilder buf = new StringBuilder(len);
    for (int j = 0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null; // help gc

    // intern words at app level via memory-overlaid substrings
    for (int p = 0, j = 0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }

    // replace words with interned words
    for (int j = 0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k = syns.length; --k >= 0; ) {
        syns[k] = internedWords.get(syns[k]);
      }
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), syns);
    }

    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i = 0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }

  // the following utility methods below are copied from
  // Apache style Nux library - see http://dsd.lbl.gov/nux

  /**
   * Reads the entire stream into a byte array, then closes the stream
   * (even on failure). Grows the output buffer geometrically as needed.
   *
   * @param input the stream to drain; always closed on return
   * @return exactly the bytes read, trimmed to length
   * @throws IOException if reading the stream fails
   */
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      input.close();
    }
  }

}