package textsearch;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeSet;
/**
 * A small in-memory inverted index over {@link TestDocument} instances.
 *
 * <p>{@link #add(TestDocument)} tokenizes a document's lower-cased text and records,
 * for every distinct word, an {@link IdCount} holding the document id and a
 * length-normalized score. {@link #search(String, int)} sums those scores over all
 * query words and returns document ids, best score first.
 */
public class InMemorySearch {
    /** Scale factor used when normalizing word scores by document length. */
    private static final int MAX_WORDS_PER_DOCUMENT = 1000;

    /** Separator characters shared by indexing and query parsing. */
    private static final String DELIMITERS = " .,;:!";

    /** Maps each indexed word to the per-document scores recorded for it. */
    private final Map<String,TreeSet<IdCount>> index =
        new Hashtable<String, TreeSet<IdCount>>();

    /** Demo driver: indexes three sample documents, dumps the index, runs one query. */
    public static void main(String[] args) {
        InMemorySearch ims = new InMemorySearch();
        TestDocument doc1 = new TestDocument("This is a test for index and a test for search.");
        ims.add(doc1);
        TestDocument doc2 = new TestDocument("Please test the index code.");
        ims.add(doc2);
        TestDocument doc3 = new TestDocument("Please test the index code before tomorrow.");
        ims.add(doc3);
        ims.debug();
        List<Integer> search_results = ims.search("test index", 2);
        System.out.println("\nSearch results (doc IDs): " + search_results);
    }

    /**
     * Adds a document to the index. Documents that contain no tokens are ignored.
     *
     * <p>Each occurrence of a word contributes {@code MAX_WORDS_PER_DOCUMENT / num_words}
     * (integer division); the accumulated value is then scaled by
     * {@code MAX_WORDS_PER_DOCUMENT / num_words} once more before it is stored.
     * NOTE: because of the integer division, a document with more than
     * {@code MAX_WORDS_PER_DOCUMENT} tokens gets a score of zero for every word.
     *
     * @param document the document to index; its {@code text} is lower-cased first
     */
    public void add(TestDocument document) {
        Map<String,Integer> wcount = new Hashtable<String,Integer>();
        StringTokenizer st = new StringTokenizer(document.text.toLowerCase(), DELIMITERS);
        int num_words = st.countTokens();
        if (num_words == 0) {
            return;
        }
        final int perOccurrence = MAX_WORDS_PER_DOCUMENT / num_words;
        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            // Trace output of every token; kept because existing runs print it.
            System.out.println(word);
            Integer soFar = wcount.get(word);
            wcount.put(word, (soFar == null ? 0 : soFar.intValue()) + perOccurrence);
        }
        for (Map.Entry<String,Integer> entry : wcount.entrySet()) {
            String word = entry.getKey();
            TreeSet<IdCount> postings = index.get(word);
            if (postings == null) {
                postings = new TreeSet<IdCount>();
                index.put(word, postings);
            }
            int score = entry.getValue() * MAX_WORDS_PER_DOCUMENT / num_words;
            postings.add(new IdCount(document.id, score));
        }
    }

    /**
     * Searches with a default limit of 10 results.
     *
     * @param search_terms query text, split on the same delimiters used for indexing
     * @return matching document ids, best score first
     */
    public List<Integer> search(String search_terms) {
        return search(search_terms, 10);
    }

    /**
     * Returns the ids of the documents that best match the query.
     *
     * <p>A document's score is the sum of its stored scores for every query word.
     * Query words that were never indexed contribute nothing; previously they
     * caused a {@code NullPointerException}.
     *
     * @param search_terms query text; lower-cased and tokenized before lookup
     * @param max_terms maximum number of ids to return; zero or negative yields an empty list
     * @return at most {@code max_terms} document ids, highest total score first
     */
    public List<Integer> search(String search_terms, int max_terms) {
        List<Integer> ret = new ArrayList<Integer>(10);
        // Accumulates the total score per document id; sorted by score afterwards.
        final Map<Integer,Integer> ordered_results = new Hashtable<Integer,Integer>(0);
        StringTokenizer st = new StringTokenizer(search_terms.toLowerCase(), DELIMITERS);
        while (st.hasMoreTokens()) {
            String word = st.nextToken();
            TreeSet<IdCount> postings = index.get(word);
            if (postings == null) {
                // Word does not occur in any indexed document.
                continue;
            }
            for (IdCount hit : postings) {
                Integer id = hit.id;
                Integer soFar = ordered_results.get(id);
                ordered_results.put(id, (soFar == null ? 0 : soFar.intValue()) + hit.count);
            }
        }
        List<Integer> keys = new ArrayList<Integer>(ordered_results.keySet());
        Collections.sort(keys, new Comparator<Integer>() {
            public int compare(Integer a, Integer b) {
                // Operands swapped so that higher totals sort first.
                return ordered_results.get(b).compareTo(ordered_results.get(a));
            }
        });
        for (Integer id : keys) {
            if (ret.size() >= max_terms) {
                break;
            }
            ret.add(id);
        }
        return ret;
    }

    /** Prints every indexed word followed by its {@link IdCount} entries to standard output. */
    public void debug() {
        System.out.println("*** Debug: dump of search index:\n");
        for (Map.Entry<String,TreeSet<IdCount>> entry : index.entrySet()) {
            System.out.println("\n* " + entry.getKey());
            for (IdCount posting : entry.getValue()) {
                System.out.println(" " + posting);
            }
        }
    }
}
class TestDocument {
int id;
String text;
static int count = 0;
TestDocument(String text) {
this.text = text;
id = count++;
}
public String toString() {
int len = text.length();
if (len > 25) len = 25;
return "[Document id: " + id + ": " + text.substring(0,len) + "...]";
}
}
/**
 * Pairs a document id with that document's score for a single word.
 *
 * <p>Natural ordering is by {@code count} descending, then by {@code id} ascending.
 * The id tie-break matters when instances are kept in a sorted set: ordering on
 * {@code count} alone makes two different documents with equal scores compare as
 * equal, so the set would silently discard one of them.
 */
class IdCount implements Comparable<IdCount> {
    int id = 0;
    int count = 0;

    /**
     * @param k the document id
     * @param v the score associated with that document
     */
    public IdCount(int k, int v) {
        this.id = k;
        this.count = v;
    }

    @Override
    public String toString() {
        return "[IdCount: " + id + " : " + count + "]";
    }

    /**
     * Orders higher counts first; equal counts are ordered by ascending id.
     *
     * @return a negative value, zero, or a positive value as this instance sorts
     *         before, equal to, or after {@code o}
     */
    public int compareTo(IdCount o) {
        // Explicit comparisons instead of subtraction: avoids int overflow.
        if (o.count > count) return 1;
        if (o.count < count) return -1;
        // Same score: distinguish entries by document id so none is lost in sorted sets.
        if (id < o.id) return -1;
        if (id > o.id) return 1;
        return 0;
    }

    /** Two instances are equal when both id and count match, consistent with {@link #compareTo}. */
    @Override
    public boolean equals(Object other) {
        if (this == other) return true;
        if (!(other instanceof IdCount)) return false;
        IdCount that = (IdCount) other;
        return id == that.id && count == that.count;
    }

    @Override
    public int hashCode() {
        return 31 * id + count;
    }
}