/* * Copyright 2010 Bizosys Technologies Limited * * Licensed to the Bizosys Technologies Limited (Bizosys) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Bizosys licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bizosys.hsearch.dictionary; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; /** * * W. Burkhard and R. Keller. Some approaches to best-match file searching, * CACM, 1973 * * http://portal.acm.org/citation.cfm?doid=362003.362025 * * * From Wikipedia, the free encyclopedia: * * A BK-tree is a metric tree suggested by Burkhard and Keller specifically * adapted to discrete metric spaces. * * For simplicity, let us consider integer discrete metric p(x,y). Then, BK-tree * is defined in the following way. An arbitrary element a is selected as root * node. Root node may have zero or more subtrees. The k-th subtree is * recursively built of all elements b such that p(a,b)=k. BK-trees can be used * for approximate string matching in a dictionary. * * http://en.wikipedia.org/wiki/BK-tree * http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees * * To further improve performance, implement (fast) Map interface for query(). * Don't forget about fast similarity! * * http://www.dcs.shef.ac.uk/~sam/stringmetrics.html - huge list of different * metrics * * http://www.catalysoft.com/articles/StrikeAMatch.html - extremely simple algo * providing fast results (better than Levenstein) * * @author Fuad Efendi * */ public class BKTree<E> { private Node root; private Distance distance; public BKTree(Distance distance) { root = null; this.distance = distance; } public void add(E term) { if (root != null) { root.add(term); } else { root = new Node(term); } } public Map<E, Integer> query(E searchObject, int threshold) { Map<E, Integer> matches = new HashMap<E, Integer>(); root.query(searchObject, threshold, matches); return matches; } private class Node { E term; Map<Integer, Node> children; public Node(E term) { this.term = term; children = new TreeMap<Integer, Node>(); } public void add(E term) { int score = distance.getDistance(term, this.term); Node child = children.get(score); if (child != null) { child.add(term); } else { children.put(score, new Node(term)); } } public void query(E term, int threshold, Map<E, Integer> collected) { int distanceAtNode = distance.getDistance(term, this.term); if (distanceAtNode <= threshold) { collected.put(this.term, distanceAtNode); } for (int score = distanceAtNode - threshold; score <= distanceAtNode + threshold; score++) { if (score > 0) { Node child = children.get(score); if (child != null) { child.query(term, threshold, collected); } } } } } }