// NewPopularityIndex.java example
//
// Explorer
// indextank-engine-master
/*
 * Copyright (c) 2011 LinkedIn, Inc
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.flaptor.indextank.suggest;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import org.apache.log4j.Logger;

import com.flaptor.indextank.index.DocId;
import com.flaptor.indextank.index.scorer.Boosts;
import com.flaptor.indextank.index.scorer.DynamicDataManager;
import com.flaptor.indextank.index.storage.InMemoryStorage;
import com.flaptor.util.Execute;
import com.flaptor.util.FunctionUtils;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;


class NewPopularityIndex {
    
    private static final Logger logger = Logger.getLogger(Execute.whoAmI());
    // Name of the file, inside backupDir, that terms are loaded from and dumped to.
    private static final String MAIN_FILE_NAME = "autocompleteTerms";
    // Upper bound on the suggestions returned by getMostPopular and on each node's "best" list.
    private static final int MAX_SUGGESTIONS = 5;

    // Directory that holds the terms file.
    private final File backupDir;
    // Root of the prefix tree; it stands for the empty string.
    private Node root;
    // Counters maintained while terms are inserted and reported by getStats().
    private int nodeCount = 0;
    private int termCount = 0;
    private int totalCount = 0;
    
    /**
     * Creates the index and restores the terms stored under {@code backupDir}, if any.
     * <p>
     * When the current-format terms file is missing but the file used by
     * {@code PopularityIndex} is present, the latter is first rewritten in the current
     * format. The term {@code "text:"} is added once at the end of construction.
     *
     * @param backupDir directory that holds the terms file
     * @throws IOException if the stored terms cannot be read
     */
    @SuppressWarnings("deprecation")
    public NewPopularityIndex(File backupDir) throws IOException {
        this.backupDir = backupDir;
        this.root = new Node("", 0);

        final File currentFormat = new File(backupDir, MAIN_FILE_NAME);
        final File legacyFormat = new File(backupDir, PopularityIndex.MAIN_FILE_NAME);

        final boolean onlyLegacyPresent = !currentFormat.exists() && legacyFormat.exists();
        if (onlyLegacyPresent) {
            logger.info("Found old format popularity index file. Converting to new format.");
            new PopularityIndex(backupDir, true).writeNewFormat(currentFormat);
            logger.info("Saved new format file");
        }

        // Checked again on purpose: the conversion above may just have created the file.
        if (currentFormat.exists()) {
            logger.info("Loading popularity index terms from disk.");
            loadTerms(currentFormat);
            logger.info("Terms loaded");
        }

        addTerm("text:");
    }

    /**
     * Reads (term, count) records from {@code termsFile} and adds them to the index.
     * <p>
     * Each record is a string written with {@code writeUTF} followed by an int, which is
     * the layout produced by {@link #dump()}.
     *
     * @param termsFile file to read the records from
     * @throws IOException if the file cannot be opened or a record cannot be read
     */
    private void loadTerms(File termsFile) throws IOException {
        DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(termsFile)));
        try {
            // available() is used as the end-of-file test, as in the original implementation.
            while (dis.available() > 0) {
                String str = dis.readUTF();
                int c = dis.readInt();
                this.incrementTermCount(str, c);
                if (logger.isDebugEnabled()) {
                    logger.debug("Loaded " + str + " " + c);
                }
            }
        } finally {
            // Release the file handle even when a record turns out to be unreadable.
            dis.close();
        }
    }
    
    /**
     * Adds {@code amount} occurrences of {@code term} to the tree and to the running total.
     * Synchronized, so updates made through this method do not interleave.
     */
    private synchronized void incrementTermCount(String term, int amount) {
        this.root.add(term, amount, this);
        this.totalCount = this.totalCount + amount;
    }

    /**
     * Returns the number of occurrences recorded for {@code term}.
     *
     * @param term the term to look up. Never {@code null}
     * @return how many times {@code term} was seen, or 0 if it was never seen
     */
    public int getCount(String term){
        Preconditions.checkNotNull(term);

        Node match = root.find(term);

        // find() also returns nodes that merely start with the term; only a node
        // of exactly the same length stands for the term itself
        boolean exact = match != null && match.len == term.length();
        return exact ? match.count : 0;
    }

    /**
     * Returns the most popular known terms that start with {@code prefix}.
     *
     * @param prefix the prefix to complete. Never {@code null}
     * @return at most {@code MAX_SUGGESTIONS} terms ordered by descending count;
     *         an empty list when nothing in the index starts with {@code prefix}
     */
    public List<String> getMostPopular(String prefix) {
        Preconditions.checkNotNull(prefix);

        Node node = root.find(prefix);
        if (node == null) {
            return ImmutableList.of();
        }
        // sort a copy so the node's own list is left untouched
        List<Node> best = Lists.newArrayList(node.best);
        Collections.sort(best, new Comparator<Node>() {
            public int compare(Node o1, Node o2) {
                // compare explicitly: subtracting the counts could overflow
                if (o1.count == o2.count) {
                    return 0;
                }
                return o1.count > o2.count ? -1 : 1;
            }
        });
        if (best.size() > MAX_SUGGESTIONS) {
            best = best.subList(0, MAX_SUGGESTIONS);
        }
        return Lists.transform(best, FunctionUtils.getToString());
    }
    
    /**
     * Records one more occurrence of {@code term}. Terms that contain any
     * character above 127 are silently ignored.
     *
     * @param term the term that was seen
     */
    public void addTerm(String term) {
        if (!isAscii(term)) {
            return;
        }
        incrementTermCount(term, 1);
    }
    
    /**
     * Tells whether every character of {@code term} is at most 127.
     * An empty string qualifies.
     */
    private boolean isAscii(String term) {
        final int length = term.length();
        int pos = 0;
        // advance while characters stay within the 0..127 range
        while (pos < length && term.charAt(pos) <= 127) {
            pos++;
        }
        return pos == length;
    }

    /**
     * Writes every term with a positive count to the terms file in the backup
     * directory, replacing its previous content.
     *
     * @throws FileNotFoundException if the terms file cannot be opened for writing
     * @throws IOException if writing the terms fails
     */
    public void dump() throws FileNotFoundException, IOException {
        logger.info("Dumping PopularityIndex terms file.");
        File termsFile = new File(backupDir, MAIN_FILE_NAME);
        DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(termsFile)));
        try {
            dumpNode(root, dos);
            // Flush here so a failure to write the buffered tail reaches the caller.
            // NOTE(review): Execute.close presumably swallows errors raised while
            // closing, which would hide a truncated dump - confirm.
            dos.flush();
        } finally {
            Execute.close(dos);
        }
        logger.info("PopularityIndex dumped to disk.");
    }
    
    /**
     * Writes {@code node} and then all of its descendants to {@code dos}, depth first.
     * A node is written as its string followed by its count, and only when the
     * count is positive; nodes without a count are skipped but still traversed.
     */
    private static void dumpNode(Node node, DataOutputStream dos) throws IOException {
        final int occurrences = node.count;
        if (occurrences > 0) {
            final String term = node.toString();
            dos.writeUTF(term);
            dos.writeInt(occurrences);
            if (logger.isDebugEnabled()) {
                logger.debug("Dumping " + term + " " + occurrences);
            }
        }
        final Node[] descendants = node.children;
        for (int i = 0; i < descendants.length; i++) {
            dumpNode(descendants[i], dos);
        }
    }

    /**
     * A node of the prefix tree that backs the index.
     * <p>
     * A node stands for the string {@code str.substring(0, len)}. {@code str} may be a
     * longer term that merely starts with that string, which is the case for the
     * count-less "split" nodes created when two terms share only part of a prefix.
     * Children are kept sorted by the character found at index {@code len}; the
     * lookups rely on that order for their binary search.
     */
    private static class Node {
        /** A term starting with this node's string; only the first {@code len} chars belong to the node. */
        final String str;
        /** Length of the string this node stands for. */
        final int len;
        /** Occurrences recorded for exactly this node's string; 0 when it is not a term. */
        int count;
        /** Child nodes, sorted by their character at index {@code len}. */
        Node[] children;
        /** Up to {@code MAX_SUGGESTIONS} nodes offered to this node that currently have the highest counts. */
        final List<Node> best = Lists.newArrayListWithCapacity(MAX_SUGGESTIONS);

        /**
         * Creates a childless node for the whole of {@code str}. The node is its own
         * first suggestion, unless it carries no count.
         */
        Node(String str, int count) {
            this(str, str.length(), count, new Node[0]);
            // count-less nodes (such as the root) must never show up as suggestions
            if (count > 0) {
                best.add(this);
            }
        }

        /**
         * Creates a node for the first {@code len} characters of {@code str} with the
         * given children. The suggestion list starts out empty.
         */
        Node(String str, int len, int count, Node[] chl) {
            this.str = str;
            this.len = len;
            this.count = count;
            this.children = chl;
        }

        /**
         * Adds ncount to the count of nstr and checks
         * if the best list should be updated.
         *
         * @param nstr the term to update; it must start with this node's string
         * @param ncount the amount to add to the term's count
         * @param index the index whose node and term counters are updated
         * @return the node that stands for nstr
         */
        Node add(String nstr, int ncount, NewPopularityIndex index) {
            Node node = this.insert(nstr, ncount, index);
            this.offerBestCandidate(node);
            return node;
        }

        /**
         * Increments nstr's count by ncount, creating
         * the necessary nodes.
         * <p>
         * Only the node and term counters of {@code index} are touched here; the
         * total count is maintained by the caller, so it is not added a second time.
         *
         * @param nstr the term to update; it must start with this node's string
         * @param ncount the amount to add to the term's count
         * @param index the index whose node and term counters are updated
         * @return the node that stands for nstr
         */
        Node insert(String nstr, int ncount, NewPopularityIndex index) {
            if (nstr.length() == len) {
                // current node matches nstr, increment and return
                if (count == 0 && ncount > 0) {
                    // a count-less node (split node or root) becomes a term
                    index.termCount++;
                }
                this.count += ncount;
                return this;
            }
            int p = len;
            char c = nstr.charAt(p);
            int i;
            for (i = 0; i < children.length; i++) {
                Node n = children[i];
                char nc = n.str.charAt(p);
                if (nc == c) {
                    // first char matches, insert at matching node
                    return insertAt(i, n, nstr, ncount, index);
                }
                if (nc > c) {
                    break;
                }
            }
            // all smaller first chars have been skipped
            // insert a new node at i
            Node[] nchildren = new Node[children.length + 1];
            Node newn = new Node(nstr, ncount);
            index.termCount++;
            index.nodeCount++;
            System.arraycopy(children, 0, nchildren, 0, i);
            nchildren[i] = newn;
            System.arraycopy(children, i, nchildren, i + 1, children.length - i);
            children = nchildren;
            return newn;
        }

        /**
         * Inserts nstr below this node given that the child {@code n}, found at
         * position {@code insert}, shares at least its first character with nstr.
         */
        private Node insertAt(int insert, Node n, String nstr, int ncount, NewPopularityIndex index) {
            int p = len;
            int minlen = Math.min(nstr.length(), n.len);

            // find the first non matching character between n and nstr
            while (p < minlen && nstr.charAt(p) == n.str.charAt(p)) {
                p++;
            }
            if (p == n.len) {
                // n is a prefix or equal to nstr
                // propagate it until the proper node
                // is found or created
                return n.add(nstr, ncount, index);
            }
            if (p == nstr.length()) {
                // nstr is a prefix of n: create a new node
                // for nstr and insert it between this and n
                Node newn = new Node(nstr, nstr.length(), ncount, new Node[] {n});
                index.nodeCount++;
                index.termCount++;
                // the new node takes over n's subtree, so it must also take over
                // n's suggestions; otherwise lookups for nstr would come back empty
                newn.best.addAll(n.best);
                newn.offerBestCandidate(n);
                newn.offerBestCandidate(newn);
                // replace n with the new node
                children[insert] = newn;
                return newn;
            }
            // there is a partial match between n and nstr:
            // a new count-less node for the matching part is
            // created with both n and nstr as its children
            Node newn = new Node(nstr, ncount);
            index.nodeCount++;
            index.termCount++;
            Node[] pair;
            if (nstr.charAt(p) > n.str.charAt(p)) {
                // n is smaller than nstr
                pair = new Node[] {n, newn};
            } else {
                // n is greater than nstr
                pair = new Node[] {newn, n};
            }
            Node split = new Node(nstr, p, 0, pair);
            index.nodeCount++;
            split.best.addAll(n.best);
            split.offerBestCandidate(n);
            split.offerBestCandidate(newn);
            // replace n with the new split node
            children[insert] = split;
            return newn;
        }

        /** Returns the string this node stands for. */
        @Override
        public String toString() {
            return str.substring(0, len);
        }

        /**
         * Finds the shallowest node, at or below this one, whose string starts
         * with the given prefix. If none is found, returns null.
         */
        private Node find(String prefix) {
            if (prefix.length() <= len) {
                return str.startsWith(prefix) ? this : null;
            }
            Node candidate = childStartingWith(prefix.charAt(len));
            return candidate == null ? null : candidate.find(prefix);
        }

        /**
         * Finds the child whose string continues this node's string with
         * {@code next}. If none is found, returns null.
         */
        private Node find(char next) {
            return childStartingWith(next);
        }

        /**
         * Binary search over the sorted children for the one whose character at
         * index {@code len} equals {@code x}; null when there is no such child.
         */
        private Node childStartingWith(char x) {
            Node[] chl = children;
            if (chl.length == 0) {
                return null;
            }
            int lo = 0;
            int hi = chl.length;
            // invariant: the wanted child, if present, lies in [lo, hi)
            while (hi - lo > 1) {
                int m = (lo + hi) >>> 1;
                if (chl[m].str.charAt(len) > x) {
                    hi = m;
                } else {
                    lo = m;
                }
            }
            Node candidate = chl[lo];
            return candidate.str.charAt(len) == x ? candidate : null;
        }

        /**
         * Offer the given node as possible candidate for the best
         * suggestions list. Nodes without a count are never kept.
         */
        public void offerBestCandidate(Node n) {
            // drop any previous entry for n so it is not listed twice
            best.remove(n);
            if (n.count > 0) {
                if (best.size() == MAX_SUGGESTIONS) {
                    // swap nodes with worse ones until
                    // in the end the worst one will be left out
                    for (int i = 0; i < best.size(); i++) {
                        if (best.get(i).count < n.count) {
                            Node t = n;
                            n = best.get(i);
                            best.set(i, t);
                        }
                    }
                } else {
                    // still not enough suggestions
                    this.best.add(n);
                }
            }
        }
    }

    /**
     * Interactive console for inspecting an index directory.
     * <p>
     * {@code args[0]} is the directory and {@code args[1]} the number of boosts.
     * Each line read from standard input is either {@code get <id>}, which prints
     * the stored document, its timestamp, boosts and category values, or any other
     * text, which is treated as a prefix and answered with its suggestions.
     */
    public static void main(String[] args) throws IOException {
        final File baseDir = new File(args[0]);
        final int boostCount = Integer.parseInt(args[1]);
        final NewPopularityIndex popularity = new NewPopularityIndex(baseDir);
        final InMemoryStorage storage = new InMemoryStorage(baseDir, true);
        final DynamicDataManager dynamicData = new DynamicDataManager(boostCount, baseDir);
        final Scanner stdin = new Scanner(System.in);

        while (stdin.hasNextLine()) {
            final String command = stdin.nextLine();

            if (!command.startsWith("get ")) {
                // anything that is not a "get" command is a prefix to complete
                for (String suggestion : popularity.getMostPopular(command)) {
                    System.out.print(" * ");
                    System.out.println(suggestion);
                }
                continue;
            }

            final String idStr = command.substring(4);
            final DocId docId = new DocId(idStr);
            System.out.println(storage.getDocument(idStr));
            final Boosts boosts = dynamicData.getBoosts(docId);
            System.out.println("timestamp: " + boosts.getTimestamp());
            int slot = 0;
            while (slot < boostCount) {
                System.out.println("var["+slot+"]: " + boosts.getBoost(slot));
                slot++;
            }
            System.out.println(dynamicData.getCategoryValues(docId));
        }
    }
    
    
    /**
     * Exposes the prefix tree of a {@link NewPopularityIndex} as an {@code Automaton}
     * that consumes one character per step.
     */
    public static class PopularityIndexAutomaton extends Automaton {

        /**
         * A position inside the tree: a node plus the number of characters of that
         * node's string consumed so far. While {@code position} is smaller than the
         * node's length the state lies in the middle of the node's string.
         */
        public static class State implements Automaton.State {
            private final Node innerNode;
            private final int position;

            public State(Node node, int position) {
                this.innerNode = node;
                this.position = position;
            }

            /**
             * Lists the transitions leaving this state: one per child when the node's
             * string is fully consumed, otherwise the single transition for the next
             * character of the node's string.
             */
            @Override
            public Iterable<Automaton.Transition> getTransitions() {
                if (innerNode.len != position) {
                    // still inside the node's own string: only one way forward
                    Transition onward = new Transition(innerNode, innerNode.str.charAt(position), position + 1);
                    return Sets.<Automaton.Transition>newHashSet(onward);
                }
                final int consumed = innerNode.len;
                return Iterables.transform(Lists.newArrayList(innerNode.children), new Function<Node, Automaton.Transition>() {
                    @Override
                    public Automaton.Transition apply(Node node) {
                        return new Transition(node, node.str.charAt(consumed), consumed + 1);
                    }
                });
            }

            /** Accepting states sit exactly at the end of a node that has a count. */
            @Override
            public boolean isAccept() {
                return innerNode.len == position && innerNode.count > 0;
            }

            /**
             * Consumes {@code symbol} and returns the resulting state, or null when
             * the tree has no continuation for it.
             */
            @Override
            public com.flaptor.indextank.suggest.Automaton.State step(char symbol) {
                final Node target;
                if (innerNode.len == position) {
                    // at the end of the node: move on to the matching child, if any
                    target = innerNode.find(symbol);
                } else if (innerNode.str.charAt(position) == symbol) {
                    // inside the node's string: stay on the same node
                    target = innerNode;
                } else {
                    target = null;
                }
                if (target == null) {
                    return null;
                }
                return new State(target, position + 1);
            }

        }

        /** A labelled edge to the state described by a node and an offset into its string. */
        public static class Transition implements Automaton.Transition {
            private final Node destination;
            private final char symbol;
            private final int offset;

            public Transition(Node destination, char symbol, int offset) {
                this.destination = destination;
                this.symbol = symbol;
                this.offset = offset;
            }

            /** Builds a new state for the destination node at the stored offset. */
            @Override
            public com.flaptor.indextank.suggest.Automaton.State getState() {
                return new State(destination, offset);
            }

            @Override
            public char getSymbol() {
                return symbol;
            }

        }

        /**
         * Creates an automaton over {@code innerIndex} whose start state is the node
         * found for {@code "text:"} with those five characters already consumed.
         */
        public static PopularityIndexAutomaton adapt(NewPopularityIndex innerIndex) {
            Node textNode = innerIndex.root.find("text:");
            State start = new State(textNode, 5);
            return new PopularityIndexAutomaton(start);
        }

        private PopularityIndexAutomaton(State startState) {
            super(startState);
        }

    }

    /**
     * Reports the index counters as strings.
     *
     * @return a new map with the keys {@code autocomplete_nodes},
     *         {@code autocomplete_terms} and {@code autocomplete_total_count}
     */
    public Map<String, String> getStats() {
        final Map<String, String> report = Maps.newHashMap();
        report.put("autocomplete_nodes", Integer.toString(nodeCount));
        report.put("autocomplete_terms", Integer.toString(termCount));
        report.put("autocomplete_total_count", Integer.toString(totalCount));
        return report;
    }

}