/* * Copyright (c) 2011 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.flaptor.indextank.suggest; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Scanner; import org.apache.log4j.Logger; import com.flaptor.indextank.index.DocId; import com.flaptor.indextank.index.scorer.Boosts; import com.flaptor.indextank.index.scorer.DynamicDataManager; import com.flaptor.indextank.index.storage.InMemoryStorage; import com.flaptor.util.Execute; import com.flaptor.util.FunctionUtils; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; class NewPopularityIndex { private static final Logger logger = Logger.getLogger(Execute.whoAmI()); private static final String MAIN_FILE_NAME = "autocompleteTerms"; private static final int MAX_SUGGESTIONS = 5; private final File backupDir; private Node root; private int nodeCount = 0; private int termCount = 0; private int totalCount = 0; @SuppressWarnings("deprecation") public NewPopularityIndex(File backupDir) throws IOException { this.backupDir = backupDir; this.root = new Node("",0); File termsFile = new File(backupDir, MAIN_FILE_NAME); File oldFormatFile = new File(backupDir, PopularityIndex.MAIN_FILE_NAME); if (!termsFile.exists() && oldFormatFile.exists()) { logger.info("Found old format popularity index file. Converting to new format."); PopularityIndex old = new PopularityIndex(backupDir, true); old.writeNewFormat(termsFile); logger.info("Saved new format file"); } if (termsFile.exists()) { logger.info("Loading popularity index terms from disk."); loadTerms(termsFile); logger.info("Terms loaded"); } this.addTerm("text:"); } private void loadTerms(File termsFile) throws IOException { DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(termsFile))); while (dis.available() > 0) { String str = dis.readUTF(); int c = dis.readInt(); this.incrementTermCount(str, c); if (logger.isDebugEnabled()) { logger.debug("Loaded " + str + " " + c); } } } private synchronized void incrementTermCount(String str, int c) { root.add(str, c, this); totalCount += c; } /** * Counts how many occurrences of {@code term} we've seen. * * @param term The term to count occurrences. Never {@code null} * @return an int indicating how many times we saw {@code term}. 0 for never. */ public int getCount(String term){ Preconditions.checkNotNull(term); Node node = root.find(term); if (node == null) return 0; // if there's a matching node with the same length // return it's count if (node.len == term.length()) return node.count; // else return 0; } public List<String> getMostPopular(String prefix) { Node node = root.find(prefix); if (node == null) { return ImmutableList.of(); } List<Node> best = Lists.newArrayList(node.best); Collections.sort(best, new Comparator<Node>() { public int compare(Node o1, Node o2) { return o2.count - o1.count; } }); if (best.size() > MAX_SUGGESTIONS) { best = best.subList(0, MAX_SUGGESTIONS); } return Lists.transform(best, FunctionUtils.getToString()); } public void addTerm(String term) { if (isAscii(term)) { incrementTermCount(term, 1); } } private boolean isAscii(String term) { for (int i = 0; i < term.length(); i++) { if (term.charAt(i) > 127) { return false; } } return true; } public void dump() throws FileNotFoundException, IOException { logger.info("Dumping PopularityIndex terms file."); File termsFile = new File(backupDir, MAIN_FILE_NAME); DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(termsFile))); try { dumpNode(root, dos); } finally { Execute.close(dos); } logger.info("PopularityIndex dumped to disk."); } private static void dumpNode(Node node, DataOutputStream dos) throws IOException { if (node.count > 0) { dos.writeUTF(node.toString()); dos.writeInt(node.count); if (logger.isDebugEnabled()) { logger.debug("Dumping " + node.toString() + " " + node.count); } } for (Node child : node.children) { dumpNode(child, dos); } } private static class Node { String str; int len; int count; Node[] children; List<Node> best = Lists.newArrayListWithCapacity(MAX_SUGGESTIONS); Node(String str, int count) { this(str, str.length(), count, new Node[0]); best.add(this); } Node(String str, int len, int count, Node[] chl) { this.str = str; this.len = len; this.count = count; this.children = chl; } /** * Adds ncount to the count of nstr. And checks * if the best list should be updated * @param newPopularityIndex */ Node add(String nstr, int ncount, NewPopularityIndex index) { Node node = this.insert(nstr, ncount, index); this.offerBestCandidate(node); return node; } /** * Increments nstr's count by ncount, creating * the necessary nodes. * @param index */ Node insert(String nstr, int ncount, NewPopularityIndex index) { if (nstr.length() == len) { // current nodes maches nstr, increment and return this.count += ncount; return this; } int p = len; char c = nstr.charAt(p); int i; for (i = 0; i < children.length; i++) { Node n = children[i]; char nc = n.str.charAt(p); if (nc == c) { // first char matches, insert at matching node return insertAt(i, n, nstr, ncount, index); } if (nc > c) { break; } } // all smaller first chars have been skipped // insert a new node at i Node[] nchildren = new Node[children.length + 1]; Node newn = new Node(nstr, ncount); index.termCount++; index.nodeCount++; index.totalCount += ncount; System.arraycopy(children, 0, nchildren, 0, i); nchildren[i] = newn; System.arraycopy(children, i, nchildren, i+1, children.length - i); children = nchildren; return newn; } private Node insertAt(int insert, Node n, String nstr, int ncount, NewPopularityIndex index) { int p = len; int minlen = Math.min(nstr.length(), n.len); // find the first non matching character betwen n and nstr while (p < minlen && nstr.charAt(p) == n.str.charAt(p)) { p++; } if (p == n.len) { // n is a prefix or equal to nstr // propagate it until the proper node // is found or created return n.add(nstr, ncount, index); } else if (p == nstr.length()) { // nstr is a prefix of n create a new node // for nstr and insert it between this and n Node newn = new Node(nstr, nstr.length(), ncount, new Node[] {n}); index.nodeCount++; index.termCount++; index.totalCount += ncount; // replace n with the new node children[insert] = newn; return newn; } else { // there a partial match between n and nstr // a new node for the matching part should be // created with both n and nstr as its children Node split; Node newn = new Node(nstr, ncount); index.nodeCount++; index.termCount++; index.totalCount += ncount; if (nstr.charAt(p) > n.str.charAt(p)) { // n is smaller than nstr split = new Node(nstr, p, 0, new Node[] {n, newn} ); index.nodeCount++; } else { // n is greater than nstr split = new Node(nstr, p, 0, new Node[] {newn, n} ); index.nodeCount++; } split.best.addAll(n.best); split.offerBestCandidate(n); split.offerBestCandidate(newn); // replace n with the new split node children[insert] = split; return newn; } } @Override public String toString() { return str.substring(0, len); } /** * Finds a node for the given prefix * If none is found, returns null */ private Node find(String prefix) { Node[] chl = children; if (prefix.length() <= len) { if (str.startsWith(prefix)) { return this; } } else if (chl.length > 0) { char x = prefix.charAt(len); int lo = 0; int hi = chl.length; while (hi - lo > 1) { int m = (lo+hi)/2; char cm = chl[m].str.charAt(len); if (cm > x) { hi = m; } else { lo = m; } } Node candidate = chl[lo]; if (candidate.str.charAt(len) == x) { return candidate.find(prefix); } } return null; } private Node find(char next) { Node[] chl = children; if (chl.length > 0) { int lo = 0; int hi = chl.length; while (hi - lo > 1) { int m = (lo+hi)/2; char cm = chl[m].str.charAt(len); if (cm > next) { hi = m; } else { lo = m; } } Node candidate = chl[lo]; if (candidate.str.charAt(len) == next) { return candidate; } } return null; } /** * Offer the given node as possible candidate for the best * suggestions list. */ public void offerBestCandidate(Node n) { // ignore this node and countless nodes best.remove(n); if (n.count > 0) { if (best.size() == MAX_SUGGESTIONS) { // swap nodes with worse ones until // in the end the worst one will be left out for (int i = 0; i < best.size(); i++) { if (best.get(i).count < n.count) { Node t = n; n = best.get(i); best.set(i, t); } } } else { // still not enough suggestions this.best.add(n); } } } } public static void main(String[] args) throws IOException { File dir = new File(args[0]); int bc = Integer.parseInt(args[1]); NewPopularityIndex index = new NewPopularityIndex(dir); InMemoryStorage ims = new InMemoryStorage(dir, true); DynamicDataManager ddm = new DynamicDataManager(bc, dir); Scanner in = new Scanner(System.in); while (in.hasNextLine()) { String line = in.nextLine(); if (line.startsWith("get ")) { String idStr = line.substring(4); DocId docId = new DocId(idStr); System.out.println(ims.getDocument(idStr)); Boosts boosts = ddm.getBoosts(docId); System.out.println("timestamp: " + boosts.getTimestamp()); for (int i = 0; i < bc; i++) { System.out.println("var["+i+"]: " + boosts.getBoost(i)); } System.out.println(ddm.getCategoryValues(docId)); } else { List<String> suggestions = index.getMostPopular(line); for (String sugg : suggestions) { System.out.print(" * "); System.out.println(sugg); } } } } public static class PopularityIndexAutomaton extends Automaton { public static class State implements Automaton.State { private Node innerNode; private int position; public State(Node node, int position) { this.innerNode = node; this.position = position; } @Override public Iterable<Automaton.Transition> getTransitions() { if (innerNode.len == position) { return Iterables.transform(Lists.newArrayList(innerNode.children), new Function<Node, Automaton.Transition>() { @Override public Automaton.Transition apply(Node node) { return new Transition(node, node.str.charAt(State.this.innerNode.len), State.this.innerNode.len + 1); } }); } else { return Sets.<Automaton.Transition>newHashSet(new Transition(innerNode, innerNode.str.charAt(position), position + 1)); } } @Override public boolean isAccept() { return innerNode.count > 0 && innerNode.len == position; } @Override public com.flaptor.indextank.suggest.Automaton.State step(char symbol) { if (innerNode.len != position) { if (innerNode.str.charAt(position) == symbol) { return new State(innerNode, position + 1); } else { return null; } } else { Node nextNode = innerNode.find(symbol); if (nextNode == null) { return null; } else { return new State(nextNode, position + 1); } } } } public static class Transition implements Automaton.Transition { private Node destination; private char symbol; private int offset; public Transition(Node destination, char symbol, int offset) { this.destination = destination; this.symbol = symbol; this.offset = offset; } @Override public com.flaptor.indextank.suggest.Automaton.State getState() { return new State(destination, offset); } @Override public char getSymbol() { return symbol; } } public static PopularityIndexAutomaton adapt(NewPopularityIndex innerIndex) { return new PopularityIndexAutomaton(new State(innerIndex.root.find("text:"), 5)); } private PopularityIndexAutomaton(State startState) { super(startState); } } public Map<String, String> getStats() { Map<String, String> stats = Maps.newHashMap(); stats.put("autocomplete_nodes", String.valueOf(nodeCount)); stats.put("autocomplete_terms", String.valueOf(termCount)); stats.put("autocomplete_total_count", String.valueOf(totalCount)); return stats; } }