/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.cf.taste.impl.model.hbase; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.client.Delete; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HTableFactory; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.client.HTablePool; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.filter.FilterList; import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter; import org.apache.hadoop.hbase.filter.KeyOnlyFilter; import org.apache.hadoop.hbase.util.Bytes; import org.apache.mahout.cf.taste.common.NoSuchItemException; import org.apache.mahout.cf.taste.common.NoSuchUserException; import org.apache.mahout.cf.taste.common.Refreshable; import org.apache.mahout.cf.taste.common.TasteException; import 
org.apache.mahout.cf.taste.impl.common.FastIDSet; import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; import org.apache.mahout.cf.taste.impl.model.GenericItemPreferenceArray; import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; import org.apache.mahout.cf.taste.model.DataModel; import org.apache.mahout.cf.taste.model.PreferenceArray; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.SortedMap; /** * <p>Naive approach of storing one preference as one value in the table. * Preferences are indexed as (user, item) and (item, user) for O(1) lookups.</p> * * <p>The default table name is "taste"; this can be changed through a constructor * argument. Each row key starts with "i" or "u", followed by the * actual id encoded as a big endian long.</p> * * <p>E.g., "u\x00\x00\x00\x00\x00\x00\x04\xd2" is user 1234L</p> * * <p>There are two column families: "users" and "items".</p> * * <p>The "users" column family is stored on item rows and holds item->user preferences. Each userID is the * column qualifier and the value is the preference.</p> * * <p>The "items" column family is stored on user rows and holds user->item preferences. Each itemID is the * column qualifier and the value is the preference.</p> * * <p>User IDs and item IDs are cached in a FastIDSet since it requires a full * table scan to build these sets. 
Preferences are not cached since they * are pretty cheap lookups in HBase (also caching the Preferences defeats * the purpose of a scalable storage engine like HBase).</p> */ public final class HBaseDataModel implements DataModel, Closeable { private static final Logger log = LoggerFactory.getLogger(HBaseDataModel.class); private static final String DEFAULT_TABLE = "taste"; private static final byte[] USERS_CF = Bytes.toBytes("users"); private static final byte[] ITEMS_CF = Bytes.toBytes("items"); private final HTablePool pool; private final String tableName; // Cache of user and item ids private volatile FastIDSet itemIDs; private volatile FastIDSet userIDs; public HBaseDataModel(String zkConnect) throws IOException { this(zkConnect, DEFAULT_TABLE); } public HBaseDataModel(String zkConnect, String tableName) throws IOException { log.info("Using HBase table {}", tableName); Configuration conf = HBaseConfiguration.create(); conf.set("hbase.zookeeper.quorum", zkConnect); HTableFactory tableFactory = new HTableFactory(); this.pool = new HTablePool(conf, 8, tableFactory); this.tableName = tableName; bootstrap(conf); // Warm the cache refresh(null); } public HBaseDataModel(HTablePool pool, String tableName, Configuration conf) throws IOException { log.info("Using HBase table {}", tableName); this.pool = pool; this.tableName = tableName; bootstrap(conf); // Warm the cache refresh(null); } public String getTableName() { return tableName; } /** * Create the table if it doesn't exist */ private void bootstrap(Configuration conf) throws IOException { HTableDescriptor tDesc = new HTableDescriptor(Bytes.toBytes(tableName)); tDesc.addFamily(new HColumnDescriptor(USERS_CF)); tDesc.addFamily(new HColumnDescriptor(ITEMS_CF)); try (HBaseAdmin admin = new HBaseAdmin(conf)) { admin.createTable(tDesc); log.info("Created table {}", tableName); } } /** * Prefix a user id with "u" and convert to byte[] */ private static byte[] userToBytes(long userID) { ByteBuffer bb = 
ByteBuffer.allocate(9); bb.put((byte)0x75); // The letter "u" bb.putLong(userID); return bb.array(); } /** * Prefix an item id with "i" and convert to byte[] */ private static byte[] itemToBytes(long itemID) { ByteBuffer bb = ByteBuffer.allocate(9); bb.put((byte)0x69); // The letter "i" bb.putLong(itemID); return bb.array(); } /** * Extract the id out of a prefix byte[] id */ private static long bytesToUserOrItemID(byte[] ba) { ByteBuffer bb = ByteBuffer.wrap(ba); return bb.getLong(1); } /* DataModel interface */ @Override public LongPrimitiveIterator getUserIDs() { return userIDs.iterator(); } @Override public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { Result result; try { HTableInterface table = pool.getTable(tableName); Get get = new Get(userToBytes(userID)); get.addFamily(ITEMS_CF); result = table.get(get); table.close(); } catch (IOException e) { throw new TasteException("Failed to retrieve user preferences from HBase", e); } if (result.isEmpty()) { throw new NoSuchUserException(userID); } SortedMap<byte[], byte[]> families = result.getFamilyMap(ITEMS_CF); PreferenceArray prefs = new GenericUserPreferenceArray(families.size()); prefs.setUserID(0, userID); int i = 0; for (Map.Entry<byte[], byte[]> entry : families.entrySet()) { prefs.setItemID(i, Bytes.toLong(entry.getKey())); prefs.setValue(i, Bytes.toFloat(entry.getValue())); i++; } return prefs; } @Override public FastIDSet getItemIDsFromUser(long userID) throws TasteException { Result result; try { HTableInterface table = pool.getTable(tableName); Get get = new Get(userToBytes(userID)); get.addFamily(ITEMS_CF); result = table.get(get); table.close(); } catch (IOException e) { throw new TasteException("Failed to retrieve item IDs from HBase", e); } if (result.isEmpty()) { throw new NoSuchUserException(userID); } SortedMap<byte[],byte[]> families = result.getFamilyMap(ITEMS_CF); FastIDSet ids = new FastIDSet(families.size()); for (byte[] family : families.keySet()) { 
ids.add(Bytes.toLong(family)); } return ids; } @Override public LongPrimitiveIterator getItemIDs() { return itemIDs.iterator(); } @Override public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { Result result; try { HTableInterface table = pool.getTable(tableName); Get get = new Get(itemToBytes(itemID)); get.addFamily(USERS_CF); result = table.get(get); table.close(); } catch (IOException e) { throw new TasteException("Failed to retrieve item preferences from HBase", e); } if (result.isEmpty()) { throw new NoSuchItemException(itemID); } SortedMap<byte[], byte[]> families = result.getFamilyMap(USERS_CF); PreferenceArray prefs = new GenericItemPreferenceArray(families.size()); prefs.setItemID(0, itemID); int i = 0; for (Map.Entry<byte[], byte[]> entry : families.entrySet()) { prefs.setUserID(i, Bytes.toLong(entry.getKey())); prefs.setValue(i, Bytes.toFloat(entry.getValue())); i++; } return prefs; } @Override public Float getPreferenceValue(long userID, long itemID) throws TasteException { Result result; try { HTableInterface table = pool.getTable(tableName); Get get = new Get(userToBytes(userID)); get.addColumn(ITEMS_CF, Bytes.toBytes(itemID)); result = table.get(get); table.close(); } catch (IOException e) { throw new TasteException("Failed to retrieve user preferences from HBase", e); } if (result.isEmpty()) { throw new NoSuchUserException(userID); } if (result.containsColumn(ITEMS_CF, Bytes.toBytes(itemID))) { return Bytes.toFloat(result.getValue(ITEMS_CF, Bytes.toBytes(itemID))); } else { return null; } } @Override public Long getPreferenceTime(long userID, long itemID) throws TasteException { Result result; try { HTableInterface table = pool.getTable(tableName); Get get = new Get(userToBytes(userID)); get.addColumn(ITEMS_CF, Bytes.toBytes(itemID)); result = table.get(get); table.close(); } catch (IOException e) { throw new TasteException("Failed to retrieve user preferences from HBase", e); } if (result.isEmpty()) { throw new 
NoSuchUserException(userID); } if (result.containsColumn(ITEMS_CF, Bytes.toBytes(itemID))) { KeyValue kv = result.getColumnLatest(ITEMS_CF, Bytes.toBytes(itemID)); return kv.getTimestamp(); } else { return null; } } @Override public int getNumItems() { return itemIDs.size(); } @Override public int getNumUsers() { return userIDs.size(); } @Override public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { PreferenceArray prefs = getPreferencesForItem(itemID); return prefs.length(); } @Override public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { Result[] results; try { HTableInterface table = pool.getTable(tableName); List<Get> gets = new ArrayList<>(2); gets.add(new Get(itemToBytes(itemID1))); gets.add(new Get(itemToBytes(itemID2))); gets.get(0).addFamily(USERS_CF); gets.get(1).addFamily(USERS_CF); results = table.get(gets); table.close(); } catch (IOException e) { throw new TasteException("Failed to retrieve item preferences from HBase", e); } if (results[0].isEmpty()) { throw new NoSuchItemException(itemID1); } if (results[1].isEmpty()) { throw new NoSuchItemException(itemID2); } // First item Result result = results[0]; SortedMap<byte[], byte[]> families = result.getFamilyMap(USERS_CF); FastIDSet idSet1 = new FastIDSet(families.size()); for (byte[] id : families.keySet()) { idSet1.add(Bytes.toLong(id)); } // Second item result = results[1]; families = result.getFamilyMap(USERS_CF); FastIDSet idSet2 = new FastIDSet(families.size()); for (byte[] id : families.keySet()) { idSet2.add(Bytes.toLong(id)); } return idSet1.intersectionSize(idSet2); } @Override public void setPreference(long userID, long itemID, float value) throws TasteException { try { HTableInterface table = pool.getTable(tableName); List<Put> puts = new ArrayList<>(2); puts.add(new Put(userToBytes(userID))); puts.add(new Put(itemToBytes(itemID))); puts.get(0).add(ITEMS_CF, Bytes.toBytes(itemID), Bytes.toBytes(value)); puts.get(1).add(USERS_CF, 
Bytes.toBytes(userID), Bytes.toBytes(value)); table.put(puts); table.close(); } catch (IOException e) { throw new TasteException("Failed to store preference in HBase", e); } } @Override public void removePreference(long userID, long itemID) throws TasteException { try { HTableInterface table = pool.getTable(tableName); List<Delete> deletes = new ArrayList<>(2); deletes.add(new Delete(userToBytes(userID))); deletes.add(new Delete(itemToBytes(itemID))); deletes.get(0).deleteColumns(ITEMS_CF, Bytes.toBytes(itemID)); deletes.get(1).deleteColumns(USERS_CF, Bytes.toBytes(userID)); table.delete(deletes); table.close(); } catch (IOException e) { throw new TasteException("Failed to remove preference from HBase", e); } } @Override public boolean hasPreferenceValues() { return true; } @Override public float getMaxPreference() { throw new UnsupportedOperationException(); } @Override public float getMinPreference() { throw new UnsupportedOperationException(); } /* Closeable interface */ @Override public void close() throws IOException { pool.close(); } /* Refreshable interface */ @Override public void refresh(Collection<Refreshable> alreadyRefreshed) { if (alreadyRefreshed == null || !alreadyRefreshed.contains(this)) { try { log.info("Refreshing item and user ID caches"); long t1 = System.currentTimeMillis(); refreshItemIDs(); refreshUserIDs(); long t2 = System.currentTimeMillis(); log.info("Finished refreshing caches in {} ms", t2 - t1); } catch (IOException e) { throw new IllegalStateException("Could not reload DataModel", e); } } } /* * Refresh the item id cache. 
Warning: this does a large table scan */ private synchronized void refreshItemIDs() throws IOException { // Get the list of item ids HTableInterface table = pool.getTable(tableName); Scan scan = new Scan(new byte[]{0x69}, new byte[]{0x70}); scan.setFilter(new FilterList(FilterList.Operator.MUST_PASS_ALL, new KeyOnlyFilter(), new FirstKeyOnlyFilter())); ResultScanner scanner = table.getScanner(scan); Collection<Long> ids = new LinkedList<>(); for (Result result : scanner) { ids.add(bytesToUserOrItemID(result.getRow())); } table.close(); // Copy into FastIDSet FastIDSet itemIDs = new FastIDSet(ids.size()); for (long l : ids) { itemIDs.add(l); } // Swap with the active this.itemIDs = itemIDs; } /* * Refresh the user id cache. Warning: this does a large table scan */ private synchronized void refreshUserIDs() throws IOException { // Get the list of user ids HTableInterface table = pool.getTable(tableName); Scan scan = new Scan(new byte[]{0x75}, new byte[]{0x76}); scan.setFilter(new FilterList(FilterList.Operator.MUST_PASS_ALL, new KeyOnlyFilter(), new FirstKeyOnlyFilter())); ResultScanner scanner = table.getScanner(scan); Collection<Long> ids = new LinkedList<>(); for (Result result : scanner) { ids.add(bytesToUserOrItemID(result.getRow())); } table.close(); // Copy into FastIDSet FastIDSet userIDs = new FastIDSet(ids.size()); for (long l : ids) { userIDs.add(l); } // Swap with the active this.userIDs = userIDs; } }