FileDiffStorage.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.impl.recommender.slopeone.file;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Pattern;

import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.Weighting;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import org.apache.mahout.cf.taste.recommender.slopeone.DiffStorage;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
 * <p>
 * {@link DiffStorage} which reads pre-computed diffs from a file and stores in memory. The file should have
 * one diff per line:
 * </p>
 * 
 * {@code itemID1,itemID2,diff[,count[,mk,sk]]}
 * 
 * <p>
 * The fourth column is optional, and is a count representing the number of occurrences of the item-item pair
 * that contribute to the diff. It is assumed to be 1 if not present. The fifth and sixth arguments are
 * computed values used by {@link FullRunningAverageAndStdDev} implementations to compute a running standard deviation.
 * They are required if using {@link Weighting#WEIGHTED} with {@link SlopeOneRecommender}.
 * </p>
 *
 * <p>
 * Commas or tabs can be delimiters. This is intended for use in conjuction with the output of
 * {@link org.apache.mahout.cf.taste.hadoop.slopeone.SlopeOneAverageDiffsJob}.
 * </p>
 *
 * <p>Note that the same item-item pair should not appear on multiple lines -- one line per item-item pair.</p>
 */
public final class FileDiffStorage implements DiffStorage {
  
  private static final Logger log = LoggerFactory.getLogger(FileDiffStorage.class);
  
  private static final long MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
  private static final char COMMENT_CHAR = '#';
  private static final Pattern SEPARATOR = Pattern.compile("[\t,]");

  private final File dataFile;
  private long lastModified;
  private final long maxEntries;
  private final FastByIDMap<FastByIDMap<RunningAverage>> averageDiffs;
  private final FastIDSet allRecommendableItemIDs;
  private final ReadWriteLock buildAverageDiffsLock;
  
  /**
   * @param dataFile
   *          diffs file
   * @param maxEntries
   *          maximum number of diffs to store
   * @throws FileNotFoundException
   *           if data file does not exist or is a directory
   */
  public FileDiffStorage(File dataFile, long maxEntries) throws FileNotFoundException {
    Preconditions.checkArgument(dataFile != null, "dataFile is null");
    if (!dataFile.exists() || dataFile.isDirectory()) {
      throw new FileNotFoundException(dataFile.toString());
    }
    Preconditions.checkArgument(maxEntries > 0L, "maxEntries must be positive");
    log.info("Creating FileDataModel for file {}", dataFile);
    this.dataFile = dataFile.getAbsoluteFile();
    this.lastModified = dataFile.lastModified();
    this.maxEntries = maxEntries;
    this.averageDiffs = new FastByIDMap<FastByIDMap<RunningAverage>>();
    this.allRecommendableItemIDs = new FastIDSet();
    this.buildAverageDiffsLock = new ReentrantReadWriteLock();

    buildDiffs();
  }
  
  private void buildDiffs() {
    if (buildAverageDiffsLock.writeLock().tryLock()) {
      try {

        averageDiffs.clear();
        allRecommendableItemIDs.clear();
        
        FileLineIterator iterator = new FileLineIterator(dataFile, false);
        String firstLine = iterator.peek();
        while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) {
          iterator.next();
          firstLine = iterator.peek();
        }
        long averageCount = 0L;
        while (iterator.hasNext()) {
          averageCount = processLine(iterator.next(), averageCount);
        }
        
        pruneInconsequentialDiffs();
        updateAllRecommendableItems();
        
      } catch (IOException ioe) {
        log.warn("Exception while reloading", ioe);
      } finally {
        buildAverageDiffsLock.writeLock().unlock();
      }
    }
  }
  
  private long processLine(String line, long averageCount) {

    if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
      return averageCount;
    }
    
    String[] tokens = SEPARATOR.split(line);
    Preconditions.checkArgument(tokens.length >=3 && tokens.length != 5, "Bad line: %s", line);

    long itemID1 = Long.parseLong(tokens[0]);
    long itemID2 = Long.parseLong(tokens[1]);
    double diff = Double.parseDouble(tokens[2]);
    int count = tokens.length >= 4 ? Integer.parseInt(tokens[3]) : 1;
    boolean hasMkSk = tokens.length >= 5;
    
    if (itemID1 > itemID2) {
      long temp = itemID1;
      itemID1 = itemID2;
      itemID2 = temp;
    }
    
    FastByIDMap<RunningAverage> level1Map = averageDiffs.get(itemID1);
    if (level1Map == null) {
      level1Map = new FastByIDMap<RunningAverage>();
      averageDiffs.put(itemID1, level1Map);
    }
    RunningAverage average = level1Map.get(itemID2);
    if (average != null) {
      throw new IllegalArgumentException("Duplicated line for item-item pair " + itemID1 + " / " + itemID2);
    }
    if (averageCount < maxEntries) {
      if (hasMkSk) {
        double mk = Double.parseDouble(tokens[4]);
        double sk = Double.parseDouble(tokens[5]);
        average = new FullRunningAverageAndStdDev(count, diff, mk, sk);
      } else {
        average = new FullRunningAverage(count, diff);
      }
      level1Map.put(itemID2, average);
      averageCount++;
    }

    allRecommendableItemIDs.add(itemID1);
    allRecommendableItemIDs.add(itemID2);
    
    return averageCount;
  }
  
  private void pruneInconsequentialDiffs() {
    // Go back and prune inconsequential diffs. "Inconsequential" means, here, only represented by one
    // data point, so possibly unreliable
    Iterator<Map.Entry<Long,FastByIDMap<RunningAverage>>> it1 = averageDiffs.entrySet().iterator();
    while (it1.hasNext()) {
      FastByIDMap<RunningAverage> map = it1.next().getValue();
      Iterator<Map.Entry<Long,RunningAverage>> it2 = map.entrySet().iterator();
      while (it2.hasNext()) {
        RunningAverage average = it2.next().getValue();
        if (average.getCount() <= 1) {
          it2.remove();
        }
      }
      if (map.isEmpty()) {
        it1.remove();
      } else {
        map.rehash();
      }
    }
    averageDiffs.rehash();
  }
  
  private void updateAllRecommendableItems() {
    for (Map.Entry<Long,FastByIDMap<RunningAverage>> entry : averageDiffs.entrySet()) {
      allRecommendableItemIDs.add(entry.getKey());
      LongPrimitiveIterator it = entry.getValue().keySetIterator();
      while (it.hasNext()) {
        allRecommendableItemIDs.add(it.next());
      }
    }
    allRecommendableItemIDs.rehash();
  }
  
  @Override
  public RunningAverage getDiff(long itemID1, long itemID2) {

    boolean inverted = false;
    if (itemID1 > itemID2) {
      inverted = true;
      long temp = itemID1;
      itemID1 = itemID2;
      itemID2 = temp;
    }
    
    FastByIDMap<RunningAverage> level2Map;
    try {
      buildAverageDiffsLock.readLock().lock();
      level2Map = averageDiffs.get(itemID1);
    } finally {
      buildAverageDiffsLock.readLock().unlock();
    }
    RunningAverage average = null;
    if (level2Map != null) {
      average = level2Map.get(itemID2);
    }
    if (inverted) {
      return average == null ? null : average.inverse();
    } else {
      return average;
    }
  }
  
  @Override
  public RunningAverage[] getDiffs(long userID, long itemID, PreferenceArray prefs) {
    try {
      buildAverageDiffsLock.readLock().lock();
      int size = prefs.length();
      RunningAverage[] result = new RunningAverage[size];
      for (int i = 0; i < size; i++) {
        result[i] = getDiff(prefs.getItemID(i), itemID);
      }
      return result;
    } finally {
      buildAverageDiffsLock.readLock().unlock();
    }
  }
  
  @Override
  public RunningAverage getAverageItemPref(long itemID) {
    return null; // TODO can't do this without a DataModel
  }

  @Override
  public void addItemPref(long userID, long itemIDA, float prefValue) {
    // Can't do this without a DataModel; should it just be a no-op?
    throw new UnsupportedOperationException();
  }

  @Override
  public void updateItemPref(long itemID, float prefDelta) {
    try {
      buildAverageDiffsLock.readLock().lock();
      for (Map.Entry<Long,FastByIDMap<RunningAverage>> entry : averageDiffs.entrySet()) {
        boolean matchesItemID1 = itemID == entry.getKey();
        for (Map.Entry<Long,RunningAverage> entry2 : entry.getValue().entrySet()) {
          RunningAverage average = entry2.getValue();
          if (matchesItemID1) {
            average.changeDatum(-prefDelta);
          } else if (itemID == entry2.getKey()) {
            average.changeDatum(prefDelta);
          }
        }
      }
      // RunningAverage itemAverage = averageItemPref.get(itemID);
      // if (itemAverage != null) {
      // itemAverage.changeDatum(prefDelta);
      // }
    } finally {
      buildAverageDiffsLock.readLock().unlock();
    }
  }

  @Override
  public void removeItemPref(long userID, long itemIDA, float prefValue) {
    // Can't do this without a DataModel; should it just be a no-op?
    throw new UnsupportedOperationException();
  }
  
  @Override
  public FastIDSet getRecommendableItemIDs(long userID) {
    try {
      buildAverageDiffsLock.readLock().lock();
      return allRecommendableItemIDs.clone();
    } finally {
      buildAverageDiffsLock.readLock().unlock();
    }
  }
  
  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    long mostRecentModification = dataFile.lastModified();
    if (mostRecentModification > lastModified + MIN_RELOAD_INTERVAL_MS) {
      log.debug("File has changed; reloading...");
      lastModified = mostRecentModification;
      buildDiffs();
    }
  }
  
}