LastfmDataConverter.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.minhash;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

public final class LastfmDataConverter {

  private static final Pattern TAB_PATTERN = Pattern.compile("\t");

  // we are clustering similar featureIdxs on the following dataset
  // http://www.iua.upf.es/~ocelma/MusicRecommendationDataset/index.html
  //
  // Preparation of the data set means gettting the dataset to a format which
  // can
  // be read by the min hash algorithm;
  //
  enum Lastfm {
    USERS_360K(17559530),
    USERS_1K(19150868);
    private final int totalRecords;
    Lastfm(int totalRecords) {
      this.totalRecords = totalRecords;
    }
    int getTotalRecords() {
      return totalRecords;
    }
  }

  private LastfmDataConverter() {
  }

  private static String usedMemory() {
    Runtime runtime = Runtime.getRuntime();
    return "Used Memory: [" + (runtime.totalMemory() - runtime.freeMemory()) / (1024 * 1024) + " MB] ";
  }

  /* Get the feature from the parsed record */
  private static String getFeature(String[] fields, Lastfm dataSet) {
    if (dataSet == Lastfm.USERS_360K) {
      return fields[0];
    } else {
      return fields[2];
    }
  }

  /* Get the item from the parsed record */
  private static String getItem(String[] fields, Lastfm dataSet) {
    if (dataSet == Lastfm.USERS_360K) {
      return fields[2];
    } else {
      return fields[0];
    }
  }

  /**
   * Reads the LastFm dataset and constructs a Map of (item, features). For 360K
   * Users dataset - (Item=Artist, Feature=User) For 1K Users dataset -
   * (Item=User, Feature=Artist)
   * 
   * @param inputFile
   *          Lastfm dataset file on the local file system.
   * @param dataSet
   *          Type of dataset - 360K Users or 1K Users
   */
  public static Map<String, List<Integer>> convertToItemFeatures(String inputFile, Lastfm dataSet) throws IOException {
    long totalRecords = dataSet.getTotalRecords();
    Map<String, Integer> featureIdxMap = Maps.newHashMap();
    Map<String, List<Integer>> itemFeaturesMap = Maps.newHashMap();
    String msg = usedMemory() + "Converting data to internal vector format: ";
    BufferedReader br = Files.newReader(new File(inputFile), Charsets.UTF_8);
    try {
      System.out.print(msg);
      int prevPercentDone = 1;
      double percentDone = 0.0;
      long parsedRecords = 0;
      String line;
      while ((line = br.readLine()) != null) {
        String[] fields = TAB_PATTERN.split(line);
        String feature = getFeature(fields, dataSet);
        String item = getItem(fields, dataSet);
        // get the featureIdx
        Integer featureIdx = featureIdxMap.get(feature);
        if (featureIdx == null) {
          featureIdx = featureIdxMap.size() + 1;
          featureIdxMap.put(feature, featureIdx);
        }
        // add it to the corresponding feature idx map
        List<Integer> features = itemFeaturesMap.get(item);
        if (features == null) {
          features = Lists.newArrayList();
          itemFeaturesMap.put(item, features);
        }
        features.add(featureIdx);
        parsedRecords++;
        // Update the progress
        percentDone = parsedRecords * 100.0 / totalRecords;
        msg = usedMemory() + "Converting data to internal vector format: ";
        if (percentDone > prevPercentDone) {
          System.out.print('\r' + msg + percentDone + '%');
          prevPercentDone++;
        }
        parsedRecords++;
      }
      msg = usedMemory() + "Converting data to internal vector format: ";
      System.out.print('\r' + msg + percentDone + "% Completed\n");
    } finally {
      Closeables.closeQuietly(br);
    }
    return itemFeaturesMap;
  }

  /**
   * Converts each record in (item,features) map into Mahout vector format and
   * writes it into sequencefile for minhash clustering
   */
  public static boolean writeToSequenceFile(Map<String, List<Integer>> itemFeaturesMap, Path outputPath)
    throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(outputPath.getParent());
    long totalRecords = itemFeaturesMap.size();
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputPath, Text.class, VectorWritable.class);
    try {
      String msg = "Now writing vectorized data in sequence file format: ";
      System.out.print(msg);

      Text itemWritable = new Text();
      VectorWritable featuresWritable = new VectorWritable();

      int doneRecords = 0;
      int prevPercentDone = 1;

      for (Map.Entry<String, List<Integer>> itemFeature : itemFeaturesMap.entrySet()) {
        int numfeatures = itemFeature.getValue().size();
        itemWritable.set(itemFeature.getKey());
        Vector featureVector = new SequentialAccessSparseVector(numfeatures);
        int i = 0;
        for (Integer feature : itemFeature.getValue()) {
          featureVector.setQuick(i++, feature);
        }
        featuresWritable.set(featureVector);
        writer.append(itemWritable, featuresWritable);
        // Update the progress
        double percentDone = ++doneRecords * 100.0 / totalRecords;
        if (percentDone > prevPercentDone) {
          System.out.print('\r' + msg + percentDone + "% " + (percentDone >= 100 ? "Completed\n" : ""));
          prevPercentDone++;
        }
      }
    } finally {
      Closeables.closeQuietly(writer);
    }
    return true;
  }

  public static void main(String[] args) throws Exception {
    if (args.length < 3) {
      System.out.println("[Usage]: LastfmDataConverter <input> <output> <dataset>");
      System.out.println("   <input>: Absolute path to the local file [usersha1-artmbid-artname-plays.tsv] ");
      System.out.println("  <output>: Absolute path to the HDFS output file");
      System.out.println(" <dataset>: Either of the two Lastfm public datasets. "
          + "Must be either 'Users360K' or 'Users1K'");
      System.out.println("Note:- Hadoop configuration pointing to HDFS namenode should be in classpath");
      return;
    }
    Lastfm dataSet = Lastfm.valueOf(args[2]);
    Map<String, List<Integer>> itemFeatures = convertToItemFeatures(args[0], dataSet);
    if (itemFeatures.isEmpty()) {
      throw new IllegalStateException("Error converting the data file: [" + args[0] + ']');
    }
    Path output = new Path(args[1]);
    boolean status = writeToSequenceFile(itemFeatures, output);
    if (status) {
      System.out.println("Data converted and written successfully to HDFS location: [" + output + ']');
    } else {
      System.err.println("Error writing the converted data to HDFS location: [" + output + ']');
    }
  }
}