/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.minhash;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
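/**
 * Converts the public Last.fm listening datasets into Mahout's vector format
 * so they can be clustered with MinHash.
 */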
public final class LastfmDataConverter {
private static final Pattern TAB_PATTERN = Pattern.compile("\t");
  // We cluster items with similar feature sets on the following dataset:
  // http://www.iua.upf.es/~ocelma/MusicRecommendationDataset/index.html
  //
  // Preparing the dataset means converting it into a format that can be
  // read by the MinHash algorithm.
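  /**
   * The two public Last.fm datasets. The record counts below are used only
   * to report conversion progress.
   */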
enum Lastfm {
USERS_360K(17559530),
USERS_1K(19150868);
private final int totalRecords;
Lastfm(int totalRecords) {
this.totalRecords = totalRecords;
}
int getTotalRecords() {
return totalRecords;
}
}
private LastfmDataConverter() {
}
private static String usedMemory() {
Runtime runtime = Runtime.getRuntime();
return "Used Memory: [" + (runtime.totalMemory() - runtime.freeMemory()) / (1024 * 1024) + " MB] ";
}
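  /*
   * Assumed tab-separated record layouts (based on the datasets' published
   * format; verify against your copy of the files):
   *   360K users: user-sha1 \t artist-mbid \t artist-name \t plays
   *   1K users:   user-id \t timestamp \t artist-mbid \t artist-name \t ...
   * For 360K users the feature is the user (field 0) and the item the artist
   * name (field 2); for 1K users the roles are reversed.
   */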
/* Get the feature from the parsed record */
private static String getFeature(String[] fields, Lastfm dataSet) {
if (dataSet == Lastfm.USERS_360K) {
return fields[0];
} else {
return fields[2];
}
}
/* Get the item from the parsed record */
private static String getItem(String[] fields, Lastfm dataSet) {
if (dataSet == Lastfm.USERS_360K) {
return fields[2];
} else {
return fields[0];
}
}
  /**
   * Reads the Last.fm dataset and constructs a map of (item, features). For
   * the 360K users dataset: item = artist, feature = user. For the 1K users
   * dataset: item = user, feature = artist.
   *
   * @param inputFile
   *          Last.fm dataset file on the local file system
   * @param dataSet
   *          type of dataset, either 360K users or 1K users
   * @return a map from each item to the list of feature indexes seen with it
   */
public static Map<String, List<Integer>> convertToItemFeatures(String inputFile, Lastfm dataSet) throws IOException {
long totalRecords = dataSet.getTotalRecords();
Map<String, Integer> featureIdxMap = Maps.newHashMap();
Map<String, List<Integer>> itemFeaturesMap = Maps.newHashMap();
String msg = usedMemory() + "Converting data to internal vector format: ";
BufferedReader br = Files.newReader(new File(inputFile), Charsets.UTF_8);
try {
System.out.print(msg);
int prevPercentDone = 1;
double percentDone = 0.0;
long parsedRecords = 0;
String line;
while ((line = br.readLine()) != null) {
String[] fields = TAB_PATTERN.split(line);
String feature = getFeature(fields, dataSet);
String item = getItem(fields, dataSet);
        // look up the feature's index; an unseen feature is assigned the next 1-based index
Integer featureIdx = featureIdxMap.get(feature);
if (featureIdx == null) {
featureIdx = featureIdxMap.size() + 1;
featureIdxMap.put(feature, featureIdx);
}
        // append the feature index to this item's feature list
List<Integer> features = itemFeaturesMap.get(item);
if (features == null) {
features = Lists.newArrayList();
itemFeaturesMap.put(item, features);
}
features.add(featureIdx);
parsedRecords++;
// Update the progress
percentDone = parsedRecords * 100.0 / totalRecords;
msg = usedMemory() + "Converting data to internal vector format: ";
if (percentDone > prevPercentDone) {
          System.out.print('\r' + msg + (int) percentDone + '%');
prevPercentDone++;
}
}
msg = usedMemory() + "Converting data to internal vector format: ";
      System.out.print('\r' + msg + (int) percentDone + "% Completed\n");
} finally {
Closeables.closeQuietly(br);
}
return itemFeaturesMap;
}
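  // Illustration (field values are made up, not taken from the real file):
  // two 360K-users records
  //   "userA\tmbid1\tbetty blowtorch\t2137"
  //   "userB\tmbid1\tbetty blowtorch\t548"
  // would yield the entry {"betty blowtorch" -> [1, 2]}, since users are
  // assigned 1-based indexes in order of first appearance.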
  /**
   * Converts each record in the (item, features) map into Mahout vector
   * format and writes it to a sequence file for MinHash clustering.
   *
   * @return true once all records have been written
   */
public static boolean writeToSequenceFile(Map<String, List<Integer>> itemFeaturesMap, Path outputPath)
throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
fs.mkdirs(outputPath.getParent());
long totalRecords = itemFeaturesMap.size();
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outputPath, Text.class, VectorWritable.class);
try {
String msg = "Now writing vectorized data in sequence file format: ";
System.out.print(msg);
Text itemWritable = new Text();
VectorWritable featuresWritable = new VectorWritable();
int doneRecords = 0;
int prevPercentDone = 1;
for (Map.Entry<String, List<Integer>> itemFeature : itemFeaturesMap.entrySet()) {
        int numFeatures = itemFeature.getValue().size();
itemWritable.set(itemFeature.getKey());
        Vector featureVector = new SequentialAccessSparseVector(numFeatures);
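        // The feature indexes become the vector's values at sequential
        // positions; the positions themselves carry no meaning, since the
        // downstream MinHash job is expected to hash the values.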
int i = 0;
for (Integer feature : itemFeature.getValue()) {
featureVector.setQuick(i++, feature);
}
featuresWritable.set(featureVector);
writer.append(itemWritable, featuresWritable);
// Update the progress
double percentDone = ++doneRecords * 100.0 / totalRecords;
if (percentDone > prevPercentDone) {
          System.out.print('\r' + msg + (int) percentDone + "% " + (percentDone >= 100 ? "Completed\n" : ""));
prevPercentDone++;
}
}
} finally {
Closeables.closeQuietly(writer);
}
return true;
}
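  /**
   * Minimal sketch, not part of the original converter: reads a written
   * sequence file back and prints each (item, vector) pair, handy for
   * spot-checking the output. Assumes the default Hadoop configuration, as
   * in {@link #writeToSequenceFile}; the method name is hypothetical.
   */
  private static void dumpSequenceFile(Path inputPath) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inputPath, conf);
    try {
      Text item = new Text();
      VectorWritable features = new VectorWritable();
      // next() fills the key/value pair and returns false at end of file
      while (reader.next(item, features)) {
        System.out.println(item + " => " + features.get());
      }
    } finally {
      Closeables.closeQuietly(reader);
    }
  }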
public static void main(String[] args) throws Exception {
if (args.length < 3) {
System.out.println("[Usage]: LastfmDataConverter <input> <output> <dataset>");
System.out.println(" <input>: Absolute path to the local file [usersha1-artmbid-artname-plays.tsv] ");
System.out.println(" <output>: Absolute path to the HDFS output file");
System.out.println(" <dataset>: Either of the two Lastfm public datasets. "
+ "Must be either 'Users360K' or 'Users1K'");
System.out.println("Note:- Hadoop configuration pointing to HDFS namenode should be in classpath");
return;
}
Lastfm dataSet = Lastfm.valueOf(args[2]);
Map<String, List<Integer>> itemFeatures = convertToItemFeatures(args[0], dataSet);
if (itemFeatures.isEmpty()) {
throw new IllegalStateException("Error converting the data file: [" + args[0] + ']');
}
Path output = new Path(args[1]);
boolean status = writeToSequenceFile(itemFeatures, output);
if (status) {
System.out.println("Data converted and written successfully to HDFS location: [" + output + ']');
} else {
System.err.println("Error writing the converted data to HDFS location: [" + output + ']');
}
}
}