/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.hadoop.example.als.netflix;
import com.google.common.base.Preconditions;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.cf.taste.impl.model.GenericPreference;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Converts the raw files provided by Netflix to an appropriate input format.
 *
 * <p>Two files are written below the destination path. Every line of both files has the form
 * {@code userID<TAB>movieID<TAB>rating}:</p>
 * <ul>
 *   <li>{@code trainingSet/ratings.tsv} is built from every file found in the training set directory.
 *       The first line of each such file is taken as the movie ID (with {@code ":"} removed), every
 *       following line is split on {@code ","} into user ID and rating.</li>
 *   <li>{@code probeSet/ratings.tsv} is built by pairing, in order, the (user, movie) entries read from
 *       the qualifying file with the ratings read from the judging file.</li>
 * </ul>
 */
public final class NetflixDatasetConverter {

  private static final Logger log = LoggerFactory.getLogger(NetflixDatasetConverter.class);

  private static final Pattern SEPARATOR = Pattern.compile(",");
  private static final String MOVIE_DENOTER = ":";
  private static final String TAB = "\t";
  private static final String NEWLINE = "\n";

  /**
   * Initial capacity of the in-memory probe list.
   * NOTE(review): presumably the number of entries in the Netflix qualifying file - confirm.
   */
  private static final int INITIAL_PROBE_CAPACITY = 2817131;

  private NetflixDatasetConverter() {
  }

  /**
   * Command line entry point.
   *
   * @param args exactly four values: the training set directory, the qualifying file, the judging file
   *             and the destination path; with any other count a usage message is printed to stderr
   *             and the method returns without doing anything
   * @throws IOException if the training set directory cannot be listed, an input line is malformed or
   *                     reading/writing fails
   * @throws IllegalStateException if the qualifying and judging files do not line up
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 4) {
      System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
          + "/path/to/judging.txt /path/to/destination");
      return;
    }

    String trainingDataDir = args[0];
    String qualifyingTxt = args[1];
    String judgingTxt = args[2];
    Path outputPath = new Path(args[3]);

    Preconditions.checkArgument(trainingDataDir != null, "Training Data location needs to be specified");

    // File.listFiles() yields null (not an empty array) for a non-directory or on an I/O error.
    // Check this before any output file is created so a bad path does not leave an empty result behind.
    File[] movieFiles = new File(trainingDataDir).listFiles();
    if (movieFiles == null) {
      throw new IOException("Cannot list training data directory " + trainingDataDir
          + " (not a directory, or an I/O error occurred)");
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);

    writeTrainingSet(fs, outputPath, movieFiles);
    List<Preference> probes = readProbes(qualifyingTxt);
    writeProbeSet(fs, outputPath, judgingTxt, probes);
  }

  /** Opens a buffered UTF-8 writer on a newly created file of the given file system. */
  private static BufferedWriter newWriter(FileSystem fs, Path file) throws IOException {
    return new BufferedWriter(new OutputStreamWriter(fs.create(file), Charsets.UTF_8));
  }

  /** Writes one line per rating found in the given per-movie files to trainingSet/ratings.tsv. */
  private static void writeTrainingSet(FileSystem fs, Path outputPath, File[] movieFiles) throws IOException {
    log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
    try (BufferedWriter writer = newWriter(fs, new Path(outputPath, "trainingSet/ratings.tsv"))) {
      int ratingsProcessed = 0;
      for (File movieRatings : movieFiles) {
        try (FileLineIterator lines = new FileLineIterator(movieRatings)) {
          // stays null until the first line (the movie header) of the current file has been consumed
          String movieID = null;
          while (lines.hasNext()) {
            String line = lines.next();
            if (movieID == null) {
              movieID = line.replace(MOVIE_DENOTER, "");
              continue;
            }
            String[] tokens = SEPARATOR.split(line);
            if (tokens.length < 2) {
              throw new IOException("Malformed rating line in " + movieRatings + ": '" + line + "'");
            }
            String userID = tokens[0];
            String rating = tokens[1];
            writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
            ratingsProcessed++;
            if (ratingsProcessed % 1000000 == 0) {
              log.info("{} ratings processed...", ratingsProcessed);
            }
          }
        }
      }
      log.info("{} ratings processed. done.", ratingsProcessed);
    }
  }

  /**
   * Reads the (user, movie) pairs of the qualifying file in file order. Lines containing ":" switch the
   * current movie, all other lines contribute one entry whose preference value is 0.
   */
  private static List<Preference> readProbes(String qualifyingTxt) {
    log.info("Reading probes...");
    List<Preference> probes = new ArrayList<>(INITIAL_PROBE_CAPACITY);
    long currentMovieID = -1;
    for (String line : new FileLineIterable(new File(qualifyingTxt))) {
      if (line.contains(MOVIE_DENOTER)) {
        currentMovieID = Long.parseLong(line.replace(MOVIE_DENOTER, ""));
      } else {
        long userID = Long.parseLong(SEPARATOR.split(line)[0]);
        probes.add(new GenericPreference(userID, currentMovieID, 0));
      }
    }
    log.info("{} probes read...", probes.size());
    return probes;
  }

  /**
   * Pairs the n-th rating of the judging file with the n-th probe and writes the result to
   * probeSet/ratings.tsv. Fails with an IllegalStateException when the two files do not line up.
   */
  private static void writeProbeSet(FileSystem fs, Path outputPath, String judgingTxt, List<Preference> probes)
    throws IOException {
    log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
    try (BufferedWriter writer = newWriter(fs, new Path(outputPath, "probeSet/ratings.tsv"))) {
      int ratingsProcessed = 0;
      // starts fresh: a movie ID left over from the qualifying pass must not satisfy the check below
      long currentMovieID = -1;
      for (String line : new FileLineIterable(new File(judgingTxt))) {
        if (line.contains(MOVIE_DENOTER)) {
          currentMovieID = Long.parseLong(line.replace(MOVIE_DENOTER, ""));
        } else {
          float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
          Preconditions.checkState(ratingsProcessed < probes.size(),
              "Judging file contains more ratings than the %s entries read from the qualifying file",
              probes.size());
          Preference pref = probes.get(ratingsProcessed);
          Preconditions.checkState(pref.getItemID() == currentMovieID,
              "Movie ID mismatch at rating %s: qualifying file has %s, judging file has %s",
              ratingsProcessed, pref.getItemID(), currentMovieID);
          ratingsProcessed++;
          writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
          if (ratingsProcessed % 1000000 == 0) {
            log.info("{} ratings processed...", ratingsProcessed);
          }
        }
      }
      if (ratingsProcessed < probes.size()) {
        log.warn("Only {} ratings found for {} probes, probe set is incomplete", ratingsProcessed, probes.size());
      }
      log.info("{} ratings processed. done.", ratingsProcessed);
    }
  }
}