/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.cf.taste.example.bookcrossing;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.regex.Pattern;
import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import org.apache.mahout.cf.taste.example.grouplens.GroupLensDataModel;
import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
import org.apache.mahout.common.iterator.FileLineIterable;
/**
* See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for
* data needed by this class. The BX-Book-Ratings.csv file is needed.
*/
public final class BookCrossingDataModel extends FileDataModel {
private static final Pattern NON_DIGIT_SEMICOLON_PATTERN = Pattern.compile("[^0-9;]");
public BookCrossingDataModel(boolean ignoreRatings) throws IOException {
this(GroupLensDataModel.readResourceToTempFile(
"/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"),
ignoreRatings);
}
/**
* @param ratingsFile BookCrossing ratings file in its native format
* @throws IOException if an error occurs while reading or writing files
*/
public BookCrossingDataModel(File ratingsFile, boolean ignoreRatings) throws IOException {
super(convertBCFile(ratingsFile, ignoreRatings));
}
private static File convertBCFile(File originalFile, boolean ignoreRatings) throws IOException {
if (!originalFile.exists()) {
throw new FileNotFoundException(originalFile.toString());
}
File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "taste.bookcrossing.txt");
resultFile.delete();
Writer writer = null;
try {
writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8);
for (String line : new FileLineIterable(originalFile, true)) {
// 0 ratings are basically "no rating", ignore them (thanks h.9000)
if (line.endsWith("\"0\"")) {
continue;
}
// Delete replace anything that isn't numeric, or a semicolon delimiter. Make comma the delimiter.
String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
.replaceAll("").replace(';', ',');
// If this means we deleted an entire ID -- few cases like that -- skip the line
if (convertedLine.contains(",,")) {
continue;
}
if (ignoreRatings) {
// drop rating
convertedLine = convertedLine.substring(0, convertedLine.lastIndexOf(','));
}
writer.write(convertedLine);
writer.write('\n');
}
writer.flush();
} catch (IOException ioe) {
resultFile.delete();
throw ioe;
} finally {
Closeables.closeQuietly(writer);
}
return resultFile;
}
@Override
public String toString() {
return "BookCrossingDataModel";
}
}