Java Examples for org.apache.mahout.common.iterator.sequencefile.PathFilters
The following Java examples illustrate the usage of org.apache.mahout.common.iterator.sequencefile.PathFilters. The code samples are taken from several different open-source projects.
Example 1
Project: mahout-rbmClassifier-master File: RBMClassifierTrainingJob.java View source code |
/**
 * Accumulates the per-layer weight updates written by the training job,
 * applies them to the RBM classifier and serializes the updated model.
 *
 * @param output HDFS path the updated model is written to
 * @param batchsize number of training examples per batch
 * @param momentum momentum factor used when applying the updates
 * @throws IOException if reading the weight updates or writing the model fails
 */
private void changeAndSaveModel(Path output, long batchsize, double momentum) throws IOException {
    Map<Integer, Matrix> updates = new HashMap<Integer, Matrix>();
    SequenceFileDirIterable<IntWritable, MatrixWritable> weightUpdates =
        new SequenceFileDirIterable<IntWritable, MatrixWritable>(
            getTempPath(WEIGHT_UPDATES), PathType.LIST, PathFilters.partFilter(), getConf());
    // sum all updates that share the same key
    for (Pair<IntWritable, MatrixWritable> record : weightUpdates) {
        int key = record.getFirst().get();
        Matrix update = record.getSecond().get();
        Matrix accumulated = updates.get(key);
        updates.put(key, accumulated == null ? update : update.plus(accumulated));
    }
    updateRbmCl(batchsize, momentum, updates);
    //serialization for mappers to have actual version of the dbm
    rbmCl.serialize(output, getConf());
}
Example 2
Project: mahout-commits-master File: SplitInput.java View source code |
/**
 * Splits every file found directly under {@code inputDir} (one file per
 * category) into training and test sets, either via a map/reduce job or
 * sequentially on the local machine.
 * <p>
 * See also {@code splitDirectory(Path inputDir)}.
 *
 * @param conf cluster configuration used to resolve the file system
 * @param inputDir directory containing one input file per category
 * @throws IOException if {@code inputDir} does not exist or is not a directory
 */
public void splitDirectory(Configuration conf, Path inputDir) throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    // On recent Hadoop versions getFileStatus() throws FileNotFoundException
    // instead of returning null, so the original null check never fired; test
    // existence explicitly to keep the intended error message. Also fetch the
    // status only once instead of issuing two namenode calls.
    if (!fs.exists(inputDir)) {
        throw new IOException(inputDir + " does not exist");
    }
    FileStatus inputDirStatus = fs.getFileStatus(inputDir);
    if (!inputDirStatus.isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }
    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // input dir contains one file per category; the filter skips Hadoop
        // bookkeeping entries (_logs, CRC files)
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}
Example 3
Project: publications-ssnmm-master File: Evaluate.java View source code |
/**
 * Evaluates an item-based recommender on the Yahoo songs dataset: loads a
 * precomputed item-item similarity matrix plus per-item and per-user biases,
 * then streams the training ratings user by user and scores ten held-out
 * ratings per user against both the similarity-weighted estimate and the
 * bias-only baseline, reporting MAE and RMSE.
 *
 * @param args unused
 * @throws IOException if any of the input files cannot be read
 */
public static void main(String[] args) throws IOException {
    int numUsers = 1823179;
    int numItems = 136736;
    // global rating mean of the training set
    double mu = 3.157255412010664;
    String distributedSimilarityMatrixPath = "/home/ssc/Desktop/yahoo/similarityMatrix/";
    String itemBiasesFilePath = "/home/ssc/Desktop/yahoo/itemBiases.tsv";
    String userBiasesFilePath = "/home/ssc/Desktop/yahoo/userBiases.tsv";
    String trainingSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/songs.tsv";
    // FIX: was "home/ssc/..." — missing leading '/' made the path relative to
    // the working directory, inconsistent with every other absolute path here.
    String holdoutSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/holdout.tsv";
    Matrix similarities = new SparseRowMatrix(numItems, numItems);
    System.out.println("Reading similarities...");
    int similaritiesRead = 0;
    Configuration conf = new Configuration();
    // load the sparse item-item similarity matrix from the sequence files
    for (Pair<IntWritable, VectorWritable> pair : new SequenceFileDirIterable<IntWritable, VectorWritable>(new Path(distributedSimilarityMatrixPath), PathType.LIST, PathFilters.partFilter(), conf)) {
        int item = pair.getFirst().get();
        Iterator<Vector.Element> elements = pair.getSecond().get().iterateNonZero();
        while (elements.hasNext()) {
            Vector.Element elem = elements.next();
            similarities.setQuick(item, elem.index(), elem.get());
            similaritiesRead++;
        }
    }
    System.out.println("Found " + similaritiesRead + " similarities");
    Pattern sep = Pattern.compile("\t");
    double[] itemBiases = new double[numItems];
    double[] userBiases = new double[numUsers];
    // bias files are TSV: <id> TAB <bias>
    System.out.println("Reading item biases");
    for (String line : new FileLineIterable(new File(itemBiasesFilePath))) {
        String[] parts = sep.split(line);
        itemBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]);
    }
    System.out.println("Reading user biases");
    for (String line : new FileLineIterable(new File(userBiasesFilePath))) {
        String[] parts = sep.split(line);
        userBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]);
    }
    Iterator<Rating> trainRatings = new RatingsIterable(new File(trainingSetPath)).iterator();
    Iterator<Rating> heldOutRatings = new RatingsIterable(new File(holdoutSetPath)).iterator();
    int currentUser = 0;
    // training preferences of the user currently being accumulated
    OpenIntDoubleHashMap prefs = new OpenIntDoubleHashMap();
    int usersProcessed = 0;
    RunningAverage rmse = new FullRunningAverage();
    RunningAverage mae = new FullRunningAverage();
    RunningAverage rmseBase = new FullRunningAverage();
    RunningAverage maeBase = new FullRunningAverage();
    // Ratings are assumed sorted by user; when the user id changes, evaluate
    // the ten held-out ratings of the user we just finished accumulating.
    // NOTE(review): the final user's held-out ratings are never evaluated
    // because no user change follows the last training rating — confirm
    // whether that is intentional.
    while (trainRatings.hasNext()) {
        Rating rating = trainRatings.next();
        if (rating.user() != currentUser) {
            for (int n = 0; n < 10; n++) {
                Rating heldOutRating = heldOutRatings.next();
                Preconditions.checkState(heldOutRating.user() == currentUser);
                double preference = 0.0;
                double totalSimilarity = 0.0;
                int count = 0;
                // weighted sum of bias-corrected deviations over the items the
                // user rated that are similar to the held-out item
                Iterator<Vector.Element> similarItems = similarities.viewRow(heldOutRating.item()).iterateNonZero();
                while (similarItems.hasNext()) {
                    Vector.Element similarity = similarItems.next();
                    int similarItem = similarity.index();
                    if (prefs.containsKey(similarItem)) {
                        preference += similarity.get() * (prefs.get(similarItem) - (mu + userBiases[currentUser] + itemBiases[similarItem]));
                        totalSimilarity += Math.abs(similarity.get());
                        count++;
                    }
                }
                double baselineEstimate = mu + userBiases[currentUser] + itemBiases[heldOutRating.item()];
                double estimate = baselineEstimate;
                // only trust the neighborhood estimate with at least two neighbors
                if (count > 1) {
                    estimate += preference / totalSimilarity;
                }
                double baseError = Math.abs(heldOutRating.rating() - baselineEstimate);
                maeBase.addDatum(baseError);
                rmseBase.addDatum(baseError * baseError);
                double error = Math.abs(heldOutRating.rating() - estimate);
                mae.addDatum(error);
                rmse.addDatum(error * error);
            }
            if (++usersProcessed % 10000 == 0) {
                System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage()));
            }
            currentUser = rating.user();
            prefs.clear();
        }
        prefs.put(rating.item(), rating.rating());
    }
    System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage()));
}
Example 4
Project: collaborative-filtering-experiment-master File: ALSMatrixUtil.java View source code |
/**
 * Loads an entire file-based DistributedRowMatrix from HDFS into memory,
 * keyed by row id.
 * <p>
 * Typically invoked from a Mapper's setup(). Caution: when the source matrix
 * is large the map task can run out of heap space, so make sure only a proper
 * number of map task instances run simultaneously on each data node (a file
 * lock on HDFS is easy and good enough; ZooKeeper is an alternative).
 *
 * @param dir HDFS path of the DistributedRowMatrix
 * @param conf configuration of this cluster
 * @return all matrix rows, keyed by row id
 */
public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) {
    OpenIntObjectHashMap<Vector> rows = new OpenIntObjectHashMap<Vector>();
    Iterable<Pair<IntWritable, VectorWritable>> records =
        new SequenceFileDirIterable<IntWritable, VectorWritable>(dir, PathType.LIST, PathFilters.partFilter(), conf);
    for (Pair<IntWritable, VectorWritable> record : records) {
        // clone the vector before storing it — presumably the iterable reuses
        // its Writable instances between iterations; verify against
        // SequenceFileDirIterable before removing the copy
        rows.put(record.getFirst().get(), record.getSecond().get().clone());
    }
    return rows;
}