Java Examples for org.apache.mahout.common.iterator.sequencefile.PathFilters

The following Java examples illustrate the usage of org.apache.mahout.common.iterator.sequencefile.PathFilters. The source code samples are taken from different open source projects.

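Before the project samples, here is a minimal, self-contained sketch of the pattern they all share: PathFilters supplies ready-made org.apache.hadoop.fs.PathFilter instances (partFilter() for the part-* output files of a MapReduce job, logsCRCFilter() to skip bookkeeping files), which are typically passed to SequenceFileDirIterable to iterate over a directory of sequence files. The output path and key/value types below are illustrative assumptions, not taken from any of the projects.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VectorWritable;

public class PathFiltersSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        Path jobOutput = new Path("/tmp/job-output"); // hypothetical job output directory
        // partFilter() makes the iterable visit only the part-* files the
        // MapReduce job wrote, ignoring entries such as _SUCCESS or _logs
        for (Pair<IntWritable, VectorWritable> pair :
                new SequenceFileDirIterable<IntWritable, VectorWritable>(
                    jobOutput, PathType.LIST, PathFilters.partFilter(), conf)) {
            System.out.println(pair.getFirst().get() + " -> " + pair.getSecond().get());
        }
    }
}
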
Example 1
Project: mahout-rbmClassifier-master  File: RBMClassifierTrainingJob.java
/**
 * Changes and saves the model.
 *
 * @param output the output path
 * @param batchsize the batch size
 * @param momentum the momentum
 * @throws IOException Signals that an I/O exception has occurred.
 */
private void changeAndSaveModel(Path output, long batchsize, double momentum) throws IOException {
    Map<Integer, Matrix> updates = new HashMap<Integer, Matrix>();
    // partFilter() restricts the iteration to the part-* files that the
    // previous job wrote into the temporary weight-updates directory
    for (Pair<IntWritable, MatrixWritable> record :
            new SequenceFileDirIterable<IntWritable, MatrixWritable>(
                getTempPath(WEIGHT_UPDATES), PathType.LIST, PathFilters.partFilter(), getConf())) {
        // sum up all updates that share the same key
        if (!updates.containsKey(record.getFirst().get())) {
            updates.put(record.getFirst().get(), record.getSecond().get());
        } else {
            updates.put(record.getFirst().get(),
                record.getSecond().get().plus(updates.get(record.getFirst().get())));
        }
    }
    updateRbmCl(batchsize, momentum, updates);
    // serialize so the mappers work with the current version of the model
    rbmCl.serialize(output, getConf());
}
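
As a side note on the accumulation above: on Java 8 and later, the containsKey/put pair can be collapsed into Map.merge, since Matrix.plus() returns the element-wise sum. A hedged sketch of that idiom, not code from the project:

import java.util.HashMap;
import java.util.Map;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;

public class MergeSketch {
    public static void main(String[] args) {
        Map<Integer, Matrix> updates = new HashMap<>();
        Matrix update = new DenseMatrix(2, 2).assign(1.0);
        // merge() inserts the first matrix seen for a key and adds
        // (Matrix::plus) every later one to the running sum
        updates.merge(0, update, Matrix::plus);
        updates.merge(0, update, Matrix::plus);
        System.out.println(updates.get(0).get(0, 0)); // prints 2.0
    }
}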
Example 2
Project: mahout-commits-master  File: SplitInput.java
/* See also splitDirectory(Path inputDir). */
public void splitDirectory(Configuration conf, Path inputDir) throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    // getFileStatus() throws FileNotFoundException for a missing path rather
    // than returning null, so check existence explicitly
    if (!fs.exists(inputDir)) {
        throw new IOException(inputDir + " does not exist");
    }
    if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }
    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // the input dir contains one file per category; logsCRCFilter()
        // keeps those files but skips _logs directories and .crc checksum files
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}
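
To make the filter in this example concrete: logsCRCFilter() returns a plain org.apache.hadoop.fs.PathFilter, so its effect can be checked directly. The results below are what the filter's documented purpose (skipping log and checksum bookkeeping files) leads one to expect; a quick sketch, not project code:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;

public class LogsCRCFilterSketch {
    public static void main(String[] args) {
        PathFilter filter = PathFilters.logsCRCFilter();
        // ordinary data files pass through...
        System.out.println(filter.accept(new Path("/data/alt.atheism.txt")));  // expected: true
        // ...while Hadoop bookkeeping entries should be rejected
        System.out.println(filter.accept(new Path("/data/_logs")));            // expected: false
        System.out.println(filter.accept(new Path("/data/.part-00000.crc")));  // expected: false
    }
}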
Example 3
Project: publications-ssnmm-master  File: Evaluate.java
public static void main(String[] args) throws IOException {
    int numUsers = 1823179;
    int numItems = 136736;
    double mu = 3.157255412010664;
    String distributedSimilarityMatrixPath = "/home/ssc/Desktop/yahoo/similarityMatrix/";
    String itemBiasesFilePath = "/home/ssc/Desktop/yahoo/itemBiases.tsv";
    String userBiasesFilePath = "/home/ssc/Desktop/yahoo/userBiases.tsv";
    String trainingSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/songs.tsv";
    String holdoutSetPath = "/home/ssc/Entwicklung/datasets/yahoo-songs/holdout.tsv";
    Matrix similarities = new SparseRowMatrix(numItems, numItems);
    System.out.println("Reading similarities...");
    int similaritiesRead = 0;
    Configuration conf = new Configuration();
    // read the precomputed item-item similarity matrix from the part-* files
    for (Pair<IntWritable, VectorWritable> pair :
            new SequenceFileDirIterable<IntWritable, VectorWritable>(
                new Path(distributedSimilarityMatrixPath), PathType.LIST, PathFilters.partFilter(), conf)) {
        int item = pair.getFirst().get();
        Iterator<Vector.Element> elements = pair.getSecond().get().iterateNonZero();
        while (elements.hasNext()) {
            Vector.Element elem = elements.next();
            similarities.setQuick(item, elem.index(), elem.get());
            similaritiesRead++;
        }
    }
    System.out.println("Found " + similaritiesRead + " similarities");
    Pattern sep = Pattern.compile("\t");
    double[] itemBiases = new double[numItems];
    double[] userBiases = new double[numUsers];
    System.out.println("Reading item biases");
    for (String line : new FileLineIterable(new File(itemBiasesFilePath))) {
        String[] parts = sep.split(line);
        itemBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]);
    }
    System.out.println("Reading user biases");
    for (String line : new FileLineIterable(new File(userBiasesFilePath))) {
        String[] parts = sep.split(line);
        userBiases[Integer.parseInt(parts[0])] = Double.parseDouble(parts[1]);
    }
    Iterator<Rating> trainRatings = new RatingsIterable(new File(trainingSetPath)).iterator();
    Iterator<Rating> heldOutRatings = new RatingsIterable(new File(holdoutSetPath)).iterator();
    int currentUser = 0;
    OpenIntDoubleHashMap prefs = new OpenIntDoubleHashMap();
    int usersProcessed = 0;
    RunningAverage rmse = new FullRunningAverage();
    RunningAverage mae = new FullRunningAverage();
    RunningAverage rmseBase = new FullRunningAverage();
    RunningAverage maeBase = new FullRunningAverage();
    while (trainRatings.hasNext()) {
        Rating rating = trainRatings.next();
        if (rating.user() != currentUser) {
            // all training ratings of the previous user have been collected;
            // evaluate that user's ten held-out ratings
            for (int n = 0; n < 10; n++) {
                Rating heldOutRating = heldOutRatings.next();
                Preconditions.checkState(heldOutRating.user() == currentUser);
                double preference = 0.0;
                double totalSimilarity = 0.0;
                int count = 0;
                Iterator<Vector.Element> similarItems = similarities.viewRow(heldOutRating.item()).iterateNonZero();
                while (similarItems.hasNext()) {
                    Vector.Element similarity = similarItems.next();
                    int similarItem = similarity.index();
                    if (prefs.containsKey(similarItem)) {
                        preference += similarity.get() * (prefs.get(similarItem) - (mu + userBiases[currentUser] + itemBiases[similarItem]));
                        totalSimilarity += Math.abs(similarity.get());
                        count++;
                    }
                }
                // baseline: global average plus user and item bias
                double baselineEstimate = mu + userBiases[currentUser] + itemBiases[heldOutRating.item()];
                double estimate = baselineEstimate;
                // apply the similarity-weighted correction only if at least
                // two similar rated items were found
                if (count > 1) {
                    estimate += preference / totalSimilarity;
                }
                double baseError = Math.abs(heldOutRating.rating() - baselineEstimate);
                maeBase.addDatum(baseError);
                rmseBase.addDatum(baseError * baseError);
                double error = Math.abs(heldOutRating.rating() - estimate);
                mae.addDatum(error);
                rmse.addDatum(error * error);
            }
            if (++usersProcessed % 10000 == 0) {
                System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage()));
            }
            currentUser = rating.user();
            prefs.clear();
        }
        prefs.put(rating.item(), rating.rating());
    }
    System.out.println(usersProcessed + " users processed, MAE " + mae.getAverage() + ", RMSE " + Math.sqrt(rmse.getAverage()) + " | baseline MAE " + maeBase.getAverage() + ", baseline RMSE " + Math.sqrt(rmseBase.getAverage()));
}
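
For reference, the estimate computed in the inner loop above is a standard item-based prediction: a baseline of global mean plus user and item bias, corrected by a similarity-weighted average of the user's deviations on similar items. A restatement of that arithmetic as a standalone method; the names here are illustrative, not from the project:

public class ItemBasedEstimate {
    /** baseline (mu + userBias + itemBias) plus the weighted deviation term,
     *  applied only when more than one similar rated item was found */
    static double estimate(double mu, double userBias, double itemBias,
                           double weightedDeviations, double totalSimilarity, int count) {
        double baseline = mu + userBias + itemBias;
        return count > 1 ? baseline + weightedDeviations / totalSimilarity : baseline;
    }

    public static void main(String[] args) {
        // e.g. mu = 3.16, biases 0.2 and -0.1, two similar items contributing
        System.out.println(estimate(3.16, 0.2, -0.1, 0.9, 1.5, 2)); // approximately 3.86
    }
}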
Example 4
Project: collaborative-filtering-experiment-master  File: ALSMatrixUtil.java
/**
 * Reads an entire DistributedRowMatrix into memory, keyed by row id; typically
 * called from a Mapper's setup().
 * Caution: since the caller usually is a Mapper, the map task can run out of
 * heap space when the source matrix is large. To address this, make sure that
 * only a proper number of map task instances run simultaneously on each
 * datanode. (I have tried a file lock on HDFS, since it is easy and good
 * enough for my needs, but will try ZooKeeper later.)
 * @param dir HDFS path of the DistributedRowMatrix
 * @param conf the Configuration of this cluster
 * @return the rows of the matrix as an OpenIntObjectHashMap, keyed by row id
 */
public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) {
    OpenIntObjectHashMap<Vector> matrix = new OpenIntObjectHashMap<Vector>();
    for (Pair<IntWritable, VectorWritable> pair :
            new SequenceFileDirIterable<IntWritable, VectorWritable>(
                dir, PathType.LIST, PathFilters.partFilter(), conf)) {
        int rowIndex = pair.getFirst().get();
        // defensive copy, so the stored row does not depend on the
        // iterator's backing Writable
        Vector row = pair.getSecond().get().clone();
        matrix.put(rowIndex, row);
    }
    return matrix;
}
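
As the javadoc above suggests, the typical caller is a Mapper that loads the matrix once per task in setup(). A minimal sketch of that pattern; the configuration key "factorsPath" and the mapper's key/value types are assumptions, not taken from the project:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenIntObjectHashMap;

public class FactorsMapper extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
    private OpenIntObjectHashMap<Vector> factors;

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
        // one full read per map task; with a large matrix, cap the number of
        // concurrent map tasks per datanode or this load can exhaust the heap
        Path dir = new Path(ctx.getConfiguration().get("factorsPath")); // assumed config key
        factors = ALSMatrixUtil.readMatrixByRows(dir, ctx.getConfiguration());
    }
}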