/** * Copyright 2012 Anjuke Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.anjuke.romar.mahout.similarity.file; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; import java.util.regex.Pattern; import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity.ItemItemSimilarity; import org.apache.mahout.cf.taste.impl.similarity.GenericUserSimilarity.UserUserSimilarity; import org.apache.mahout.common.iterator.FileLineIterator; import com.google.common.base.Function; import com.google.common.collect.ForwardingIterator; import com.google.common.collect.Iterators; public final class RomarFileSimilarityIterator<T> extends ForwardingIterator<T> { private static final Pattern SEPARATOR = Pattern.compile("[,\t]"); static final int DATA_SIZE = (2 * Long.SIZE + Double.SIZE) / Byte.SIZE; private final Iterator<T> _delegate; private RomarFileSimilarityIterator(DataFileIterator fileIterator, final SimilarityBuilder<T> builder) { _delegate = Iterators.transform(fileIterator, new Function<byte[], T>() { public T apply(byte[] input) { ByteBuffer buffer = ByteBuffer.wrap(input); return builder.create(buffer.getLong(), buffer.getLong(), buffer.getDouble()); } }); } private RomarFileSimilarityIterator(FileLineIterator fileIterator, final SimilarityBuilder<T> builder) { _delegate = Iterators.transform(fileIterator, new Function<String, T>() { public T apply(String input) { String[] tokens = SEPARATOR.split(input); double value = Double.parseDouble(tokens[2]); // fix for mahout hadoop compute some value like 1.00000000002 if (value > 1.0 && value - 1.0 < 0.00001) { value = 1.0; } return builder.create(Long.parseLong(tokens[0]), Long.parseLong(tokens[1]), value); } }); } @Override protected Iterator<T> delegate() { return _delegate; } private interface SimilarityBuilder<T> { T create(long id1, long id2, double value); } private static class ItemSimilarityBuilder implements SimilarityBuilder<ItemItemSimilarity> { @Override public ItemItemSimilarity create(long id1, long id2, double value) { return new ItemItemSimilarity(id1, id2, value); } } public interface IteratorBuiler<T> { Iterator<T> build(File file); } private static class UserSimilarityBuilder implements SimilarityBuilder<UserUserSimilarity> { @Override public UserUserSimilarity create(long id1, long id2, double value) { return new UserUserSimilarity(id1, id2, value); } } public static IteratorBuiler<ItemItemSimilarity> lineFileItemIteratorBuilder() { return new IteratorBuiler<ItemItemSimilarity>() { @Override public Iterator<ItemItemSimilarity> build(File file) { try { return new RomarFileSimilarityIterator<ItemItemSimilarity>( new FileLineIterator(file), new ItemSimilarityBuilder()); } catch (IOException e) { throw new IllegalStateException("Can't read " + file, e); } } }; } public static Iterator<ItemItemSimilarity> lineFileItemIterator(File file) { return lineFileItemIteratorBuilder().build(file); } public static IteratorBuiler<ItemItemSimilarity> dataFileItemIteratorBuilder() { return new IteratorBuiler<ItemItemSimilarity>() { @Override public Iterator<ItemItemSimilarity> build(File file) { try { return new RomarFileSimilarityIterator<ItemItemSimilarity>( new DataFileIterator(file, DATA_SIZE), new ItemSimilarityBuilder()); } catch (IOException e) { throw new IllegalStateException("Can't read " + file, e); } } }; } public static Iterator<ItemItemSimilarity> dataFileItemIterator(File file) { return dataFileItemIteratorBuilder().build(file); } public static IteratorBuiler<UserUserSimilarity> lineFileUserIteratorBuilder() { return new IteratorBuiler<UserUserSimilarity>() { @Override public Iterator<UserUserSimilarity> build(File file) { try { return new RomarFileSimilarityIterator<UserUserSimilarity>( new FileLineIterator(file), new UserSimilarityBuilder()); } catch (IOException e) { throw new IllegalStateException("Can't read " + file, e); } } }; } public static Iterator<UserUserSimilarity> lineFileUserIterator(File file) { return lineFileUserIteratorBuilder().build(file); } public static IteratorBuiler<UserUserSimilarity> dataFileUserIteratorBuilder() { return new IteratorBuiler<UserUserSimilarity>() { @Override public Iterator<UserUserSimilarity> build(File file) { try { return new RomarFileSimilarityIterator<UserUserSimilarity>( new DataFileIterator(file, DATA_SIZE), new UserSimilarityBuilder()); } catch (IOException e) { throw new IllegalStateException("Can't read " + file, e); } } }; } public static Iterator<UserUserSimilarity> dataFileUserIterator(File file) { return dataFileUserIteratorBuilder().build(file); } }