/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.cf.taste.example.kddcup; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.regex.Pattern; import com.google.common.collect.AbstractIterator; import com.google.common.io.Closeables; import org.apache.mahout.cf.taste.impl.common.SkippingIterator; import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; import org.apache.mahout.cf.taste.model.PreferenceArray; import org.apache.mahout.common.iterator.FileLineIterator; import org.apache.mahout.common.Pair; /** * <p>An {@link java.util.Iterator} which iterates over any of the KDD Cup's rating files. These include the files * {train,test,validation}Idx{1,2}}.txt. See http://kddcup.yahoo.com/. Each element in the iteration corresponds * to one user's ratings as a {@link PreferenceArray} and corresponding timestamps as a parallel {@code long} * array.</p> * * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p> */ public final class DataFileIterator extends AbstractIterator<Pair<PreferenceArray,long[]>> implements SkippingIterator<Pair<PreferenceArray,long[]>>, Closeable { private static final Pattern COLON_PATTERN = Pattern.compile(":"); private static final Pattern PIPE_PATTERN = Pattern.compile("\\|"); private static final Pattern TAB_PATTERN = Pattern.compile("\t"); private final FileLineIterator lineIterator; public DataFileIterator(File dataFile) throws IOException { if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) { throw new IllegalArgumentException("Bad data file: " + dataFile); } lineIterator = new FileLineIterator(dataFile); } @Override protected Pair<PreferenceArray, long[]> computeNext() { if (!lineIterator.hasNext()) { return endOfData(); } String line = lineIterator.next(); // First a userID|ratingsCount line String[] tokens = PIPE_PATTERN.split(line); long userID = Long.parseLong(tokens[0]); int ratingsLeftToRead = Integer.parseInt(tokens[1]); int ratingsRead = 0; PreferenceArray currentUserPrefs = new GenericUserPreferenceArray(ratingsLeftToRead); long[] timestamps = new long[ratingsLeftToRead]; while (ratingsLeftToRead > 0) { line = lineIterator.next(); // Then a data line. May be 1-4 tokens depending on whether preference info is included (it's not in test data) // or whether date info is included (not inluded in track 2). Item ID is always first, and date is the last // two fields if it exists. tokens = TAB_PATTERN.split(line); boolean hasPref = tokens.length == 2 || tokens.length == 4; boolean hasDate = tokens.length > 2; long itemID = Long.parseLong(tokens[0]); currentUserPrefs.setUserID(0, userID); currentUserPrefs.setItemID(ratingsRead, itemID); if (hasPref) { float preference = Float.parseFloat(tokens[1]); currentUserPrefs.setValue(ratingsRead, preference); } if (hasDate) { long timestamp; if (hasPref) { timestamp = parseFakeTimestamp(tokens[2], tokens[3]); } else { timestamp = parseFakeTimestamp(tokens[1], tokens[2]); } timestamps[ratingsRead] = timestamp; } ratingsRead++; ratingsLeftToRead--; } return new Pair<PreferenceArray,long[]>(currentUserPrefs, timestamps); } @Override public void skip(int n) { for (int i = 0; i < n; i++) { if (lineIterator.hasNext()) { String line = lineIterator.next(); // First a userID|ratingsCount line String[] tokens = PIPE_PATTERN.split(line); int linesToSKip = Integer.parseInt(tokens[1]); lineIterator.skip(linesToSKip); } else { break; } } } @Override public void close() { endOfData(); Closeables.closeQuietly(lineIterator); } /** * @param dateString "date" in days since some undisclosed date, which we will arbitrarily assume to be the * epoch, January 1 1970. * @param timeString time of day in HH:mm:ss format * @return the UNIX timestamp for this moment in time */ private static long parseFakeTimestamp(String dateString, CharSequence timeString) { int days = Integer.parseInt(dateString); String[] timeTokens = COLON_PATTERN.split(timeString); int hours = Integer.parseInt(timeTokens[0]); int minutes = Integer.parseInt(timeTokens[1]); int seconds = Integer.parseInt(timeTokens[2]); return 86400L * days + 3600L + hours + 60L * minutes + seconds; } }