/* TagRecommender: A framework to implement and evaluate algorithms for the recommendation of tags. Copyright (C) 2013 Dominik Kowald This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ package file; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import com.google.common.primitives.Ints; import common.Bookmark; import common.Utilities; import file.preprocessing.CoreFiltering; public class BookmarkSplitter { private BookmarkReader reader; public BookmarkSplitter(BookmarkReader reader) { this.reader = reader; } public List<Bookmark> getUserPercentage(int percentage, boolean usePercentage) { List<Bookmark> lines = new ArrayList<Bookmark>(); int userSize = this.reader.getUsers().size(); int userLimit = (usePercentage ? userSize * percentage / 100 : percentage); List<Integer> randomIndices = Utilities.getRandomIndices(0, userSize - 1).subList(0, userLimit); int currentUser = -1; boolean takeUser = false; for (Bookmark data : this.reader.getBookmarks()) { if (currentUser != data.getUserID()) { // new user currentUser = data.getUserID(); takeUser = randomIndices.contains(currentUser); } if (takeUser) { lines.add(data); } } return lines; } // randomly gets x percentage of the user-profiles from the dataset public void splitUserPercentage(String filename, int percentage, boolean usePercentage, int count) { for (int i = 1; i <= count; i++) { List<Bookmark> lines = getUserPercentage(percentage, usePercentage); BookmarkWriter.writeSample(this.reader, lines, filename /*+ "_" + percentage + "_perc_" */ + i, null, false); } } // randomly splits the bookmarks public void splitFile(String filename, int testPercentage) { int testUserSize = this.reader.getBookmarks().size() * testPercentage / 100; int trainUserSize = this.reader.getBookmarks().size() - testUserSize; Collections.shuffle(this.reader.getBookmarks()); List<Bookmark> userSample = this.reader.getBookmarks().subList(0, trainUserSize + testUserSize); List<Bookmark> trainUserSample = userSample.subList(0, trainUserSize); List<Bookmark> testUserSample = userSample.subList(trainUserSize, trainUserSize + testUserSize); BookmarkWriter.writeSample(this.reader, trainUserSample, filename + "_train", null, false); BookmarkWriter.writeSample(this.reader, testUserSample, filename + "_test", null, false); BookmarkWriter.writeSample(this.reader, userSample, filename, null, false); } // puts the last bookmark of each user into the testset public void leaveLastOutSplit(String filename, boolean coldStart, boolean realNames, String userWhiteList) { List<Bookmark> trainLines = new ArrayList<Bookmark>(); List<Bookmark> testLines = new ArrayList<Bookmark>(); List<String> testUsers = null; if (userWhiteList != null) { testUsers = Utilities.readFileToStringList("data/csv/" + userWhiteList); } int currentUser = -1, userIndex = 1, userSize = -1; for (Bookmark data : this.reader.getBookmarks()) { if (currentUser != data.getUserID()) { // new user currentUser = data.getUserID(); userSize = this.reader.getUserCounts().get(currentUser); userIndex = 1; } if (userIndex++ == userSize) { // check if user has enough lines and if user is in the whitelist if ((coldStart || userSize > 1) && (testUsers == null || testUsers.contains(this.reader.getUsers().get(data.getUserID())))) { testLines.add(data); } else { trainLines.add(data); } } else { trainLines.add(data); } } BookmarkWriter.writeSample(this.reader, trainLines, filename + "_train", null, realNames); BookmarkWriter.writeSample(this.reader, testLines, filename + "_test", null, realNames); trainLines.addAll(testLines); BookmarkWriter.writeSample(this.reader, trainLines, filename, null, realNames); } public void leaveLastOutSplitWithCondition(String filename, boolean realNames, String userWhiteList, String titleCondition) { List<Bookmark> trainLines = new ArrayList<Bookmark>(); List<Bookmark> testLines = new ArrayList<Bookmark>(); List<String> testUsers = null; if (userWhiteList != null) { testUsers = Utilities.readFileToStringList("data/csv/" + userWhiteList); } Collections.sort(this.reader.getBookmarks()); List<List<Bookmark>> userBookmarks = Utilities.getBookmarks(this.reader.getBookmarks(), false); for (List<Bookmark> uBookmarks : userBookmarks) { boolean isColdStartUser = uBookmarks.size() < 2; boolean putTestB = false; boolean putTrainB = false; Bookmark testB = null; for (int i = uBookmarks.size() - 1; i >= 0; i--) { Bookmark b = uBookmarks.get(i); boolean isTestUser = testUsers.contains(this.reader.getUsers().get(b.getUserID())); if (isTestUser && !isColdStartUser && !putTestB) { if (!b.getTitle().equals(titleCondition)) { testB = b; putTestB = true; }/* else { trainLines.add(b); putTrainB = true; }*/ } else { trainLines.add(b); putTrainB = true; } } if (testB != null) { if (putTrainB) { testLines.add(testB); } else { trainLines.add(testB); } } } BookmarkWriter.writeSample(this.reader, trainLines, filename + "_train", null, realNames); BookmarkWriter.writeSample(this.reader, testLines, filename + "_test", null, realNames); trainLines.addAll(testLines); BookmarkWriter.writeSample(this.reader, trainLines, filename, null, realNames); } // puts one bookmark at random of each user into the testset public void leaveOneRandOutSplit(String filename) { List<Bookmark> trainLines = new ArrayList<Bookmark>(); List<Bookmark> testLines = new ArrayList<Bookmark>(); int currentUser = -1, userIndex = -1, index = -1, userSize = -1; for (Bookmark data : this.reader.getBookmarks()) { if (currentUser != data.getUserID()) { // new user currentUser = data.getUserID(); userSize = this.reader.getUserCounts().get(currentUser); userIndex = 1; index = 1 + (int)(Math.random() * ((userSize - 1) + 1)); } if (userIndex++ == index) { testLines.add(data); } else { trainLines.add(data); } } BookmarkWriter.writeSample(this.reader, trainLines, filename + "_train", null, false); BookmarkWriter.writeSample(this.reader, testLines, filename + "_test", null, false); trainLines.addAll(testLines); BookmarkWriter.writeSample(this.reader, trainLines, filename, null, false); } public void leavePercentageOutSplit(String filename, int percentage, boolean last, Integer userNumber, boolean tagRec, boolean realNames, boolean coldStart) { List<Bookmark> trainLines = new ArrayList<Bookmark>(); List<Bookmark> testLines = new ArrayList<Bookmark>(); Set<Integer> indices = new HashSet<Integer>(); int currentUser = -1, userIndex = -1, userSize = -1; List<Bookmark> allLines = null; if (userNumber == null) { allLines = this.reader.getBookmarks(); } else { allLines = getUserPercentage(userNumber, false); } for (int i = 0; i < allLines.size(); i++) { Bookmark data = allLines.get(i); if (currentUser != data.getUserID()) { // new user currentUser = data.getUserID(); userSize = this.reader.getUserCounts().get(currentUser); userIndex = 1; indices.clear(); int limit = (int)((double)percentage / 100.0 * (double)userSize); if (tagRec && limit == 0 && userSize > 1) { limit++; } if (!tagRec && coldStart && limit == 0) { limit++; } while (indices.size() < limit) { if (last) { indices.add(userSize - indices.size()); } else { indices.add(1 + (int)(Math.random() * (userSize + 1))); } } } if (indices.contains(userIndex++)) { testLines.add(data); } else { trainLines.add(data); } } Collections.sort(trainLines); Collections.sort(testLines); BookmarkWriter.writeSample(this.reader, trainLines, filename + "_train", null, realNames); BookmarkWriter.writeSample(this.reader, testLines, filename + "_test", null, realNames); trainLines.addAll(testLines); BookmarkWriter.writeSample(this.reader, trainLines, filename, null, realNames); } // Statics ------------------------------------------------------------------------------------------------------------------------------------------- public static void splitSample(String filename, String sampleName, int count, int percentage, boolean tagRec, boolean coldStart, boolean realNames, String userWhiteList, String condition) { BookmarkReader reader = new BookmarkReader(0, false); reader.readFile(filename); Collections.sort(reader.getBookmarks()); BookmarkSplitter splitter = new BookmarkSplitter(reader); for (int i = 1; i <= count; i++) { if (percentage > 0) { splitter.leavePercentageOutSplit(sampleName, percentage, true, null, tagRec, realNames, coldStart); } else { if (condition == null) { splitter.leaveLastOutSplit(sampleName, coldStart, realNames, userWhiteList); } else { splitter.leaveLastOutSplitWithCondition(sampleName, realNames, userWhiteList, condition); } } } } public static void drawUserPercentageSample(String filename, int percentage, int count) { BookmarkReader reader = new BookmarkReader(0, false); reader.readFile(filename); BookmarkSplitter splitter = new BookmarkSplitter(reader); Collections.sort(reader.getBookmarks()); splitter.splitUserPercentage(filename, percentage, true, count); } public static void calculateCore(String filename, String sampleName, int userLevel, int resLevel, int tagLevel) { String resultfile = sampleName;// + "_core_u" + userLevel + "_r" + resLevel + "_t" + tagLevel; BookmarkReader reader = new BookmarkReader(0, false); reader.readFile(filename); System.out.println("Unique users before filtering: " + reader.getUsers().size()); System.out.println("Unique resources before filtering: " + reader.getResources().size()); System.out.println("Unique tags before filtering: " + reader.getTags().size()); System.out.println("Lines before filtering: " + reader.getBookmarks().size()); if (userLevel > 0 || resLevel > 0 || tagLevel > 0) { int i = 0; while (true) { System.out.println("Core iteration: " + i); int size = reader.getBookmarks().size(); CoreFiltering filtering = new CoreFiltering(reader); reader = filtering.filterOrphansIterative(userLevel, resLevel, tagLevel); String coreResultfile = resultfile;// + "_c" + ++i; BookmarkWriter.writeSample(reader, reader.getBookmarks(), coreResultfile, null, false); if (reader.getBookmarks().size() >= size) { return; } // re-read the filtered dataset reader = new BookmarkReader(0, false); reader.readFile(coreResultfile); File file = new File("./data/csv/" + coreResultfile + ".txt"); file.delete(); } } } }