/*
TagRecommender:
A framework to implement and evaluate algorithms for the recommendation
of tags.
Copyright (C) 2013 Dominik Kowald
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package file;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import common.Bookmark;
import file.stemming.EnglishStemmer;
/**
 * Reads bookmark files and builds index structures for users, resources, tags and categories.
 *
 * <p>Each input line is split on the separator {@code ";"} and must contain at least four
 * fields: user, resource, timestamp and a comma-separated tag list. An optional fifth field
 * holds a comma-separated category list. All double quotes are stripped from the fields.
 *
 * <p>Users, resources and tags are mapped to consecutive integer indices in order of first
 * appearance; the per-index usage counters are only incremented while fewer than
 * {@code countLimit} bookmarks have been stored (a {@code countLimit} of 0 means "no limit").
 */
public class BookmarkReader {

    private final int countLimit;
    private List<Bookmark> userLines;
    private List<Bookmark> testLines;
    private List<String> categories;

    private List<String> tags;
    private Map<String, Integer> tagMap;
    private List<Integer> tagCounts;

    private List<String> resources;
    private Map<String, Integer> resourceMap;
    private List<Integer> resourceCounts;

    private List<String> users;
    private Map<String, Integer> userMap;
    private List<Integer> userCounts;

    /** Stemmer applied to every tag, or {@code null} when stemming is disabled. */
    private EnglishStemmer stemmer;

    private boolean hasTimestamp = false;
    private long firstTimestamp = Long.MAX_VALUE;
    private long lastTimestamp = Long.MIN_VALUE;
    private Set<String> userResPairs = new HashSet<String>();

    /**
     * Creates an empty reader.
     *
     * @param countLimit number of leading bookmarks that contribute to the user/resource/tag
     *                   counters; 0 disables the limit
     * @param stemming   if {@code true}, every tag is stemmed with an {@link EnglishStemmer}
     */
    public BookmarkReader(int countLimit, boolean stemming) {
        this.countLimit = countLimit;
        this.userLines = new ArrayList<Bookmark>();
        this.testLines = null;
        this.categories = new ArrayList<String>();
        this.tags = new ArrayList<String>();
        this.tagMap = new HashMap<String, Integer>();
        this.tagCounts = new ArrayList<Integer>();
        this.resources = new ArrayList<String>();
        this.resourceMap = new HashMap<String, Integer>();
        this.resourceCounts = new ArrayList<Integer>();
        this.users = new ArrayList<String>();
        this.userMap = new HashMap<String, Integer>();
        this.userCounts = new ArrayList<Integer>();
        if (stemming) {
            this.stemmer = new EnglishStemmer();
        }
    }

    /**
     * Reads {@code ./data/csv/<filename>.txt}.
     *
     * @param filename file name without directory and without the {@code .txt} extension
     * @return {@code true} if the file was read completely, {@code false} on any error
     */
    public boolean readFile(String filename) {
        return doReadFile(null, filename);
    }

    /**
     * Reads the file at {@code path + filename}; nothing is appended to the name.
     *
     * @param path     prefix that is concatenated with {@code filename}; if {@code null}, the
     *                 default location {@code ./data/csv/<filename>.txt} is used
     * @param filename file name
     * @return {@code true} if the file was read completely, {@code false} on any error
     */
    public boolean readFile(String path, String filename) {
        return doReadFile(path, filename);
    }

    /**
     * Parses the file line by line as UTF-8. A parsed record is held back and handed to
     * {@link #processUserData} only when the next valid line (or the end of the file) is
     * reached, so the size printed in the "Line too short" message does not yet include the
     * record preceding the short line.
     */
    private boolean doReadFile(String path, String filename) {
        String filePath;
        if (path == null) {
            filePath = "./data/csv/" + filename + ".txt";
        } else {
            filePath = path + filename;
        }
        try {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(filePath)), "UTF8"));
            try {
                // Scratch lists for the pending record; named differently from the fields
                // so that they no longer shadow this.tags / this.categories.
                List<String> lineCategories = new ArrayList<String>();
                List<String> lineTags = new ArrayList<String>();
                Bookmark userData = null; // null means "no pending record yet"
                String userID = "", resID = "", timestamp = "";
                String line;
                while ((line = br.readLine()) != null) {
                    String[] lineParts = line.split("\";\"");
                    if (lineParts.length < 4) {
                        System.out.println("Line too short: " + this.userLines.size());
                        continue;
                    }
                    // Flush the previous record before its scratch lists are cleared.
                    processUserData(userID, userData, lineTags, lineCategories, resID);

                    userID = lineParts[0].replace("\"", "");
                    resID = lineParts[1].replace("\"", "");
                    timestamp = lineParts[2].replace("\"", "");
                    userData = new Bookmark(-1, -1, timestamp);
                    lineCategories.clear();
                    lineTags.clear();

                    for (String rawTag : lineParts[3].replace("\"", "").split(",")) {
                        String tag = rawTag.toLowerCase();
                        if (tag.isEmpty()) {
                            continue;
                        }
                        if (this.stemmer != null) {
                            this.stemmer.setCurrent(tag);
                            this.stemmer.stem();
                            tag = this.stemmer.getCurrent();
                        }
                        // De-duplicate AFTER stemming: two different surface forms with the
                        // same stem must yield one tag per bookmark, regardless of order.
                        if (!lineTags.contains(tag)) {
                            lineTags.add(tag);
                        }
                    }
                    if (lineParts.length > 4) { // optional category field
                        for (String cat : lineParts[4].replace("\"", "").split(",")) {
                            if (!cat.isEmpty()) {
                                lineCategories.add(cat.toLowerCase());
                            }
                        }
                    }
                    // TODO extend common/Bookmark with fields for rating (= lineParts[5]),
                    // title (= lineParts[6]) and description (= lineParts[7]).
                }
                processUserData(userID, userData, lineTags, lineCategories, resID); // last record
            } finally {
                // Close on every path so the file handle is released even after a failure.
                br.close();
            }
            return true;
        } catch (Exception e) {
            System.out.println("ERROR");
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Registers one parsed record: updates the timestamp range, maps user, resource, tags and
     * categories to indices, and appends the bookmark to the bookmark list.
     *
     * <p>A record whose timestamp is non-empty but not numeric is reported and dropped.
     *
     * @param userID     raw user identifier of the record
     * @param userData   bookmark to complete, or {@code null} if there is no pending record
     * @param tags       tag names of the record
     * @param categories category names of the record
     * @param resID      raw resource identifier of the record
     */
    private void processUserData(String userID, Bookmark userData, List<String> tags, List<String> categories, String resID) {
        // The pending-record check is done on the bookmark itself. Comparing userID against
        // the "" literal with != is a reference comparison and is not reliable for strings
        // that were produced at runtime.
        if (userData == null) {
            return;
        }
        if (!userData.getTimestamp().isEmpty()) {
            if (!StringUtils.isNumeric(userData.getTimestamp())) {
                System.out.println("Invalid timestamp");
                return;
            }
            long timestamp = userData.getTimestampAsLong();
            // Two independent comparisons: the very first timestamp is both the minimum and
            // the maximum seen so far.
            if (timestamp < this.firstTimestamp) {
                this.firstTimestamp = timestamp;
            }
            if (timestamp > this.lastTimestamp) {
                this.lastTimestamp = timestamp;
            }
            this.hasTimestamp = true;
        }

        boolean doCount = (this.countLimit == 0 || this.userLines.size() < this.countLimit);

        Integer userIndex = register(userID, this.users, this.userMap, this.userCounts, doCount);
        userData.setUserID(userIndex);

        Integer resIndex = register(resID, this.resources, this.resourceMap, this.resourceCounts, doCount);
        userData.setWikiID(resIndex);

        for (String cat : categories) {
            int index = this.categories.indexOf(cat);
            if (index < 0) {
                this.categories.add(cat);
                index = this.categories.size() - 1;
            }
            userData.getCategories().add(index);
        }
        for (String tag : tags) {
            Integer tagIndex = register(tag, this.tags, this.tagMap, this.tagCounts, doCount);
            userData.getTags().add(tagIndex);
        }
        this.userLines.add(userData);
    }

    /**
     * Returns the index of {@code key}, assigning the next free index on first sight.
     * A new key starts with a count of 1 (or 0 if {@code doCount} is false); a known key has
     * its count incremented only if {@code doCount} is true.
     */
    private static Integer register(String key, List<String> names, Map<String, Integer> indexByName,
            List<Integer> counts, boolean doCount) {
        Integer index = indexByName.get(key);
        if (index == null) {
            names.add(key);
            counts.add(doCount ? 1 : 0);
            index = names.size() - 1;
            indexByName.put(key, index);
        } else if (doCount) {
            counts.set(index, counts.get(index) + 1);
        }
        return index;
    }

    /**
     * Records the user/resource pair of the bookmark and reports whether it was seen before.
     * Currently not called from within this class.
     */
    private boolean checkForDuplicate(Bookmark userData) {
        String pair = userData.getUserID() + "_" + userData.getResourceID();
        // Set.add returns false when the element was already present.
        return !this.userResPairs.add(pair);
    }

    // Getter + setter --------------------------------------------------------------------------------------------------------------------

    /**
     * Sums the number of tags over the stored bookmarks; with a non-zero count limit only the
     * first {@code countLimit} bookmarks are considered.
     */
    public int getTagAssignmentsCount() {
        int sum = 0;
        int count = 0;
        for (Bookmark data : this.userLines) {
            if (this.countLimit == 0 || count++ < this.countLimit) {
                sum += data.getTags().size();
            }
        }
        return sum;
    }

    /** Returns the internal (mutable) list of bookmarks in reading order. */
    public List<Bookmark> getBookmarks() {
        return this.userLines;
    }

    /** Replaces the internal list of bookmarks. */
    public void setBookmarks(List<Bookmark> userLines) {
        this.userLines = userLines;
    }

    /** Returns the test bookmarks, or {@code null} if none have been set. */
    public List<Bookmark> getTestLines() {
        return this.testLines;
    }

    /** Sets the test bookmarks. */
    public void setTestLines(List<Bookmark> userLines) {
        this.testLines = userLines;
    }

    /** Returns the category names; a category's index is its position in this list. */
    public List<String> getCategories() {
        return this.categories;
    }

    /** Returns the tag names; a tag's index is its position in this list. */
    public List<String> getTags() {
        return this.tags;
    }

    /** Returns the usage count per tag index. */
    public List<Integer> getTagCounts() {
        return this.tagCounts;
    }

    /** Returns the mapping from tag name to tag index. */
    public Map<String, Integer> getTagMap() {
        return this.tagMap;
    }

    /** Returns the resource identifiers; a resource's index is its position in this list. */
    public List<String> getResources() {
        return this.resources;
    }

    /** Returns the mapping from resource identifier to resource index. */
    public Map<String, Integer> getResourceMap() {
        return this.resourceMap;
    }

    /** Returns the usage count per resource index. */
    public List<Integer> getResourceCounts() {
        return this.resourceCounts;
    }

    /** Returns the user identifiers; a user's index is its position in this list. */
    public List<String> getUsers() {
        return this.users;
    }

    /** Returns the usage count per user index. */
    public List<Integer> getUserCounts() {
        return this.userCounts;
    }

    /** Returns the mapping from user identifier to user index. */
    public Map<String, Integer> getUserMap() {
        return this.userMap;
    }

    /** Returns the count limit passed to the constructor (0 = unlimited). */
    public int getCountLimit() {
        return this.countLimit;
    }

    /** Returns {@code true} once at least one bookmark with a numeric timestamp was read. */
    public boolean hasTimestamp() {
        return this.hasTimestamp;
    }

    /**
     * Returns the smallest timestamp read, multiplied by 1000 and wrapped in a {@link Date}.
     * The value is only meaningful if {@link #hasTimestamp()} is {@code true}.
     */
    public Date getFirstTimestamp() {
        return new Date(this.firstTimestamp * 1000);
    }

    /**
     * Returns the largest timestamp read, multiplied by 1000 and wrapped in a {@link Date}.
     * The value is only meaningful if {@link #hasTimestamp()} is {@code true}.
     */
    public Date getLastTimestamp() {
        return new Date(this.lastTimestamp * 1000);
    }

    /**
     * Collects the distinct user indices of all bookmarks from position {@code trainSize} on.
     *
     * @param trainSize index of the first bookmark to consider; -1 is treated as 0
     * @return the distinct user indices in unspecified order
     */
    public List<Integer> getUniqueUserListFromTestSet(int trainSize) {
        Set<Integer> userList = new HashSet<Integer>();
        if (trainSize == -1) {
            trainSize = 0;
        }
        for (int i = trainSize; i < this.userLines.size(); i++) {
            Bookmark data = getBookmarks().get(i);
            userList.add(data.getUserID());
        }
        return new ArrayList<Integer>(userList);
    }

    /**
     * Groups the resource indices of all bookmarks from position {@code trainSize} on by user.
     *
     * @param trainSize index of the first bookmark to consider; -1 is treated as 0
     * @return map from user index to that user's resource indices in bookmark order
     */
    public Map<Integer, List<Integer>> getResourcesOfTestUsers(int trainSize) {
        Map<Integer, List<Integer>> resourcesMap = new HashMap<Integer, List<Integer>>();
        if (trainSize == -1) {
            trainSize = 0;
        }
        for (int i = trainSize; i < getBookmarks().size(); i++) {
            Bookmark data = getBookmarks().get(i);
            int userID = data.getUserID();
            List<Integer> userResources = resourcesMap.get(userID);
            if (userResources == null) {
                userResources = new ArrayList<Integer>();
                resourcesMap.put(userID, userResources);
            }
            userResources.add(data.getResourceID());
        }
        return resourcesMap;
    }
}