package file.preprocessing;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class PintsProcessor {
// used for datasets from PINTS: Delicious (big) and Flickr
public static boolean processFile(String dir, String inputFile, String outputFile) {
Map<String, Set<String>> tagMap = new LinkedHashMap<String, Set<String>>();
List<String> timestamps = new ArrayList<String>();
try {
FileReader reader = new FileReader(new File("./data/csv/" + dir + "/" + inputFile));
FileWriter writer = new FileWriter(new File("./data/csv/" + dir + "/" + outputFile + ".txt"));
BufferedReader br = new BufferedReader(reader);
BufferedWriter bw = new BufferedWriter(writer);
String line = null;
String resID = "", userHash = "", timestamp = "", tag = "";
while ((line = br.readLine()) != null) {
String[] lineParts = line.split("\t");
if (lineParts.length < 4) {
continue;
}
timestamp = lineParts[0];
userHash = lineParts[1];
resID = lineParts[2];
tag = lineParts[3].toLowerCase();
if (!(!tag.isEmpty() && !tag.equals("no-tag") && !tag.contains("-import") && !tag.contains("-export") && !tag.contains("sys:") && !tag.contains("system:") && !tag.contains("imported"))) {
continue;
}
Set<String> tags = tagMap.get(userHash + "_" + resID);
if (tags == null) {
tags = new LinkedHashSet<String>();
tagMap.put(userHash + "_" + resID, tags);
timestamps.add(timestamp);
if (timestamps.size() % 100000 == 0) {
System.out.println("READ 100000 bookmarks");
}
}
tags.add(tag);
}
int i = 0;
for (Map.Entry<String, Set<String>> entry : tagMap.entrySet()) {
Set<String> tags = entry.getValue();
String[] parts = entry.getKey().split("_");
userHash = parts[0];
resID = parts[1];
timestamp = timestamps.get(i++);
writeLine(bw, resID, userHash, timestamp, tags);
if (i % 100000 == 0) {
System.out.println("WROTE 100000 bookmarks");
}
}
br.close();
bw.flush();
bw.close();
return true;
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
private static boolean writeLine(BufferedWriter bw, String resID, String userHash, String timestamp, Set<String> tags) {
try {
if (tags.size() == 0) {
return false;
}
String tagString = "";
for (String tag : tags) {
tagString += (tag + ",");
}
tagString = tagString.length() > 0 ? tagString.substring(0, tagString.length() - 1) : "";
bw.write("\"" + userHash + "\";\"" + resID + "\";\"" + processTimestamp(timestamp) + "\";\"" + tagString + "\";\"\"\n");
return true;
} catch (IOException e) {
e.printStackTrace();
}
return false;
}
private static long processTimestamp(String timestamp) {
return Timestamp.valueOf(timestamp).getTime() / 1000; // because of seconds
}
}