package file.preprocessing; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.sql.Timestamp; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import common.Bookmark; import common.DBManager; public class BibsonomyProcessor { public static boolean processFile(String dir, String inputFile, String outputFile) { try { readBibtexFile(dir); readBookmarkFile(dir); FileReader reader = new FileReader(new File("./data/csv/bib_core/" + inputFile)); FileWriter writer = new FileWriter(new File("./data/csv/bib_core/" + outputFile)); BufferedReader br = new BufferedReader(reader); BufferedWriter bw = new BufferedWriter(writer); String line = null; String resID = "", userHash = "", timestamp = ""; Set<String> tags = new LinkedHashSet<String>(); while ((line = br.readLine()) != null) { String[] lineParts = line.split("\t"); String tag = lineParts[1].toLowerCase(); String type = lineParts[3]; if (type.equals("2") && !tag.isEmpty() && !tag.equals("no-tag") && !tag.contains("-import") && !tag.contains("-export") && !tag.contains("sys:") && !tag.contains("system:") && !tag.contains("imported")) { if (!resID.isEmpty() && !userHash.isEmpty() && (!resID.equals(lineParts[2]) || !userHash.equals(lineParts[0]))) { if (resID != null) { BibBookmark bookmark = (type.equals("1") ? getBookmark(resID) : getBibtex(resID)); if (bookmark != null) { writeLine(bw, bookmark.urlHash, userHash, timestamp, tags, bookmark); } } tags.clear(); } resID = lineParts[2]; userHash = lineParts[0]; timestamp = lineParts[4]; tags.add(tag); } } writeLine(bw, resID, userHash, timestamp, tags, null); br.close(); bw.flush(); bw.close(); return true; } catch (Exception e) { e.printStackTrace(); } return false; } public static boolean processUnsortedFile(String dir, String inputFile, String outputFile) { DBManager manager = new DBManager("bibsonomy"); //readBookmarkFile(dir); //readBibtexFile(dir); bookmarkFile = readDatabaseTable(manager, "bookmark", "url_hash"); bibtexFile = readDatabaseTable(manager, "bibtex", "simhash1"); int lineCount = 0; Map<String, Integer> resources = new LinkedHashMap<String, Integer>(); Map<String, Set<String>> tagMap = new LinkedHashMap<String, Set<String>>(); List<String> timestamps = new ArrayList<String>(); List<BibBookmark> bookmarks = new ArrayList<BibBookmark>(); try { FileInputStream reader = new FileInputStream(new File("./data/csv/" + dir + inputFile)); BufferedReader br = new BufferedReader(new InputStreamReader(reader)); FileOutputStream writer = new FileOutputStream(new File("./data/csv/" + dir + outputFile + ".txt")); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(writer)); String line = null; BibBookmark bookmark = null; String resID = "", userHash = "", timestamp = "", tag = "", type = "", resHash = ""; while ((line = br.readLine()) != null) { String[] lineParts = line.split("\t"); type = lineParts[3]; //if (!type.equals("2")) { // skip bookmark or bibtex! // continue; //} lineCount++; timestamp = lineParts[4]; userHash = lineParts[0]; if (type.equals("1")) { bookmark = getBookmark(lineParts[2]); if (bookmark == null) { //bookmark = new BibBookmark(); //bookmark.urlHash = lineParts[2]; continue; } resHash = bookmark.urlHash;// + "1"; if (resources.containsKey(resHash)) { resID = resources.get(resHash).toString(); } else { int id = Integer.parseInt(lineParts[2]); resources.put(resHash, id); resID = Integer.toString(id); } } else if (type.equals("2")) { bookmark = getBibtex(lineParts[2]); if (bookmark == null) { //bookmark = new BibBookmark(); //bookmark.urlHash = lineParts[2]; continue; } resHash = bookmark.urlHash;// + "2"; if (resources.containsKey(resHash)) { resID = resources.get(resHash).toString(); } else { int id = Integer.parseInt(lineParts[2]); resources.put(resHash, id); resID = Integer.toString(id); } } if (resID != null) { tag = lineParts[1].toLowerCase(); Set<String> tags = tagMap.get(userHash + "_" + resHash); if (tags == null) { tags = new LinkedHashSet<String>(); tagMap.put(userHash + "_" + resHash, tags); timestamps.add(timestamp); bookmarks.add(bookmark); } //if (!tag.isEmpty() && !tag.equals("no-tag") && !tag.contains("-import") && !tag.contains("-export") && !tag.contains("sys:") && !tag.contains("system:") && !tag.contains("imported")) { tags.add(tag); //} } } int i = 0; for (Map.Entry<String, Set<String>> entry : tagMap.entrySet()) { Set<String> tags = entry.getValue(); if (tags.size() > 0) { String[] parts = entry.getKey().split("_"); if (parts.length >= 2) { userHash = parts[0]; resID = parts[1]; timestamp = timestamps.get(i); bookmark = bookmarks.get(i); i++; writeLine(bw, resID, userHash, timestamp, tags, null/*bookmark*/); // do not use content here! } } } br.close(); bw.flush(); bw.close(); System.out.println("TAS with bib: " + lineCount); return true; } catch (Exception e) { e.printStackTrace(); } return false; } private static boolean writeLine(BufferedWriter bw, String resID, String userHash, String timestamp, Set<String> tags, BibBookmark bookmark) { try { String tagString = ""; for (String tag : tags) { tagString += (tag + ","); } tagString = tagString.length() > 0 ? tagString.substring(0, tagString.length() - 1) : ""; bw.write("\"" + userHash + "\";\"" + resID + "\";\"" + processTimestamp(timestamp) + "\";\"" + tagString + "\";\"\";\"\""); if (bookmark != null) { bw.write(";\"" + bookmark.title + "\";\"" + bookmark.desc + (bookmark.extDesc != "" ? (" " + bookmark.extDesc) : "")+ "\""); } bw.write("\n"); return true; } catch (IOException e) { e.printStackTrace(); } return false; } private static long processTimestamp(String timestamp) { return Timestamp.valueOf(timestamp).getTime() / 1000; // because of seconds } private static Map<String, BibBookmark> bookmarkFile; private static Map<String, BibBookmark> readDatabaseTable(DBManager manager, String tableName, String urlFieldName) { Map<String, BibBookmark> bMap = new LinkedHashMap<String, BibBookmark>(); List<BibBookmark> bookmarks = manager.getBibBookmarks(tableName, "content_id", urlFieldName); for (BibBookmark b : bookmarks) { bMap.put(b.id, b); } return bMap; } private static void readBookmarkFile(String dir) { bookmarkFile = new LinkedHashMap<String, BibBookmark>(); String line = null; try { FileReader bookmarkReader = new FileReader(new File("./data/csv/" + dir + "bookmark")); BufferedReader bookmarkBr = new BufferedReader(bookmarkReader); while ((line = bookmarkBr.readLine()) != null) { String[] lineParts = line.split("\t"); BibBookmark bookmark = new BibBookmark(); if (lineParts.length >= 2) { bookmark.urlHash = lineParts[1]; if (lineParts.length >= 3) { bookmark.title = lineParts[2].replace("\n", "").replace(";", "").replace("\r", ""); if (lineParts.length >= 4) { bookmark.desc = lineParts[3].replace("\n", "").replace(";", "").replace("\r", ""); if (lineParts.length >= 5) { bookmark.extDesc = lineParts[4].replace("\n", "").replace(";", "").replace("\r", ""); } } } bookmarkFile.put(lineParts[0], bookmark); } } bookmarkBr.close(); } catch (Exception e) { e.printStackTrace(); } System.out.println("Bookmark lines: " + bookmarkFile.size()); } private static Map<String, BibBookmark> bibtexFile; private static void readBibtexFile(String dir) { bibtexFile = new LinkedHashMap<String, BibBookmark>(); String line = null; int lineCount = 0; try { FileReader bookmarkReader = new FileReader(new File("./data/csv/" + dir + "bibtex")); BufferedReader bookmarkBr = new BufferedReader(bookmarkReader); while ((line = bookmarkBr.readLine()) != null) { lineCount++; String[] lineParts = line.split("\t"); BibBookmark bookmark = new BibBookmark(); if (lineParts.length >= 29) { bookmark.urlHash = lineParts[28]; if (lineParts.length >= 19) { bookmark.desc = lineParts[18].replace("\n", "").replace(";", "").replace("\r", ""); if (bookmark.desc.equals("\\N")) { bookmark.desc = ""; } if (lineParts.length >= 27) { bookmark.extDesc = lineParts[26].replace("\n", "").replace(";", "").replace("\r", ""); if (bookmark.extDesc.equals("\\N")) { bookmark.extDesc = ""; } if (lineParts.length >= 32) { bookmark.title = lineParts[31].replace("\n", "").replace(";", "").replace("\r", ""); } if (bookmark.title.equals("\\N")) { bookmark.title = ""; } } } bibtexFile.put(lineParts[0], bookmark); } } bookmarkBr.close(); } catch (Exception e) { e.printStackTrace(); } System.out.println("Bibtex lines: " + bibtexFile.size()); System.out.println("Bibtex line-count: " + lineCount); } private static BibBookmark getBookmark(String resID) { if (bookmarkFile != null) { return bookmarkFile.get(resID); } else { return null; } } private static BibBookmark getBibtex(String resID) { if (bibtexFile != null) { return bibtexFile.get(resID); } else { return null; } } }