package de.lemo.dms.processing;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Computes frequent paths in a set of user paths with an apriori-style
 * algorithm.
 * <p>
 * A user path is a sequence of integer values. A candidate path is counted as
 * supported by a user path if its values occur in the user path in the same
 * order; the values do not have to be adjacent (see
 * {@link #isSubsequence(int[], int[])}).
 * <p>
 * Each instance represents one frequent path of length k together with the two
 * frequent paths of length k-1 it was built from. Instances are only created
 * inside this class.
 * <p>
 * Timing information of the most recent {@link #apriori(List, double)} call is
 * kept in a static list that is not synchronized, so concurrent calls will
 * interleave their timestamps.
 */
public class FrequentPath {

    /** Input file used by {@link #main(String[])} when no file argument is given. */
    private static final File TESTFILE = new File(
            "/Users/forte/eclipse/SPM/data/bwl.csv");

    /**
     * Timestamps (milliseconds) recorded during the most recent call of
     * {@link #apriori(List, double)}: start, end of initialization, one entry
     * per computed path length, and the end of the computation.
     */
    private static final List<Long> TIMESTAMPS = new ArrayList<Long>();

    /** The path as array of int values. */
    private final int[] p;

    /** Frequent path consisting of the leftmost k-1 values; null for length 1. */
    private final FrequentPath left;

    /** Frequent path consisting of the rightmost k-1 values; null for length 1. */
    private final FrequentPath right;

    private FrequentPath(int[] p, FrequentPath left, FrequentPath right) {
        this.p = p;
        this.left = left;
        this.right = right;
    }

    /**
     * Compute frequent paths using the apriori algorithm.
     * <p>
     * The number of user paths that must support a path is
     * {@code ceil(support * userPaths.size())}, but at least 1. Without the
     * lower bound a support of 0 (or less) would yield a threshold that can
     * never be met by counting, and the result would silently be empty.
     *
     * @param userPaths
     *            list of users paths
     * @param support
     *            required fraction of user paths (0..1) that must contain a
     *            path for it to be frequent
     * @return list of lists of frequent paths, starting with length 1; empty
     *         if there is no frequent path
     */
    public static List<List<List<Integer>>> apriori(
            List<List<Integer>> userPaths, double support) {
        // forget the timestamps of earlier calls; otherwise the list grows
        // without bound and printStatistics reports stale values
        TIMESTAMPS.clear();
        recordTimestamp();

        // number of required user paths, never less than one
        int minpaths = Math.max(1,
                (int) Math.ceil(support * userPaths.size()));

        List<List<List<Integer>>> results = new ArrayList<List<List<Integer>>>();

        // alternative representation of user paths as int[]
        List<int[]> ups = aprioriInitialize(userPaths);
        recordTimestamp();

        // frequent paths of length 1
        List<FrequentPath> fps = aprioriLength1(userPaths, minpaths);

        // iteration: compute paths of length k+1 from paths of length k
        while (aprioriNewPaths(fps, results)) {
            recordTimestamp();
            fps = aprioriLengthK(fps, ups, minpaths);
        }
        recordTimestamp();
        return results;
    }

    /** Appends the current time in milliseconds to {@link #TIMESTAMPS}. */
    private static void recordTimestamp() {
        TIMESTAMPS.add(Long.valueOf(System.currentTimeMillis()));
    }

    /**
     * Transform path representation from {@code List<Integer>} to
     * {@code int[]}.
     */
    private static List<int[]> aprioriInitialize(List<List<Integer>> userPaths) {
        List<int[]> ups = new ArrayList<int[]>(userPaths.size());
        for (List<Integer> path : userPaths) {
            int[] values = new int[path.size()];
            int i = 0;
            for (Integer x : path) {
                values[i++] = x.intValue();
            }
            ups.add(values);
        }
        return ups;
    }

    /**
     * Create frequent paths of length 1: first compute the set of values that
     * occur in any user path, then create a frequent path for each value whose
     * support is sufficient.
     */
    private static List<FrequentPath> aprioriLength1(
            List<List<Integer>> userPaths, int minpaths) {
        Set<Integer> values = new HashSet<Integer>();
        for (List<Integer> path : userPaths) {
            values.addAll(path);
        }
        List<FrequentPath> fps = new ArrayList<FrequentPath>();
        for (Integer x : values) {
            if (isSupported(x, userPaths, minpaths)) {
                fps.add(new FrequentPath(new int[] { x.intValue() }, null,
                        null));
            }
        }
        return fps;
    }

    /**
     * Tests whether value x occurs in at least minpaths user paths. Counting
     * stops as soon as the threshold is reached.
     */
    private static boolean isSupported(Integer x,
            List<List<Integer>> userPaths, int minpaths) {
        int npaths = 0;
        for (List<Integer> path : userPaths) {
            if (path.contains(x)) {
                npaths++;
                if (npaths >= minpaths) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Given frequent paths of length k, create frequent paths of length k+1.
     * <p>
     * If the rightmost k-1 values of a frequent path (head) coincide with the
     * leftmost k-1 values of another one (tail), then an overlay of head with
     * tail is a new candidate of length k+1. The check is done by comparing
     * object references ({@code head.right == tail.left}), which relies on
     * every frequent path of a given length being represented by exactly one
     * object. For k = 1 both references are null, so every pair of values is a
     * candidate.
     * <p>
     * Only candidates built from two frequent paths are tested against the
     * user paths, which keeps the number of tests small.
     */
    private static List<FrequentPath> aprioriLengthK(List<FrequentPath> fps,
            List<int[]> ups, int minpaths) {
        List<FrequentPath> next = new ArrayList<FrequentPath>();
        for (FrequentPath head : fps) {
            int k = head.p.length;
            for (FrequentPath tail : fps) {
                if (head.right == tail.left) {
                    // candidate = all values of head + last value of tail
                    int[] candidate = new int[k + 1];
                    System.arraycopy(head.p, 0, candidate, 0, k);
                    candidate[k] = tail.p[k - 1];
                    if (isSupported(candidate, ups, minpaths)) {
                        next.add(new FrequentPath(candidate, head, tail));
                    }
                }
            }
        }
        return next;
    }

    /**
     * Tests whether path p1 is contained in at least minpaths user paths.
     * Counting stops as soon as the threshold is reached.
     */
    private static boolean isSupported(int[] p1, List<int[]> ups, int minpaths) {
        int npaths = 0;
        for (int[] p2 : ups) {
            if (isSubsequence(p1, p2)) {
                npaths++;
                if (npaths >= minpaths) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Tests whether the values of path occur in userPath in the same order
     * (gaps between matched values are allowed). Both arrays are scanned from
     * right to left; the scan is abandoned as soon as the unread part of
     * userPath is shorter than the unmatched part of path.
     */
    private static boolean isSubsequence(int[] path, int[] userPath) {
        int i = path.length - 1;
        for (int j = userPath.length - 1; i >= 0 && j >= i; j--) {
            if (path[i] == userPath[j]) {
                i--;
            }
        }
        return i < 0;
    }

    /**
     * Transforms the representation of the given FrequentPath objects to
     * {@code List<Integer>} and appends them as one new list to results.
     *
     * @return false (results unchanged) if fps is empty, true otherwise
     */
    private static boolean aprioriNewPaths(List<FrequentPath> fps,
            List<List<List<Integer>>> results) {
        if (fps.isEmpty()) {
            return false;
        }
        List<List<Integer>> paths = new ArrayList<List<Integer>>(fps.size());
        for (FrequentPath fp : fps) {
            List<Integer> path = new ArrayList<Integer>(fp.p.length);
            for (int value : fp.p) {
                path.add(Integer.valueOf(value));
            }
            paths.add(path);
        }
        results.add(paths);
        return true;
    }

    /**
     * TESTING
     *
     * @param args
     *            support value (in %), optionally followed by the CSV file to
     *            read; without the second argument the built-in test file is
     *            used
     * @throws Exception
     *             if the arguments are invalid or the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1 || args.length > 2) {
            throw new IllegalArgumentException(
                    "Usage: java de.lemo.dms.processing.FrequentPath <support> [<csv file>]");
        }
        double support = Double.parseDouble(args[0]) / 100;
        File input = (args.length == 2) ? new File(args[1]) : TESTFILE;
        List<List<Integer>> userPaths = readUserPaths(input);
        List<List<List<Integer>>> frequentPaths = apriori(userPaths, support);
        printStatistics(support, userPaths, frequentPaths);
        // printPaths(userPaths, userPaths.size() + " user paths");
        // int k = 1;
        // for (List<List<Integer>> paths : frequentPaths) {
        // printPaths(paths, paths.size() + " frequent paths of length " + k);
        // k++;
        // }
    }

    /**
     * Reads user paths from a text file with one path per line, values
     * separated by commas. Blank lines are skipped and whitespace around
     * values is ignored. The reader is closed even if reading or parsing
     * fails.
     */
    private static List<List<Integer>> readUserPaths(File file)
            throws Exception {
        List<List<Integer>> userPaths = new ArrayList<List<Integer>>();
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().length() == 0) {
                    continue;
                }
                List<Integer> path = new ArrayList<Integer>();
                for (String value : line.split(",")) {
                    path.add(Integer.valueOf(value.trim()));
                }
                userPaths.add(path);
            }
        } finally {
            br.close();
        }
        return userPaths;
    }

    /**
     * Prints the number of user paths, the support and the time spent per
     * path length, based on the timestamps recorded by the preceding call of
     * {@link #apriori(List, double)}.
     */
    private static void printStatistics(double support,
            List<List<Integer>> userPaths,
            List<List<List<Integer>>> frequentPaths) {
        System.out.println(userPaths.size() + " user paths");
        System.out.println("support " + (support * 100) + " %");
        int i = 0;
        long previous = TIMESTAMPS.get(i++).longValue();
        long current = TIMESTAMPS.get(i++).longValue();
        long elapsed = current - previous;
        long total = elapsed;
        previous = current;
        System.out.println("time for initialization: " + elapsed + " ms");
        int k = 1;
        for (List<List<Integer>> paths : frequentPaths) {
            current = TIMESTAMPS.get(i++).longValue();
            elapsed = current - previous;
            total += elapsed;
            previous = current;
            System.out.println("time for " + paths.size()
                    + " frequent paths of length " + k + ": " + elapsed
                    + " ms");
            k++;
        }
        System.out.println("total time: " + total + " ms");
    }

    /** Debugging helper: prints a headline followed by one path per line. */
    private static void printPaths(List<List<Integer>> userPaths, String str) {
        System.out.println(str);
        for (List<Integer> path : userPaths) {
            for (Integer x : path) {
                System.out.print(" " + x.intValue());
            }
            System.out.println();
        }
    }

    /**
     * Debugging helper: prints the values of a path separated by commas; an
     * empty path is printed as an empty line.
     */
    private static void printPath(int[] p) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < p.length; i++) {
            if (i > 0) {
                line.append(',');
            }
            line.append(p[i]);
        }
        System.out.println(line);
    }

    /**
     * Debugging helper: prints the first n bits of a bit set as 0/1 string,
     * followed by the number of set bits.
     */
    private static void printBitSet(BitSet bs, int n) {
        for (int i = 0; i < n; i++) {
            System.out.print(bs.get(i) ? "1" : "0");
        }
        System.out.println(" " + bs.cardinality());
    }
}