package ca.pfv.spmf.algorithms.sequentialpatterns.goKrimp;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

/**
 * This is an implementation of the GoKrimp and SeqKrimp algorithms.
 * <ul>
 * <li>GoKrimp: directly looks for compressing sequential patterns in a
 * database of sequences.</li>
 * <li>SeqKrimp: reads a set of candidate patterns and finds a good subset of
 * compressing sequential patterns.</li>
 * </ul>
 * For more information please refer to the paper "Mining Compressing
 * Sequential Patterns" in the journal Statistical Analysis and Data Mining.
 * <br/>
 * <br/>
 *
 * Copyright (c) 2014 Hoang Thanh Lam (TU Eindhoven and IBM Research), Toon
 * Calders (Université Libre de Bruxelles), Fabian Moerchen (Amazon.com inc)
 * and Dmitriy Fradkin (Siemens Corporate Research) <br/>
 * <br/>
 *
 * This file is part of the SPMF DATA MINING SOFTWARE
 * (http://www.philippe-fournier-viger.com/spmf). <br/>
 * <br/>
 *
 * SPMF is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version. <br/>
 * <br/>
 *
 * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details. <br/>
 * <br/>
 *
 * You should have received a copy of the GNU General Public License along with
 * SPMF. If not, see <http://www.gnu.org/licenses/>.
 *
 * @see DataReader
 * @see Event
 * @see MyPattern
 * @see SignTest
 * @author Hoang Thanh Lam (TU Eindhoven and IBM Research)
 */
public class AlgoGoKrimp {

    /** Natural logarithm of 2, used to convert natural logarithms to base 2. */
    private static final double LN2 = Math.log(2);

    /**
     * Map from an event id (the list index) to the index of its singleton
     * pattern in the dictionary. During {@link #initialization()} it
     * temporarily holds the occurrence count of each event instead.
     */
    ArrayList<Integer> characters;

    /** The database of sequences. */
    ArrayList<ArrayList<Event>> data;

    /** The set of patterns, i.e. the dictionary in this implementation. */
    ArrayList<MyPattern> patterns;

    /** The set of candidates currently being evaluated. */
    ArrayList<MyPattern> candidates;

    /** Event labels; when null or empty, raw event ids are printed. */
    HashMap<Integer, String> labels;

    /** Cache mapping an event to its related events. */
    HashMap<Integer, ArrayList<Integer>> related_events;

    /** Class label of each sequence. */
    ArrayList<Integer> classlabels;

    /** The number of encoded words. */
    int Nword;

    /** Size (in bits) of the compressed data. */
    double comp_size;

    /**
     * Size (in bits) of the uncompressed data (representation by the Huffman
     * codes).
     */
    double uncomp_size;

    /**
     * The maximum number of candidate events used as starting points for
     * extending to find compressing patterns.
     */
    static final int NSTART = 1000;

    /** The maximum number of related events returned for one event. */
    static final int NRELATED = 1000;

    /**
     * Object to write the output file. If null, results are printed to the
     * console; otherwise they are written to the file.
     */
    BufferedWriter writer;

    /**
     * Finds compressing patterns by greedily extending initial candidate
     * events. Each round extends every initial event as far as the benefit
     * keeps growing, then adds the best extended pattern to the dictionary and
     * removes its matches from the data. Stops when no pattern has a positive
     * benefit.
     *
     * @throws IOException
     *             if a pattern cannot be written or the writer cannot be closed
     */
    public void gokrimp() throws IOException {
        long startTime = System.currentTimeMillis();
        try {
            initialization();
            ArrayList<MyPattern> ie = get_Initial_Patterns();
            while (true) {
                MyPattern maxp = null;
                double max = Double.NEGATIVE_INFINITY;
                for (MyPattern seed : ie) {
                    // keep extending until no extension improves the benefit
                    MyPattern prev = seed;
                    MyPattern mp = seed;
                    while ((mp = extend(mp)) != null) {
                        prev = mp;
                    }
                    if (prev.ben > max) {
                        maxp = prev;
                        max = prev.ben;
                    }
                }
                if (max <= 0) {
                    break;
                }
                addPattern(maxp);
                printMyPattern(maxp);
                remove(maxp);
            }
        } finally {
            // close the writer even when mining fails, so it does not leak
            closeWriter();
        }
        printCompressionSummary();
        long totalTime = System.currentTimeMillis() - startTime;
        System.out.println("Running time: " + totalTime / 1000 + " seconds");
    }

    /**
     * Candidate based algorithm: searches for the best encoding of the data
     * given the set of candidates. Stops when the best remaining candidate has
     * no positive benefit or when no candidate is left.
     *
     * @throws IOException
     *             if a pattern cannot be written or the writer cannot be closed
     */
    public void seqkrimp() throws IOException {
        try {
            // stop once the candidates are exhausted; getBestPattern() cannot
            // be called on an empty candidate list
            while (!candidates.isEmpty()) {
                int mi = getBestPattern();
                MyPattern best = candidates.get(mi);
                if (!(best.ben > 0)) {
                    break;
                }
                MyPattern maxp = new MyPattern(best);
                addPattern(maxp);
                printMyPattern(maxp);
                remove(maxp);
                // remove the best candidate from the candidate list and reset
                // the statistics of the remaining ones for the next round
                candidates.remove(mi);
                for (MyPattern candidate : candidates) {
                    candidate.ben = 0;
                    candidate.freq = 0;
                    candidate.g_cost = 0;
                }
            }
        } finally {
            closeWriter();
        }
        printCompressionSummary();
    }

    /** Closes the output writer if one is set. */
    private void closeWriter() throws IOException {
        if (writer != null) {
            writer.close();
        }
    }

    /** Prints the compressed size, uncompressed size and their ratio. */
    private void printCompressionSummary() {
        System.out.println("Compressed size: " + comp_size
                + ", uncompressed size: " + uncomp_size
                + ", compression ratio: " + uncomp_size / (0.0 + comp_size));
    }

    /**
     * Initialization: resets the dictionary and candidate structures, counts
     * event occurrences, creates one singleton pattern per event id, computes
     * the initial (uncompressed) size and removes rare events from the data.
     */
    void initialization() {
        patterns = new ArrayList<MyPattern>();
        candidates = new ArrayList<MyPattern>();
        related_events = new HashMap<Integer, ArrayList<Integer>>();
        Nword = 0;
        comp_size = 0;
        // Temporarily, the characters map contains, for each event id, the
        // number of times the event occurs in the database.
        characters = new ArrayList<Integer>();
        for (ArrayList<Event> sequence : data) {
            for (Event event : sequence) {
                while (characters.size() <= event.id) {
                    characters.add(0);
                }
                characters.set(event.id, characters.get(event.id) + 1);
            }
            Nword += sequence.size();
        }
        Nword += 2 * characters.size();
        for (int i = 0; i < characters.size(); i++) {
            MyPattern mp = new MyPattern();
            mp.ids.add(i);
            mp.ben = 0;
            // plus 2 because the two occurrences of the singleton in the
            // dictionary are counted as well
            mp.freq = characters.get(i) + 2;
            mp.g_cost = 0;
            patterns.add(mp); // add the singleton to the dictionary
            comp_size += mp.freq * Math.log(Nword) / LN2 - mp.freq
                    * Math.log(mp.freq) / LN2;
            // the characters map now contains the index of the event's
            // singleton in the dictionary
            characters.set(i, patterns.size() - 1);
        }
        uncomp_size = comp_size;
        // remove occurrences of rare events, i.e. events whose singleton
        // frequency is below SignTest.N
        for (ArrayList<Event> sequence : data) {
            for (Iterator<Event> it = sequence.iterator(); it.hasNext();) {
                if (patterns.get(characters.get(it.next().id)).freq < SignTest.N) {
                    it.remove();
                }
            }
        }
    }

    /**
     * Adds a new pattern to the dictionary and updates the number of encoded
     * words, the compressed size and the frequencies of the single-event
     * patterns whose event appears in the new pattern.
     *
     * @param pattern
     *            the input pattern
     */
    void addPattern(MyPattern pattern) {
        // update the number of encoded words
        Nword = Nword - (pattern.freq - 1) * pattern.ids.size()
                + pattern.ids.size() + pattern.freq;
        comp_size -= pattern.ben; // update the compression size
        HashMap<Integer, Integer> hm = countEventOccurrences(pattern);
        // update the frequency of the existing single-event patterns
        for (MyPattern existing : patterns) {
            if (existing.ids.size() != 1) {
                continue;
            }
            Integer count = hm.get(existing.ids.get(0));
            if (count != null) {
                existing.freq -= (pattern.freq - 1) * count;
                existing.freq += count;
            }
        }
        patterns.add(pattern); // add the new pattern to the dictionary
    }

    /**
     * Counts how many times each event id occurs in the ids of a pattern.
     *
     * @param pattern
     *            the pattern to inspect
     * @return map from event id to its number of occurrences in the pattern
     */
    private HashMap<Integer, Integer> countEventOccurrences(MyPattern pattern) {
        HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
        for (int i = 0; i < pattern.ids.size(); i++) {
            Integer id = pattern.ids.get(i);
            Integer previous = counts.get(id);
            counts.put(id, previous == null ? 1 : previous + 1);
        }
        return counts;
    }

    /**
     * Extends the current pattern by one event. The candidate list is replaced
     * by the one-event extensions of the pattern.
     *
     * @param pattern
     *            the pattern to be extended
     * @return null if no extension gives additional compression benefit,
     *         otherwise the best extended pattern
     */
    MyPattern extend(MyPattern pattern) {
        // the extending events are those related to the pattern's last event
        ArrayList<Integer> ve = get_Extending_Events_SignTest(pattern.ids
                .get(pattern.ids.size() - 1));
        candidates.clear();
        for (Integer event : ve) {
            MyPattern can = new MyPattern();
            can.g_cost = 0;
            can.freq = 0;
            can.ben = 0;
            can.ids = new ArrayList<Integer>(pattern.ids);
            can.ids.add(event);
            candidates.add(can);
        }
        if (candidates.isEmpty()) {
            return null;
        }
        MyPattern best = candidates.get(getBestPattern());
        return best.ben > pattern.ben ? best : null;
    }

    /**
     * Gets the set of initial patterns: copies of the dictionary patterns with
     * frequency at least SignTest.N, truncated to at most {@link #NSTART}
     * entries after sorting.
     *
     * @return the set of initial patterns, each with a benefit of 0
     */
    ArrayList<MyPattern> get_Initial_Patterns() {
        ArrayList<MyPattern> ie = new ArrayList<MyPattern>();
        for (MyPattern pattern : patterns) {
            if (pattern.freq >= SignTest.N) { // only consider non-rare events
                MyPattern copy = new MyPattern(pattern);
                // the benefit temporarily carries the frequency for sorting;
                // NOTE(review): assumes MyPattern's natural ordering ranks by
                // ben - confirm against MyPattern.compareTo
                copy.ben = copy.freq;
                ie.add(copy);
            }
        }
        Collections.sort(ie);
        while (ie.size() > NSTART) {
            ie.remove(ie.size() - 1);
        }
        for (MyPattern pattern : ie) {
            pattern.ben = 0;
        }
        return ie;
    }

    /**
     * Gets the set of events being considered to extend a pattern; the sign
     * test is used to select such events. Results are cached per event.
     *
     * @param e
     *            the event whose related events are requested
     * @return the set of events being considered to extend a pattern
     */
    ArrayList<Integer> get_Extending_Events_SignTest(Integer e) {
        ArrayList<Integer> cached = related_events.get(e);
        if (cached != null) {
            return cached;
        }
        ArrayList<Integer> ve = getRelatedEvents(e);
        related_events.put(e, ve);
        return ve;
    }

    /**
     * Gets the best pattern among the set of candidates, i.e. the one giving
     * the smallest compressed size when added to the dictionary. The frequency
     * and gap cost of every candidate are accumulated in place, and the
     * benefit of the returned candidate is set. The candidate list must not be
     * empty.
     *
     * @return index of the best pattern in the candidates list
     */
    int getBestPattern() {
        int index = 0;
        double min = Double.POSITIVE_INFINITY;
        for (int i = 0; i < candidates.size(); i++) {
            MyPattern candidate = candidates.get(i);
            // get all the best matches of the candidate in every sequence
            for (ArrayList<Event> sequence : data) {
                ArrayList<ArrayList<Integer>> matches = getBestMatches(collectPositions(
                        candidate, sequence));
                candidate.freq += matches.size();
                candidate.g_cost += gap_cost(matches);
            }
            if (candidate.freq == 0) {
                continue; // the candidate does not occur in the data
            }
            // plus one because its occurrence in the dictionary counts too
            candidate.freq += 1;
            double com = get_Compress_Size_When_Adding(candidate);
            if (com < min) {
                min = com;
                index = i;
            }
        }
        candidates.get(index).ben = comp_size - min;
        return index;
    }

    /**
     * Collects, for every position of the pattern, the timestamps of the
     * events of the sequence carrying that position's event id.
     *
     * @param pattern
     *            the pattern whose event ids are looked up
     * @param sequence
     *            the sequence to scan
     * @return one timestamp list per pattern position, in sequence order
     */
    private ArrayList<ArrayList<Integer>> collectPositions(MyPattern pattern,
            ArrayList<Event> sequence) {
        // event id -> positions in the pattern where the id occurs
        HashMap<Integer, ArrayList<Integer>> slotsById = new HashMap<Integer, ArrayList<Integer>>();
        ArrayList<ArrayList<Integer>> pos = new ArrayList<ArrayList<Integer>>();
        for (int k = 0; k < pattern.ids.size(); k++) {
            Integer id = pattern.ids.get(k);
            ArrayList<Integer> slots = slotsById.get(id);
            if (slots == null) {
                slots = new ArrayList<Integer>();
                slotsById.put(id, slots);
            }
            slots.add(k);
            pos.add(new ArrayList<Integer>());
        }
        for (Event event : sequence) {
            ArrayList<Integer> slots = slotsById.get(event.id);
            if (slots != null) {
                for (Integer slot : slots) {
                    pos.get(slot).add(event.ts);
                }
            }
        }
        return pos;
    }

    /**
     * Removes all the best matches of the pattern from the data.
     *
     * @param pattern
     *            the pattern whose matches are removed
     */
    void remove(MyPattern pattern) {
        for (int j = 0; j < data.size(); j++) {
            ArrayList<ArrayList<Integer>> matches = getBestMatches(collectPositions(
                    pattern, data.get(j)));
            remove(matches, j);
        }
    }

    /**
     * Gets the compressed size of the data when the given pattern is added to
     * the current dictionary. Nothing is modified.
     *
     * @param pattern
     *            the pattern to evaluate; its freq and g_cost must be set
     * @return the compressed size (in bits) if the pattern were added
     */
    double get_Compress_Size_When_Adding(MyPattern pattern) {
        int new_Nword = Nword - (pattern.freq - 1) * pattern.ids.size()
                + pattern.ids.size() + pattern.freq;
        double com = comp_size;
        com += new_Nword * Math.log(new_Nword) / LN2 - Nword
                * Math.log(Nword) / LN2 - pattern.freq
                * Math.log(pattern.freq) / LN2;
        HashMap<Integer, Integer> hm = countEventOccurrences(pattern);
        for (Integer key : hm.keySet()) {
            int count = hm.get(key);
            int dictIndex = characters.get(key);
            int old_freq = patterns.get(dictIndex).freq;
            int new_freq = old_freq - count * pattern.freq + 2 * count;
            com -= new_freq * Math.log(new_freq) / LN2 - old_freq
                    * Math.log(old_freq) / LN2;
        }
        com += pattern.g_cost;
        return com;
    }

    /**
     * Returns the best matches of a pattern given the timestamps at which each
     * of its positions may occur. Matches are extracted greedily: the match
     * with the smallest total gap cost is found, its timestamps are removed
     * from every position list, and the search is repeated until some position
     * list is empty or no match remains. The lists in pos are modified.
     *
     * @param pos
     *            one list of candidate timestamps per pattern position
     * @return the matches, each a list of timestamps (one per position)
     */
    ArrayList<ArrayList<Integer>> getBestMatches(
            ArrayList<ArrayList<Integer>> pos) {
        ArrayList<ArrayList<Integer>> matches = new ArrayList<ArrayList<Integer>>();
        if (pos.isEmpty()) {
            return matches; // a pattern without positions cannot match
        }
        while (true) {
            ArrayList<ArrayList<Event>> matrix = buildGapCostMatrix(pos);
            // pick the cheapest way to end the match in the last row
            ArrayList<Event> lastRow = matrix.get(matrix.size() - 1);
            int min = Integer.MAX_VALUE;
            int mini = 0;
            for (int i = 0; i < lastRow.size(); i++) {
                if (min > lastRow.get(i).ts) {
                    min = lastRow.get(i).ts;
                    mini = i;
                }
            }
            if (min == Integer.MAX_VALUE) {
                break; // no complete match is left
            }
            // trace back to get the best match
            ArrayList<Integer> match = new ArrayList<Integer>();
            HashSet<Integer> used = new HashSet<Integer>();
            for (int i = matrix.size() - 1; i >= 0; i--) {
                Event cell = matrix.get(i).get(mini);
                match.add(0, cell.id);
                used.add(cell.id);
                mini = cell.gap;
            }
            matches.add(match);
            // the timestamps of the match cannot be used again
            for (ArrayList<Integer> timestamps : pos) {
                for (Iterator<Integer> it = timestamps.iterator(); it.hasNext();) {
                    if (used.contains(it.next())) {
                        it.remove();
                    }
                }
                if (timestamps.isEmpty()) {
                    return matches;
                }
            }
        }
        return matches;
    }

    /**
     * Builds the dynamic-programming table used by
     * {@link #getBestMatches(ArrayList)}. Event objects are reused as table
     * cells: id holds the timestamp, ts holds the smallest accumulated gap
     * cost of a partial match ending there (Integer.MAX_VALUE when none
     * exists) and gap holds the index of the predecessor cell in the previous
     * row.
     *
     * @param pos
     *            one list of candidate timestamps per pattern position
     * @return one row of cells per pattern position
     */
    private ArrayList<ArrayList<Event>> buildGapCostMatrix(
            ArrayList<ArrayList<Integer>> pos) {
        ArrayList<ArrayList<Event>> matrix = new ArrayList<ArrayList<Event>>();
        for (int i = 0; i < pos.size(); i++) {
            ArrayList<Event> row = new ArrayList<Event>();
            if (i == 0) {
                for (Integer timestamp : pos.get(0)) {
                    Event ww = new Event();
                    ww.ts = 0;
                    ww.id = timestamp;
                    ww.gap = 0;
                    row.add(ww);
                }
            } else {
                ArrayList<Event> previousRow = matrix.get(i - 1);
                for (Integer timestamp : pos.get(i)) {
                    int min = Integer.MAX_VALUE;
                    int mini = 0;
                    // only cells strictly before the timestamp can precede it;
                    // the scan stops at the first cell that is not earlier
                    for (int index = 0; index < previousRow.size()
                            && previousRow.get(index).id < timestamp; index++) {
                        Event previous = previousRow.get(index);
                        if (previous.ts == Integer.MAX_VALUE) {
                            continue; // unreachable cell
                        }
                        int g = previous.ts + bits(timestamp - previous.id);
                        if (g <= min) {
                            min = g;
                            mini = index;
                        }
                    }
                    Event ww = new Event();
                    ww.ts = min;
                    ww.id = timestamp;
                    ww.gap = mini;
                    row.add(ww);
                }
            }
            matrix.add(row);
        }
        return matrix;
    }

    /**
     * Gets the cost of encoding the set of gaps.
     *
     * @param matches
     *            the set of matches
     * @return the cost of encoding the gaps of the matches
     */
    int gap_cost(ArrayList<ArrayList<Integer>> matches) {
        int g = 0;
        for (ArrayList<Integer> match : matches) {
            for (int j = 1; j < match.size(); j++) {
                g += bits(match.get(j) - match.get(j - 1));
            }
        }
        return g;
    }

    /**
     * Removes all the matches from the sequence with the given index: every
     * event whose timestamp belongs to a match is deleted.
     *
     * @param matches
     *            the matches, each a list of timestamps
     * @param index
     *            the identifier of the sequence
     */
    void remove(ArrayList<ArrayList<Integer>> matches, int index) {
        HashSet<Integer> matched = new HashSet<Integer>();
        for (ArrayList<Integer> match : matches) {
            matched.addAll(match);
        }
        for (Iterator<Event> it = data.get(index).iterator(); it.hasNext();) {
            if (matched.contains(it.next().ts)) {
                it.remove();
            }
        }
    }

    /**
     * Gets the events related to the event e; the sign test is used to select
     * these events. For every sequence, the events following the first
     * occurrence of e are split at the middle timestamp of that remainder, and
     * each event id is scored +1 per occurrence in the first half and -1 per
     * occurrence in the second half. The per-sequence scores feed one SignTest
     * per event id.
     *
     * @param e
     *            the input event
     * @return the events passing the sign test, at most {@link #NRELATED}
     */
    ArrayList<Integer> getRelatedEvents(Integer e) {
        HashMap<Integer, SignTest> me = new HashMap<Integer, SignTest>(); // statistics
        HashMap<Integer, Integer> mc = new HashMap<Integer, Integer>(); // counter
        for (ArrayList<Event> sequence : data) {
            mc.clear();
            // index right after the first occurrence of e
            int next = sequence.size();
            for (int j = 0; j < sequence.size(); j++) {
                if (sequence.get(j).id == e.intValue()) {
                    next = j;
                    break;
                }
            }
            next++;
            if (next >= sequence.size()) {
                continue; // e is absent or nothing follows it
            }
            double middle = sequence.get(next).ts;
            middle = middle + (sequence.get(sequence.size() - 1).ts - middle)
                    / 2;
            for (int j = next; j < sequence.size(); j++) {
                Event event = sequence.get(j);
                // +1 in the first half, -1 in the second half
                int delta = event.ts <= middle ? 1 : -1;
                Integer previous = mc.get(event.id);
                mc.put(event.id, previous == null ? delta : previous + delta);
            }
            for (Integer key : mc.keySet()) {
                int score = mc.get(key);
                SignTest seen = me.get(key);
                SignTest st;
                if (seen == null) {
                    // NOTE(review): a first-seen event counts as one pair even
                    // when its score is 0, unlike the branches below - confirm
                    // this is intended
                    st = new SignTest(1, 0);
                } else if (score != 0) {
                    st = new SignTest(seen.Npairs + 1, seen.Nplus);
                } else {
                    st = new SignTest(seen.Npairs, seen.Nplus);
                }
                if (score > 0) {
                    st.Nplus = st.Nplus + 1;
                }
                me.put(key, st);
            }
        }
        ArrayList<Integer> results = new ArrayList<Integer>();
        for (Integer key : me.keySet()) {
            if (me.get(key).sign_test()) { // pass the sign test
                results.add(key);
            }
            if (results.size() >= NRELATED) {
                break; // never return more than NRELATED events
            }
        }
        return results;
    }

    /**
     * Computes the length of the Elias gamma code of an integer, i.e.
     * 2 * floor(log2(a)) + 1. The logarithm is computed with integer
     * arithmetic so the result is exact for every positive int.
     *
     * @param a
     *            an input integer
     * @return the number of bits of the Elias gamma code of a, or 0 when a is
     *         not positive
     */
    int bits(Integer a) {
        int value = a.intValue();
        if (value <= 0) {
            return 0;
        }
        int floorLog2 = 31 - Integer.numberOfLeadingZeros(value);
        return 2 * floorLog2 + 1; // elias gamma code
    }

    /**
     * Rounds a number down.
     *
     * @param x
     *            an input number
     * @return the lower round value of x
     */
    int lowround(double x) {
        int y = (int) Math.round(x);
        if (y > x) {
            y = y - 1;
        }
        return y;
    }

    /**
     * Checks if the pattern p occurs in the sequence with the given index,
     * i.e. if its event ids appear in order (gaps allowed).
     *
     * @param p
     *            the pattern to look for
     * @param index
     *            the index of the sequence
     * @return true if the pattern p is found in the sequence with the input
     *         index
     */
    boolean isOccurred(MyPattern p, int index) {
        ArrayList<Event> sequence = data.get(index);
        int d = 0;
        for (int i = 0; i < sequence.size() && d < p.ids.size(); i++) {
            // compare numerically, never by boxed reference
            if (p.ids.get(d).intValue() == sequence.get(i).id) {
                d++;
            }
        }
        return d == p.ids.size();
    }

    /**
     * Prints the sequence database to the console, one sequence per line, with
     * every event id incremented by one and followed by " -1 ", and "-2" at
     * the end of the line.
     */
    void printData() {
        for (ArrayList<Event> sequence : data) {
            for (Event event : sequence) {
                System.out.print((event.id + 1) + " -1 ");
            }
            System.out.print("-2");
            System.out.println();
        }
    }

    /**
     * Prints a pattern, either to the console (when no writer is set) or to
     * the output file. Events are printed by label when labels are available
     * and by id otherwise, followed by the benefit of the pattern.
     *
     * @param pattern
     *            the pattern to print
     * @throws IOException
     *             if writing to the output file fails
     */
    void printMyPattern(MyPattern pattern) throws IOException {
        boolean useIds = labels == null || labels.isEmpty();
        StringBuilder line = new StringBuilder();
        for (int j = 0; j < pattern.ids.size(); j++) {
            if (useIds) {
                line.append(pattern.ids.get(j));
            } else {
                line.append(labels.get(pattern.ids.get(j)));
            }
            line.append(" ");
        }
        line.append(" #SUP: ").append(pattern.ben);
        if (writer == null) { // print to the console
            System.out.println(line);
        } else { // save to file
            writer.write(line.toString());
            writer.newLine();
        }
    }

    /**
     * Sets the file the patterns are written to. A writer opened by a previous
     * call is closed first.
     *
     * @param outputFilePath
     *            the path of the output file
     * @throws IOException
     *             if the file cannot be opened or the previous writer cannot
     *             be closed
     */
    public void setOutputFilePath(String outputFilePath) throws IOException {
        closeWriter();
        writer = new BufferedWriter(new FileWriter(outputFilePath));
    }
}