package ca.pfv.spmf.algorithms.frequentpatterns.dci_closed_optimized;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
/**
* This is the optimized implementation of the "DCI_Closed" algorithm.
* The DCI_Closed algorithm finds all closed itemsets in a transaction database. <br/><br/>
*
* DCI_Closed was initially proposed in this article:
* <br/><br/>
*
* Lucchese, C., Orlando, S. & Perego, Raffaele (2004), DCI_Closed: a fast and memory efficient
* algorithm to mine frequent closed itemsets, Proc. 2nd IEEE ICDM Workshop on Frequent Itemset
* Mining Implementations at ICDM 2004.
* <br/><br/>
*
* Note: My implementation assumes that there is no item named "0".
* <br/><br/>
*
* My implementation includes several optimizations:<br/>
* - the use of a bit matrix (as described in the TKDE paper)<br/>
* - projecting the database (as described in the TKDE paper)<br/>
* - intersecting bit by bit and stop at first different bit for inclusion check (similar to what is described in the TKDE paper, but check bits instead of words)
* <br/><br/>
*
* But more optimizations could be done:<br/>
* - intersecting word by word and stop at first different word for inclusion check (described in the TKDE paper)<br/>
* - reorder columns of the matrix (described in the TKDE paper)<br/>
* - reusing results of previous bitwise intersections (described in the TKDE paper)<br/>
* - switching to a breadth-first DCI-like approach for dense datasets (as described in the TKDE paper)<br/>
* - ...<br/>
* - remove elements from postsets and use a linkedlist for postsets.<br/>
* - closedset could be an array.<br/>
* - etc.<br/><br/>
*
* Some of these further optimizations would need to use a custom BitSet class
* instead of the BitSet class of Java, because the BitSet class of Java does not let us
* iterate over the words inside the BitSet directly.
*
* @see BitMatrix
* @author Philippe Fournier-Viger
*/
public class AlgoDCI_Closed_Optimized {

    // number of closed itemsets found
    int closedCount = 0;

    // the number of transactions in the transaction database
    int tidsCount = 0;

    // the largest item in the transaction database
    int maxItemId = 1;

    // minimum support threshold set by the user (compared against tidset cardinalities)
    private int minSuppRelative;

    // object to write the output file
    BufferedWriter writer = null;

    /**
     * Default constructor
     */
    public AlgoDCI_Closed_Optimized() {
    }

    /**
     * Run the algorithm.
     * <p>
     * The input file is read twice: once to learn the number of transactions and the
     * largest item id, and once to fill the bit matrix. The output writer is always
     * closed, even if reading or mining fails.
     *
     * @param input the path of an input file (transaction database).
     * @param output the path of the output file for writing the result
     * @param minsup a minimum support threshold (number of transactions)
     * @throws IOException exception if error while writing/reading files
     */
    public void runAlgorithm(String input, String output, int minsup) throws IOException {
        // record start time
        long startTimestamp = System.currentTimeMillis();

        // reset number of itemsets found
        closedCount = 0;

        System.out.println("Running the DCI-Closed algorithm");

        // Prepare object to write the output file
        writer = new BufferedWriter(new FileWriter(output));
        try {
            // save the minimum support
            this.minSuppRelative = minsup;

            // (0) SCAN TO KNOW THE DATABASE SIZE AND # OF ITEMS TO INITIALISE BIT-MATRIX
            firstScan(input);

            // create the bit matrix
            final BitMatrix matrix = new BitMatrix(maxItemId, tidsCount);

            // (1) CREATE VERTICAL DATABASE INTO MEMORY
            createVerticalDatabase(input, matrix);

            // (2) INITIAL VARIABLES FOR THE FIRST CALL TO THE "DCI_CLOSED" PROCEDURE
            // (as described in the paper)
            List<Integer> closedset = new ArrayList<Integer>();
            BitSet closedsetTIDs = null; // unused on the first call (firstTime == true)
            List<Integer> preset = new ArrayList<Integer>();
            List<Integer> postset = new ArrayList<Integer>(maxItemId);

            // Create the postset with every frequent item.
            // NOTE(review): getSupportOfItemFirstTime is called for every item before
            // getSupportOfItem is used below; presumably it initializes the support
            // that getSupportOfItem later returns - confirm against BitMatrix.
            for (int i = 1; i <= maxItemId; i++) {
                if (matrix.getSupportOfItemFirstTime(i) >= minSuppRelative) {
                    postset.add(i);
                }
            }

            // Sort items by ascending order of support, using the
            // ascending item order to break ties.
            Collections.sort(postset, new Comparator<Integer>() {
                @Override
                public int compare(Integer item1, Integer item2) {
                    int support1 = matrix.getSupportOfItem(item1);
                    int support2 = matrix.getSupportOfItem(item2);
                    if (support1 != support2) {
                        return (support1 < support2) ? -1 : 1;
                    }
                    // same support: compare the items themselves; this returns 0
                    // for identical items, as the Comparator contract requires
                    return item1.compareTo(item2);
                }
            });

            // (3) CALL THE "DCI_CLOSED" RECURSIVE PROCEDURE
            dci_closed(true, closedset, closedsetTIDs, postset, preset, matrix, matrix);

            // print statistics
            System.out.println("========== DCI_CLOSED - STATS ============");
            System.out.println(" Number of transactions: " + tidsCount );
            System.out.println(" Number of frequent closed itemsets: " + closedCount );
            System.out.println(" Total time ~: " + (System.currentTimeMillis() - startTimestamp) + " ms");
        } finally {
            // close the file, whether or not the mining succeeded
            writer.close();
        }
    }

    /**
     * Tell whether a line of the input file must be ignored.
     * A line is ignored if it is empty or if it starts with '#', '%' or '@'
     * (comments and metadata). Both passes over the input file use this method
     * so that they agree on which lines are transactions.
     *
     * @param line a line read from the input file (not null)
     * @return true if the line is not a transaction
     */
    private static boolean isSkippedLine(String line) {
        if (line.isEmpty()) {
            return true;
        }
        char first = line.charAt(0);
        return first == '#' || first == '%' || first == '@';
    }

    /**
     * Scan database to know the database size and number of items to
     * initialize the bit matrix. Sets {@code tidsCount} and {@code maxItemId}.
     *
     * @param input the input file
     * @throws NumberFormatException if an item cannot be parsed as an integer
     * @throws IOException exception if error while reading the file
     */
    private void firstScan(String input) throws NumberFormatException, IOException {
        maxItemId = 0;
        tidsCount = 0; // variable to count the number of transactions.

        // Prepare object to read the file
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try {
            String line;
            // for each line (transaction) until the end of file
            while ((line = reader.readLine()) != null) {
                // ignore comments, empty lines and metadata
                if (isSkippedLine(line)) {
                    continue;
                }
                // for each item of the transaction
                for (String itemString : line.split(" ")) {
                    int item = Integer.parseInt(itemString);
                    // update the maximum item in the database
                    if (item > maxItemId) {
                        maxItemId = item;
                    }
                }
                tidsCount++; // increase the transaction count
            }
        } finally {
            // close the file even if a line could not be parsed
            reader.close();
        }
    }

    /**
     * The method "DCI_CLOSED" as described in the paper.
     *
     * @param firstTime true if this method is called for the first time
     * @param closedset the closed set (see paper).
     * @param bitset the tids set of the closed set (not read when firstTime is true)
     * @param postset the postset (see paper for full details)
     * @param preset the preset (see paper); items are appended to this list
     * @param matrix the matrix used to obtain tidsets (projected after the first level)
     * @param originalMatrix the original bitmatrix, used for the total order on items
     * @exception IOException if error writing the output file
     */
    private void dci_closed(boolean firstTime, List<Integer> closedset, BitSet bitset,
            List<Integer> postset, List<Integer> preset, BitMatrix matrix, BitMatrix originalMatrix) throws IOException {

        // L2: For all i in postset
        for (Integer i : postset) {
            // L4 Calculate the tidset of newgen
            // where newgen is "closedset" U {i}
            BitSet newgenTIDs;
            if (firstTime) {
                // the closed set is empty, so it is simply the tidset of i
                newgenTIDs = matrix.getBitSetOf(i);
            } else {
                // otherwise we intersect the tidset of closedset and the tidset of i
                newgenTIDs = (BitSet) bitset.clone();
                newgenTIDs.and(matrix.getBitSetOf(i));
            }

            // skip newgen if its support is less than minsup
            if (newgenTIDs.cardinality() < minSuppRelative) {
                continue;
            }

            // L5: skip newgen if it is a duplicate
            if (is_dup(newgenTIDs, preset, matrix)) {
                continue;
            }

            // L3 + L6: ClosedsetNew = newgen = closedset U {i}
            List<Integer> closedsetNew = new ArrayList<Integer>(closedset.size() + 1);
            closedsetNew.addAll(closedset);
            closedsetNew.add(i);

            // The tidset of the new closed set starts as a private copy of the
            // tidset of newgen (never the matrix's own bitset, which must not be modified).
            BitSet closedsetNewTIDs = (BitSet) newgenTIDs.clone();

            // L7 : PostsetNew = emptyset
            List<Integer> postsetNew = new ArrayList<Integer>();

            // L8 for each j in Postset such that i < j :
            for (Integer j : postset) {
                // only items larger than i according to the total order on items
                if (!smallerAccordingToTotalOrder(i, j, originalMatrix)) {
                    continue;
                }
                // L9: if the tidset of j contains the tidset of newgen
                BitSet tidsOfJ = matrix.getBitSetOf(j);
                if (isAllContainedIn(newgenTIDs, tidsOfJ)) {
                    closedsetNew.add(j);
                    // recalculate TIDS of closedsetNew by intersection
                    closedsetNewTIDs.and(tidsOfJ);
                } else {
                    // otherwise add j to the new postset
                    postsetNew.add(j);
                }
            }

            // L15 : write out closedsetNew and its support
            int support = closedsetNewTIDs.cardinality();
            writeOut(closedsetNew, support);

            // L16: recursive call
            // FIXED: we have to make a copy of preset before the recursive call
            List<Integer> presetNew = new ArrayList<Integer>(preset);
            if (firstTime) {
                // THIS IS THE "Dataset projection" optimization described in the TKDE paper.
                BitMatrix projectedMatrix = projectMatrix(matrix, closedsetNewTIDs, support);
                // in the projected matrix every remaining transaction contains the
                // closed set, so its tidset is the first "support" bits all set
                BitSet replacement = new BitSet(support);
                replacement.set(0, support, true);
                dci_closed(false, closedsetNew, replacement, postsetNew, presetNew, projectedMatrix, matrix);
            } else {
                dci_closed(false, closedsetNew, closedsetNewTIDs, postsetNew, presetNew, matrix, originalMatrix);
            }

            // L17 : Preset = Preset U {i}
            preset.add(i);
        }
    }

    /**
     * Check if an item is smaller than another according to the support ascending order
     * or if the support is the same, use the ascending item order.
     *
     * @param i the first item
     * @param j the second item
     * @param matrix the bit matrix providing the supports used for the order
     * @return true if i is strictly smaller than j according to the total order
     */
    private boolean smallerAccordingToTotalOrder(Integer i, Integer j, BitMatrix matrix) {
        int supportI = matrix.getSupportOfItem(i);
        int supportJ = matrix.getSupportOfItem(j);
        if (supportI == supportJ) {
            return i < j;
        }
        return supportI < supportJ;
    }

    /**
     * Write a frequent closed itemset that is found to the output file
     * (items separated by single spaces, followed by " #SUP: " and the support)
     * and increase the count of closed itemsets.
     *
     * @param closedset the items of the closed itemset
     * @param support the support of the closed itemset
     * @throws IOException if error writing the output file
     */
    private void writeOut(List<Integer> closedset, int support) throws IOException {
        // increase the number of closed itemsets
        closedCount++;

        StringBuilder buffer = new StringBuilder();
        // for each item in the closed itemset
        Iterator<Integer> iterItem = closedset.iterator();
        while (iterItem.hasNext()) {
            // append the item
            buffer.append(iterItem.next());
            // if it is not the last item, append a space
            if (iterItem.hasNext()) {
                buffer.append(' ');
            }
        }
        // append the support
        buffer.append(" #SUP: ");
        buffer.append(support);

        // write the line
        writer.write(buffer.toString());
        writer.newLine();
    }

    /**
     * The method "is_dup" as described in the paper.
     *
     * @param newgenTIDs the tidset of newgen
     * @param preset the itemset "preset"
     * @param matrix the current transaction database as a bit matrix
     * @return true if the tidset of newgen is included in the tidset of
     *         at least one item of preset
     */
    private boolean is_dup(BitSet newgenTIDs, List<Integer> preset, BitMatrix matrix) {
        // L25: For each item in preset
        for (Integer j : preset) {
            // L26: If tidset of newgen is included in tids of j
            if (isAllContainedIn(newgenTIDs, matrix.getBitSetOf(j))) {
                return true; // FIXED: IN ORIGINAL PAPER THEY WROTE FALSE, BUT IT SHOULD BE TRUE
            }
        }
        return false; // FIXED: IN ORIGINAL PAPER THEY WROTE TRUE, BUT IT SHOULD BE FALSE
    }

    /**
     * Project the bitmatrix with a given bitset.
     * Only the transactions whose bit is set to 1 in the given bitset are kept;
     * they are renumbered consecutively from 0 in the new bit matrix.
     *
     * @param matrix the original bitmatrix
     * @param bitset a bitset
     * @param projectedsize the number of transactions in the projected bitmatrix.
     * @return a new bit matrix
     */
    private BitMatrix projectMatrix(BitMatrix matrix, BitSet bitset, int projectedsize) {
        // create a new matrix
        BitMatrix newMatrix = new BitMatrix(maxItemId, projectedsize);

        // position of the current kept transaction in the new bitmatrix
        // (transactions with no 1 in bitset are not kept)
        int newBit = 0;

        // for each bit set in bitset
        for (int bit = bitset.nextSetBit(0); bit >= 0; bit = bitset.nextSetBit(bit + 1)) {
            // for each item
            for (int item = 1; item <= maxItemId; item++) {
                // if the transaction contains the item
                if (matrix.getBitSetOf(item).get(bit)) {
                    // add the tid for that item in the new bit matrix at position newBit
                    newMatrix.addTidForItem(item, newBit);
                }
            }
            // move to the next position in the new matrix
            newBit++;
        }
        return newMatrix;
    }

    /**
     * Create the in-memory vertical database by reading the input file.
     * The same lines as in {@code firstScan} are ignored (empty lines, comments,
     * metadata), so transaction ids stay within the size computed by the first scan.
     *
     * @param input an input file path.
     * @param matrix the bit matrix to fill
     * @throws IOException exception if an error while reading the file
     */
    private void createVerticalDatabase(String input, BitMatrix matrix) throws IOException {
        // Prepare object to read the input file
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try {
            String line;
            int tidCount = 0;
            // for each line (transaction) until the end of the file
            while ((line = reader.readLine()) != null) {
                // ignore comments, empty lines and metadata, exactly as firstScan does
                if (isSkippedLine(line)) {
                    continue;
                }
                // for each item
                for (String itemString : line.split(" ")) {
                    // add the current transaction id to the tidset of the item
                    matrix.addTidForItem(Integer.parseInt(itemString), tidCount);
                }
                // increase the transaction count
                tidCount++;
            }
        } finally {
            // close the file even if a line could not be parsed
            reader.close();
        }
    }

    /**
     * Checks if the TIDs set represented by bs1 is included in the TIDs set represented by bs2.
     * Stops at the first bit of bs1 that is not set in bs2.
     *
     * @param bs1 a first bitset
     * @param bs2 another bitset
     * @return true if the first bitset is contained in the second bitset
     */
    private boolean isAllContainedIn(BitSet bs1, BitSet bs2) {
        // for each bit set in bs1
        for (int i = bs1.nextSetBit(0); i >= 0; i = bs1.nextSetBit(i + 1)) {
            // if the bit is not in bs2 return false
            if (!bs2.get(i)) {
                return false;
            }
        }
        // all bits of bs1 are in bs2
        return true;
    }
}