package ca.pfv.spmf.algorithms.frequentpatterns.fin_prepost;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.tools.MemoryLogger;
/*
* Copyright (c) 2008-2014 ZHIHONG DENG
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Java implementation of the FIN algorithm for mining frequent itemsets.
 *
 * This implementation was obtained by converting the original C++ code of FIN
 * provided by ZHIHONG DENG, to Java.
 *
 * The code is copyright by Zhihong Deng.
 *
 * <p>
 * Input format, as read by this class: one transaction per line, items are
 * integers separated by a single space. Empty lines and lines starting with
 * '#', '%' or '@' are skipped. NOTE(review): the tree construction appears to
 * assume that an item occurs at most once per transaction -- confirm against
 * the data sets in use.
 *
 * <p>
 * Typical use: call {@link #runAlgorithm(String, double, String)} and then
 * {@link #printStats()}.
 *
 * @author Philippe Fournier-Viger
 */
public class FIN {
	// the start time and end time of the last algorithm execution
	long startTimestamp;
	long endTimestamp;
	// number of itemsets written to the output file by the last execution
	int outputCount = 0;
	// object to write the output file
	BufferedWriter writer = null;

	// Buffer holding the node lists. It is organised in "columns":
	// bf[bf_col] is the column currently being filled, bf_cursor is the next
	// free position in that column and bf_currentSize is its capacity.
	// bf_size is the default capacity used when a new column is allocated.
	public int[][] bf;
	public int bf_cursor;
	public int bf_size;
	public int bf_col;
	public int bf_currentSize;

	public int numOfFItem; // Number of (frequent) items
	public int minSupport; // minimum support (absolute number of transactions)
	public Item[] item; // list of frequent items sorted by decreasing support
	public int[] result; // the current itemset (as ranks into "item")
	public int resultLen = 0; // the size of the current itemset
	public int resultCount = 0;
	public int nlLenSum = 0; // node list length of the current itemset

	// Tree stuff
	public PPCTreeNode ppcRoot;
	public NodeListTreeNode nlRoot;
	// The three arrays below are indexed by pairIndex(i, j) with i > j, where
	// i and j are item ranks. They hold, for each pair of items, its support,
	// the start of its node list in the buffer and the node list length.
	public int[] itemsetCount;
	public int[] nlistBegin;
	public int nlistCol;
	public int[] nlistLen;
	public int firstNlistBegin;
	public int PPCNodeCount;
	// SupportDict[k] = count of the PPC tree node whose pre-order index is k
	public int[] SupportDict;
	// ranks of the items that have the same support as the current itemset
	public int[] sameItems;
	public int nlNodeCount;

	/**
	 * Comparator to sort items by decreasing order of frequency
	 */
	static Comparator<Item> comp = new Comparator<Item>() {
		@Override
		public int compare(Item a, Item b) {
			// explicit comparison instead of "b.num - a.num" so that the
			// result cannot be corrupted by integer overflow
			if (b.num < a.num) {
				return -1;
			}
			return (b.num == a.num) ? 0 : 1;
		}
	};

	private int numOfTrans;

	/**
	 * Compute the position of a pair of items in the triangular arrays
	 * (itemsetCount, nlistBegin, nlistLen).
	 *
	 * @param larger
	 *            the larger rank of the pair
	 * @param smaller
	 *            the smaller rank of the pair
	 * @return the array position of the pair
	 */
	private static int pairIndex(int larger, int smaller) {
		return larger * (larger - 1) / 2 + smaller;
	}

	/**
	 * Check if an input line must be ignored (empty line, comment or
	 * metadata).
	 *
	 * @param line
	 *            a line of the input file
	 * @return true if the line is not a transaction
	 */
	private static boolean isSkippedLine(String line) {
		if (line.isEmpty()) {
			return true;
		}
		char first = line.charAt(0);
		return first == '#' || first == '%' || first == '@';
	}

	/**
	 * Run the algorithm
	 *
	 * @param filename
	 *            the input file path
	 * @param minsup
	 *            the minsup threshold, as a ratio of the number of
	 *            transactions
	 * @param output
	 *            the output file path
	 * @throws IOException
	 *             if error while reading/writing to file
	 */
	public void runAlgorithm(String filename, double minsup, String output)
			throws IOException {
		ppcRoot = new PPCTreeNode();
		nlRoot = new NodeListTreeNode();
		nlNodeCount = 0;
		// reset the counters so that the statistics only describe this
		// execution when the same object is used several times
		outputCount = 0;
		resultCount = 0;
		nlLenSum = 0;

		MemoryLogger.getInstance().reset();

		// create object for writing the output file
		writer = new BufferedWriter(new FileWriter(output));
		try {
			// record the start time
			startTimestamp = System.currentTimeMillis();

			bf_size = 1000000;
			bf = new int[100000][];
			bf_currentSize = bf_size * 10;
			bf[0] = new int[bf_currentSize];
			bf_cursor = 0;
			bf_col = 0;

			// ==========================
			// Read Dataset
			getData(filename, minsup);

			resultLen = 0;
			result = new int[numOfFItem];

			// Build tree
			buildTree(filename);

			nlRoot.label = numOfFItem;
			nlRoot.firstChild = null;
			nlRoot.next = null;

			// Initialize tree
			initializeTree();
			sameItems = new int[numOfFItem];

			// remember the state of the buffer so that the space used while
			// exploring a subtree can be released afterwards
			int from_cursor = bf_cursor;
			int from_col = bf_col;
			int from_size = bf_currentSize;

			// Recursively traverse the tree
			NodeListTreeNode curNode = nlRoot.firstChild;
			while (curNode != null) {
				NodeListTreeNode next = curNode.next;
				// call the recursive "traverse" method
				traverse(curNode, nlRoot, 1, 0);
				// release the buffer columns allocated during the traversal
				for (int c = bf_col; c > from_col; c--) {
					bf[c] = null;
				}
				bf_col = from_col;
				bf_cursor = from_cursor;
				bf_currentSize = from_size;
				curNode = next;
			}
		} finally {
			// always close the output file, even if an error occurred
			writer.close();
		}

		MemoryLogger.getInstance().checkMemory();
		// record the end time
		endTimestamp = System.currentTimeMillis();
	}

	/**
	 * Build the tree
	 *
	 * The input file is read a second time. Each transaction is reduced to
	 * its frequent items, sorted by rank, and inserted in the PPC tree. Then
	 * the tree is visited in pre-order to number the nodes and to compute the
	 * support and the node list of every pair of items.
	 *
	 * @param filename
	 *            the input filename
	 * @throws IOException
	 *             if an exception while reading/writing to file
	 */
	void buildTree(String filename) throws IOException {
		PPCNodeCount = 0;
		ppcRoot.label = -1;

		// map: item -> rank of the item in the array "item".
		// It is filled backward so that the smallest rank wins if an item
		// appeared twice, as a linear search from rank 0 would do.
		Map<Integer, Integer> rankOfItem = new HashMap<Integer, Integer>();
		for (int j = numOfFItem - 1; j >= 0; j--) {
			rankOfItem.put(item[j].index, j);
		}

		// READ THE FILE
		BufferedReader reader = new BufferedReader(new FileReader(filename));
		try {
			String line;
			// buffer to store the ranks of the items of each transaction.
			// It grows when a longer transaction is read.
			int[] transaction = new int[1000];
			// for each line (transaction) until the end of the file
			while ((line = reader.readLine()) != null) {
				// if the line is a comment, is empty or is a
				// kind of metadata
				if (isSkippedLine(line)) {
					continue;
				}
				// split the line into items
				String[] lineSplited = line.split(" ");
				if (lineSplited.length > transaction.length) {
					transaction = new int[lineSplited.length];
				}
				// keep each item from the transaction except infrequent items
				int tLen = 0;
				for (String itemString : lineSplited) {
					Integer rank = rankOfItem.get(Integer.parseInt(itemString));
					if (rank != null) {
						transaction[tLen++] = rank;
					}
				}
				// sort the transaction by increasing rank, that is by
				// decreasing support
				Arrays.sort(transaction, 0, tLen);
				insertTransaction(transaction, tLen);
			}
		} finally {
			// close the input file
			reader.close();
		}

		int pairCount = (numOfFItem - 1) * numOfFItem / 2;
		itemsetCount = new int[pairCount];
		nlistBegin = new int[pairCount];
		nlistLen = new int[pairCount];
		SupportDict = new int[PPCNodeCount + 1];

		// First pre-order traversal: number the nodes and, for each node and
		// each of its ancestors, update the support and the node list length
		// of the corresponding pair of items.
		int pre = 0;
		PPCTreeNode node = ppcRoot.firstChild;
		while (node != null) {
			node.foreIndex = pre;
			SupportDict[pre] = node.count;
			pre++;
			for (PPCTreeNode anc = node.father; anc.label != -1; anc = anc.father) {
				int pair = pairIndex(node.label, anc.label);
				itemsetCount[pair] += node.count;
				nlistLen[pair]++;
			}
			node = nextInPreOrder(node);
		}

		// build 2-itemset nlist
		// compute where the node list of each frequent pair starts
		int sum = 0;
		for (int i = 0; i < pairCount; i++) {
			if (itemsetCount[i] >= minSupport) {
				nlistBegin[i] = sum;
				sum += nlistLen[i];
			}
		}
		// use a new buffer column if the current one would be almost full
		if (bf_cursor + sum > bf_currentSize * 0.85) {
			bf_col++;
			bf_cursor = 0;
			bf_currentSize = sum + 1000;
			bf[bf_col] = new int[bf_currentSize];
		}
		nlistCol = bf_col;
		firstNlistBegin = bf_cursor;
		bf_cursor += sum;

		// Second pre-order traversal: fill the node lists. nlistBegin is
		// used as a write cursor and is restored just after.
		node = ppcRoot.firstChild;
		while (node != null) {
			for (PPCTreeNode anc = node.father; anc.label != -1; anc = anc.father) {
				int pair = pairIndex(node.label, anc.label);
				if (itemsetCount[pair] >= minSupport) {
					bf[nlistCol][nlistBegin[pair] + firstNlistBegin] = node.foreIndex;
					nlistBegin[pair] += 1;
				}
			}
			node = nextInPreOrder(node);
		}
		for (int i = 0; i < pairCount; i++) {
			if (itemsetCount[i] >= minSupport) {
				nlistBegin[i] = nlistBegin[i] - nlistLen[i];
			}
		}
	}

	/**
	 * Insert a transaction in the PPC tree. The longest existing path that
	 * matches the beginning of the transaction is followed (its counts are
	 * increased), then new nodes are created for the remaining items.
	 *
	 * @param ranks
	 *            the ranks of the items of the transaction, sorted
	 * @param tLen
	 *            the number of items in the transaction
	 */
	private void insertTransaction(int[] ranks, int tLen) {
		int curPos = 0;
		PPCTreeNode curRoot = ppcRoot;
		// last child of curRoot, set when no child matches the next item
		PPCTreeNode lastSibling = null;
		while (curPos != tLen) {
			PPCTreeNode child = curRoot.firstChild;
			while (child != null) {
				if (child.label == ranks[curPos]) {
					curPos++;
					child.count++;
					curRoot = child;
					break;
				}
				if (child.rightSibling == null) {
					lastSibling = child;
					child = null;
					break;
				}
				child = child.rightSibling;
			}
			if (child == null) {
				break;
			}
		}
		// create a new branch for the items that were not matched
		for (int j = curPos; j < tLen; j++) {
			PPCTreeNode ppcNode = new PPCTreeNode();
			ppcNode.label = ranks[j];
			if (lastSibling != null) {
				lastSibling.rightSibling = ppcNode;
				lastSibling = null;
			} else {
				curRoot.firstChild = ppcNode;
			}
			ppcNode.rightSibling = null;
			ppcNode.firstChild = null;
			ppcNode.father = curRoot;
			ppcNode.count = 1;
			curRoot = ppcNode;
			PPCNodeCount++;
		}
	}

	/**
	 * Get the node that follows a given node in a pre-order traversal of the
	 * PPC tree.
	 *
	 * @param node
	 *            the current node
	 * @return the next node, or null if the traversal is finished
	 */
	private PPCTreeNode nextInPreOrder(PPCTreeNode node) {
		if (node.firstChild != null) {
			return node.firstChild;
		}
		if (node.rightSibling != null) {
			return node.rightSibling;
		}
		// go up until an ancestor having a right sibling is found
		for (PPCTreeNode anc = node.father; anc != null; anc = anc.father) {
			if (anc.rightSibling != null) {
				return anc.rightSibling;
			}
		}
		return null;
	}

	/**
	 * Initialize the tree
	 *
	 * One node is created for each frequent item and the nodes are chained
	 * under nlRoot by decreasing rank.
	 */
	void initializeTree() {
		NodeListTreeNode lastChild = null;
		for (int t = numOfFItem - 1; t >= 0; t--) {
			NodeListTreeNode nlNode = new NodeListTreeNode();
			nlNode.label = t;
			nlNode.NLStartinBf = bf_cursor;
			nlNode.NLLength = 0;
			nlNode.NLCol = bf_col;
			nlNode.firstChild = null;
			nlNode.next = null;
			nlNode.support = item[t].num;
			if (nlRoot.firstChild == null) {
				nlRoot.firstChild = nlNode;
			} else {
				lastChild.next = nlNode;
			}
			lastChild = nlNode;
		}
	}

	/**
	 * Read the input file to find the frequent items
	 *
	 * This method sets numOfTrans, minSupport (absolute value), item and
	 * numOfFItem.
	 *
	 * @param filename
	 *            input file name
	 * @param minSupport
	 *            the minimum support as a ratio of the number of transactions
	 * @throws IOException
	 *             if error while reading the file
	 */
	void getData(String filename, double minSupport) throws IOException {
		numOfTrans = 0;
		// (1) Scan the database and count the support of each item.
		// The support of items is stored in map where
		// key = item value = support count
		Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>();
		// scan the database
		BufferedReader reader = new BufferedReader(new FileReader(filename));
		try {
			String line;
			// for each line (transaction) until the end of the file
			while ((line = reader.readLine()) != null) {
				// if the line is a comment, is empty or is a
				// kind of metadata
				if (isSkippedLine(line)) {
					continue;
				}
				numOfTrans++;
				// for each item in the transaction
				for (String itemString : line.split(" ")) {
					// increase the support count of the item by 1
					Integer itemId = Integer.parseInt(itemString);
					Integer count = mapItemCount.get(itemId);
					mapItemCount.put(itemId, count == null ? 1 : count + 1);
				}
			}
		} finally {
			// close the input file
			reader.close();
		}

		// convert the relative threshold to a number of transactions
		this.minSupport = (int) Math.ceil(minSupport * numOfTrans);

		// (2) keep only the frequent items.
		// The comparison must use the absolute threshold (the field) and not
		// the parameter, which is a ratio.
		Item[] tempItems = new Item[mapItemCount.size()];
		int frequentCount = 0;
		for (Entry<Integer, Integer> entry : mapItemCount.entrySet()) {
			if (entry.getValue() >= this.minSupport) {
				Item frequent = new Item();
				frequent.index = entry.getKey();
				frequent.num = entry.getValue();
				tempItems[frequentCount++] = frequent;
			}
		}
		item = new Item[frequentCount];
		System.arraycopy(tempItems, 0, item, 0, frequentCount);
		numOfFItem = item.length;
		// sort the items by decreasing support
		Arrays.sort(item, comp);
	}

	/**
	 * Intersect the node lists of two nodes to obtain the node list and the
	 * support of the itemset obtained by extending ni with the item of nj.
	 * If the result is frequent, either the item of nj is added to sameItems
	 * (same support as ni) or a new child is appended to ni.
	 *
	 * @param ni
	 *            the node to be extended
	 * @param nj
	 *            a sibling of ni
	 * @param level
	 *            the current level (not used)
	 * @param lastChild
	 *            the last child of ni that was created, or null
	 * @param sameCountRef
	 *            the number of items in sameItems (updated by this method)
	 * @return the last child of ni
	 */
	NodeListTreeNode iskItemSetFreq(NodeListTreeNode ni, NodeListTreeNode nj,
			int level, NodeListTreeNode lastChild, IntegerByRef sameCountRef) {
		// allocate a new buffer column if the result may not fit in the
		// current one (the result cannot be longer than the list of ni)
		if (bf_cursor + ni.NLLength > bf_currentSize) {
			bf_col++;
			bf_cursor = 0;
			bf_currentSize = bf_size > ni.NLLength * 1000 ? bf_size
					: ni.NLLength * 1000;
			bf[bf_col] = new int[bf_currentSize];
		}

		NodeListTreeNode nlNode = new NodeListTreeNode();
		nlNode.support = 0;
		nlNode.NLStartinBf = bf_cursor;
		nlNode.NLCol = bf_col;
		nlNode.NLLength = 0;

		// merge-like intersection of the two node lists
		int[] listI = bf[ni.NLCol];
		int[] listJ = bf[nj.NLCol];
		int posI = ni.NLStartinBf;
		int posJ = nj.NLStartinBf;
		int endI = ni.NLStartinBf + ni.NLLength;
		int endJ = nj.NLStartinBf + nj.NLLength;
		while (posI < endI && posJ < endJ) {
			int valueI = listI[posI];
			int valueJ = listJ[posJ];
			if (valueI == valueJ) {
				bf[bf_col][bf_cursor++] = valueJ;
				nlNode.NLLength++;
				nlNode.support += SupportDict[valueI];
				posI++;
				posJ++;
			} else if (valueI < valueJ) {
				posI++;
			} else {
				posJ++;
			}
		}

		if (nlNode.support < minSupport) {
			// infrequent: give back the buffer space
			bf_cursor = nlNode.NLStartinBf;
			return lastChild;
		}
		if (ni.support == nlNode.support) {
			sameItems[sameCountRef.count++] = nj.label;
			return lastChild;
		}
		nlNode.label = nj.label;
		nlNode.firstChild = null;
		nlNode.next = null;
		if (ni.firstChild == null) {
			ni.firstChild = nlNode;
		} else {
			lastChild.next = nlNode;
		}
		return nlNode;
	}

	/**
	 * Recursively traverse the tree to find frequent itemsets
	 *
	 * @param curNode
	 *            the node of the current itemset
	 * @param curRoot
	 *            the parent of curNode (not used)
	 * @param level
	 *            the size of the itemset represented by curNode
	 * @param sameCount
	 *            the number of items currently stored in sameItems
	 * @throws IOException
	 *             if error while writing itemsets to file
	 */
	public void traverse(NodeListTreeNode curNode, NodeListTreeNode curRoot,
			int level, int sameCount) throws IOException {
		MemoryLogger.getInstance().checkMemory();

		// try to extend the current itemset with each following sibling
		IntegerByRef sameCountRef = new IntegerByRef();
		sameCountRef.count = sameCount;
		NodeListTreeNode lastChild = null;
		for (NodeListTreeNode sibling = curNode.next; sibling != null; sibling = sibling.next) {
			if (level == 1) {
				// the support of the pairs was computed by buildTree()
				if (itemsetCount[pairIndex(curNode.label, sibling.label)] >= minSupport) {
					lastChild = is2_itemSetValid(curNode, sibling, level,
							lastChild, sameCountRef);
				}
			} else if (level > 1) {
				lastChild = iskItemSetFreq(curNode, sibling, level, lastChild,
						sameCountRef);
			}
		}
		sameCount = sameCountRef.count;

		resultCount += Math.pow(2.0, sameCount);
		nlLenSum += Math.pow(2.0, sameCount) * curNode.NLLength;
		result[resultLen++] = curNode.label;

		// ============= Write itemset(s) to file ===========
		writeItemsetsToFile(curNode, sameCount);
		// ======== end of write to file

		nlNodeCount++;

		// remember the state of the buffer so that the space used by each
		// child can be released after its exploration
		int from_cursor = bf_cursor;
		int from_col = bf_col;
		int from_size = bf_currentSize;
		NodeListTreeNode child = curNode.firstChild;
		while (child != null) {
			NodeListTreeNode next = child.next;
			traverse(child, curNode, level + 1, sameCount);
			for (int c = bf_col; c > from_col; c--) {
				bf[c] = null;
			}
			bf_col = from_col;
			bf_cursor = from_cursor;
			bf_currentSize = from_size;
			child = next;
		}
		resultLen--;
	}

	/**
	 * Process a frequent pair of items (ni, nj) using the information
	 * computed by buildTree(). Either the item of nj is added to sameItems
	 * (the pair has the same support as ni) or a new child is appended to ni.
	 *
	 * @param ni
	 *            the node of the first item
	 * @param nj
	 *            the node of the second item (a sibling of ni)
	 * @param level
	 *            the current level (not used)
	 * @param lastChild
	 *            the last child of ni that was created, or null
	 * @param sameCount
	 *            the number of items in sameItems (updated by this method)
	 * @return the last child of ni
	 */
	NodeListTreeNode is2_itemSetValid(NodeListTreeNode ni, NodeListTreeNode nj,
			int level, NodeListTreeNode lastChild, IntegerByRef sameCount) {
		int pair = pairIndex(ni.label, nj.label);
		if (ni.support == itemsetCount[pair]) {
			sameItems[sameCount.count++] = nj.label;
			return lastChild;
		}
		NodeListTreeNode nlNode = new NodeListTreeNode();
		nlNode.label = nj.label;
		nlNode.NLCol = nlistCol;
		nlNode.NLStartinBf = nlistBegin[pair];
		nlNode.NLLength = nlistLen[pair];
		nlNode.support = itemsetCount[pair];
		nlNode.firstChild = null;
		nlNode.next = null;
		if (ni.firstChild == null) {
			ni.firstChild = nlNode;
		} else {
			lastChild.next = nlNode;
		}
		return nlNode;
	}

	/**
	 * This method write an itemset to file + all itemsets that can be made
	 * by adding to it a non empty subset of the first sameCount items of
	 * sameItems.
	 *
	 * @param curNode
	 *            the current node
	 * @param sameCount
	 *            the same count
	 * @throws IOException
	 *             exception if error reading/writing to file
	 */
	private void writeItemsetsToFile(NodeListTreeNode curNode, int sameCount)
			throws IOException {
		// the items of the current itemset, shared by all the written lines
		StringBuilder prefixBuilder = new StringBuilder();
		for (int i = 0; i < resultLen; i++) {
			prefixBuilder.append(item[result[i]].index);
			prefixBuilder.append(' ');
		}
		String prefix = prefixBuilder.toString();
		// the support, also shared by all the written lines
		String suffix = "#SUP: " + curNode.support + "\n";

		if (curNode.support >= minSupport) {
			outputCount++;
			writer.write(prefix);
			writer.write(suffix);
		}

		// === Write all combination that can be made using the items having
		// the same support
		if (sameCount > 0) {
			StringBuilder line = new StringBuilder();
			// generate all subsets except the empty set. Each subset is
			// written immediately so that memory usage stays bounded.
			// The shifts are done on long values: an int shift would
			// overflow when sameCount >= 31.
			for (long subset = 1, max = 1L << sameCount; subset < max; subset++) {
				line.setLength(0);
				line.append(prefix);
				for (int j = 0; j < sameCount; j++) {
					// check if the j bit is set to 1
					if ((subset & (1L << j)) != 0) {
						// if yes, add it to the set
						line.append(item[sameItems[j]].index);
						line.append(' ');
					}
				}
				line.append(suffix);
				writer.write(line.toString());
				outputCount++;
			}
		}
	}

	/**
	 * Print statistics about the latest execution of the algorithm to
	 * System.out.
	 */
	public void printStats() {
		System.out.println("========== FIN - STATS ============");
		System.out.println(" Minsup = " + minSupport
				+ "\n Number of transactions: " + numOfTrans);
		System.out.println(" Number of frequent itemsets: " + outputCount);
		System.out.println(" Total time ~: " + (endTimestamp - startTimestamp)
				+ " ms");
		System.out.println(" Max memory:"
				+ MemoryLogger.getInstance().getMaxMemory() + " MB");
		System.out.println("=====================================");
	}

	/**
	 * Class to pass an integer by reference as in C++
	 */
	class IntegerByRef {
		int count;
	}

	/**
	 * An item. In the array "item", index is the item as it appears in the
	 * input file and num is its support.
	 */
	class Item {
		public int index;
		public int num;
	}

	/**
	 * A node of the tree explored by traverse(). It represents an itemset
	 * and gives the location of its node list in the buffer "bf".
	 */
	class NodeListTreeNode {
		public int label; // rank of the last item of the itemset
		public NodeListTreeNode firstChild;
		public NodeListTreeNode next; // next sibling
		public int support;
		public int NLStartinBf; // start of the node list in its column
		public int NLLength; // length of the node list
		public int NLCol; // column of "bf" containing the node list
	}

	/**
	 * A node of the PPC tree built from the transactions.
	 */
	class PPCTreeNode {
		public int label; // rank of the item, or -1 for the root
		public PPCTreeNode firstChild;
		public PPCTreeNode rightSibling;
		public PPCTreeNode father;
		public int count; // number of transactions sharing the path to this node
		public int foreIndex; // pre-order index of the node
	}
}