package ca.pfv.spmf.algorithms.frequentpatterns.fin_prepost;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import ca.pfv.spmf.tools.MemoryLogger;
/*
* Copyright (c) 2008-2015 ZHIHONG DENG
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Java implementation of the PrePost/PrePost+ algorithm.
*
* This implementation was obtained by converting the original C++ code of
* Prepost by ZHIHONG DENG to Java.
*
* @author Philippe Fournier-Viger
*/
public class PrePost {
// the start time and end time of the last algorithm execution
long startTimestamp;
long endTimestamp;
// number of itemsets found
int outputCount;
// object to write the output file
BufferedWriter writer = null;
public int[][] bf;
public int bf_cursor;
public int bf_size;
public int bf_col;
public int bf_currentSize;
public int numOfFItem; // Number of items
public int minSupport; // minimum support
public Item[] item; // list of items sorted by support
// public FILE out;
public int[] result; // the current itemset
public int resultLen; // the size of the current itemset
public int resultCount;
public int nlLenSum; // node list length of the current itemset
// Tree stuff
public PPCTreeNode ppcRoot;
public NodeListTreeNode nlRoot;
public PPCTreeNode[] headTable;
public int[] headTableLen;
public int[] itemsetCount;
public int[] sameItems;
public int nlNodeCount;
// if this parameter is set to true, the PrePost+ algorithm is run instead of PrePost
// (both are implemented in this file, because they have similarities)
public boolean usePrePostPlus = false;
/**
* Use this method to indicate that you want to use the PrePost+ algorithm
* instead of PrePost.
* @param usePrePostPlus if true, PrePost+ will be run instead of PrePost when executing the method runAlgorithm()
*/
public void setUsePrePostPlus(boolean usePrePostPlus) {
this.usePrePostPlus = usePrePostPlus;
}
/**
* Comparator to sort items by decreasing order of frequency
*/
static Comparator<Item> comp = new Comparator<Item>() {
public int compare(Item a, Item b) {
return ((Item) b).num - ((Item) a).num;
}
};
private int numOfTrans;
/**
* Run the algorithm
*
* @param filename
* the input file path
* @param minsup
* the minsup threshold
* @param output
* the output file path
* @throws IOException
* if error while reading/writting to file
*/
public void runAlgorithm(String filename, double minsup, String output)
throws IOException {
outputCount = 0;
nlNodeCount = 0;
ppcRoot = new PPCTreeNode();
nlRoot = new NodeListTreeNode();
resultLen = 0;
resultCount = 0;
nlLenSum = 0;
MemoryLogger.getInstance().reset();
// create object for writing the output file
writer = new BufferedWriter(new FileWriter(output));
// record the start time
startTimestamp = System.currentTimeMillis();
bf_size = 1000000;
bf = new int[100000][];
bf_currentSize = bf_size * 10;
bf[0] = new int[bf_currentSize];
bf_cursor = 0;
bf_col = 0;
// ==========================
// Read Dataset
getData(filename, minsup);
resultLen = 0;
result = new int[numOfFItem];
// Build tree
buildTree(filename);
nlRoot.label = numOfFItem;
nlRoot.firstChild = null;
nlRoot.next = null;
// Initialize tree
initializeTree();
sameItems = new int[numOfFItem];
int from_cursor = bf_cursor;
int from_col = bf_col;
int from_size = bf_currentSize;
// Recursively traverse the tree
NodeListTreeNode curNode = nlRoot.firstChild;
NodeListTreeNode next = null;
while (curNode != null) {
next = curNode.next;
// call the recursive "traverse" method
traverse(curNode, nlRoot, 1, 0);
for (int c = bf_col; c > from_col; c--) {
bf[c] = null;
}
bf_col = from_col;
bf_cursor = from_cursor;
bf_currentSize = from_size;
curNode = next;
}
writer.close();
MemoryLogger.getInstance().checkMemory();
// record the end time
endTimestamp = System.currentTimeMillis();
}
/**
* Build the tree
*
* @param filename
* the input filename
* @throws IOException
* if an exception while reading/writting to file
*/
void buildTree(String filename) throws IOException {
ppcRoot.label = -1;
// READ THE FILE
BufferedReader reader = new BufferedReader(new FileReader(filename));
String line;
// we will use a buffer to store each transaction that is read.
Item[] transaction = new Item[1000];
// for each line (transaction) until the end of the file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true || line.charAt(0) == '#'
|| line.charAt(0) == '%' || line.charAt(0) == '@') {
continue;
}
// split the line into items
String[] lineSplited = line.split(" ");
// for each item in the transaction
int tLen = 0; // tLen
for (String itemString : lineSplited) {
// get the item
int itemX = Integer.parseInt(itemString);
// add each item from the transaction except infrequent item
for (int j = 0; j < numOfFItem; j++) {
// if the item appears in the list of frequent items, we add
// it
if (itemX == item[j].index) {
transaction[tLen] = new Item();
transaction[tLen].index = itemX; // the item
transaction[tLen].num = 0 - j;
tLen++;
break;
}
}
}
// sort the transaction
Arrays.sort(transaction, 0, tLen, comp);
// Print the transaction
// for(int j=0; j < tLen; j++){
// System.out.print(" " + transaction[j].index + " ");
// }
// System.out.println();
int curPos = 0;
PPCTreeNode curRoot = (ppcRoot);
PPCTreeNode rightSibling = null;
while (curPos != tLen) {
PPCTreeNode child = curRoot.firstChild;
while (child != null) {
if (child.label == 0 - transaction[curPos].num) {
curPos++;
child.count++;
curRoot = child;
break;
}
if (child.rightSibling == null) {
rightSibling = child;
child = null;
break;
}
child = child.rightSibling;
}
if (child == null)
break;
}
for (int j = curPos; j < tLen; j++) {
PPCTreeNode ppcNode = new PPCTreeNode();
ppcNode.label = 0 - transaction[j].num;
if (rightSibling != null) {
rightSibling.rightSibling = ppcNode;
rightSibling = null;
} else {
curRoot.firstChild = ppcNode;
}
ppcNode.rightSibling = null;
ppcNode.firstChild = null;
ppcNode.father = curRoot;
ppcNode.labelSibling = null;
ppcNode.count = 1;
curRoot = ppcNode;
}
}
// close the input file
reader.close();
// System.out.println( "====");
// Create a header table
headTable = new PPCTreeNode[numOfFItem];
headTableLen = new int[numOfFItem];
PPCTreeNode[] tempHead = new PPCTreeNode[numOfFItem];
itemsetCount = new int[(numOfFItem - 1) * numOfFItem / 2];
PPCTreeNode root = ppcRoot.firstChild;
int pre = 0;
int last = 0;
while (root != null) {
root.foreIndex = pre;
pre++;
if (headTable[root.label] == null) {
headTable[root.label] = root;
tempHead[root.label] = root;
} else {
tempHead[root.label].labelSibling = root;
tempHead[root.label] = root;
}
headTableLen[root.label]++;
PPCTreeNode temp = root.father;
while (temp.label != -1) {
itemsetCount[root.label * (root.label - 1) / 2 + temp.label] += root.count;
temp = temp.father;
}
if (root.firstChild != null) {
root = root.firstChild;
} else {
// back visit
root.backIndex = last;
last++;
if (root.rightSibling != null) {
root = root.rightSibling;
} else {
root = root.father;
while (root != null) {
// back visit
root.backIndex = last;
last++;
if (root.rightSibling != null) {
root = root.rightSibling;
break;
}
root = root.father;
}
}
}
}
}
/**
* Initialize the tree
*/
void initializeTree() {
NodeListTreeNode lastChild = null;
for (int t = numOfFItem - 1; t >= 0; t--) {
if (bf_cursor > bf_currentSize - headTableLen[t] * 3) {
bf_col++;
bf_cursor = 0;
bf_currentSize = 10 * bf_size;
bf[bf_col] = new int[bf_currentSize];
}
NodeListTreeNode nlNode = new NodeListTreeNode();
nlNode.label = t;
nlNode.support = 0;
nlNode.NLStartinBf = bf_cursor;
nlNode.NLLength = 0;
nlNode.NLCol = bf_col;
nlNode.firstChild = null;
nlNode.next = null;
PPCTreeNode ni = headTable[t];
while (ni != null) {
nlNode.support += ni.count;
bf[bf_col][bf_cursor++] = ni.foreIndex;
bf[bf_col][bf_cursor++] = ni.backIndex;
bf[bf_col][bf_cursor++] = ni.count;
nlNode.NLLength++;
ni = ni.labelSibling;
}
if (nlRoot.firstChild == null) {
nlRoot.firstChild = nlNode;
lastChild = nlNode;
} else {
lastChild.next = nlNode;
lastChild = nlNode;
}
}
}
/**
* Read the input file to find the frequent items
*
* @param filename
* input file name
* @param minSupport
* @throws IOException
*/
void getData(String filename, double support) throws IOException {
numOfTrans = 0;
// (1) Scan the database and count the support of each item.
// The support of items is stored in map where
// key = item value = support count
Map<Integer, Integer> mapItemCount = new HashMap<Integer, Integer>();
// scan the database
BufferedReader reader = new BufferedReader(new FileReader(filename));
String line;
// for each line (transaction) until the end of the file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true || line.charAt(0) == '#'
|| line.charAt(0) == '%' || line.charAt(0) == '@') {
continue;
}
numOfTrans++;
// split the line into items
String[] lineSplited = line.split(" ");
// for each item in the transaction
for (String itemString : lineSplited) {
// increase the support count of the item by 1
Integer item = Integer.parseInt(itemString);
Integer count = mapItemCount.get(item);
if (count == null) {
mapItemCount.put(item, 1);
} else {
mapItemCount.put(item, ++count);
}
}
}
// close the input file
reader.close();
minSupport = (int) Math.ceil(support * numOfTrans);
numOfFItem = mapItemCount.size();
Item[] tempItems = new Item[numOfFItem];
int i = 0;
for (Entry<Integer, Integer> entry : mapItemCount.entrySet()) {
if (entry.getValue() >= minSupport) {
tempItems[i] = new Item();
tempItems[i].index = entry.getKey();
tempItems[i].num = entry.getValue();
i++;
}
}
item = new Item[i];
System.arraycopy(tempItems, 0, item, 0, i);
numOfFItem = item.length;
Arrays.sort(item, comp);
}
NodeListTreeNode iskItemSetFreq(NodeListTreeNode ni, NodeListTreeNode nj,
int level, NodeListTreeNode lastChild, IntegerByRef sameCountRef) {
// System.out.println("====\n" + "isk_itemSetFreq() samecount = " +
// sameCountRef.count);
if (bf_cursor + ni.NLLength * 3 > bf_currentSize) {
bf_col++;
bf_cursor = 0;
bf_currentSize = bf_size > ni.NLLength * 1000 ? bf_size
: ni.NLLength * 1000;
bf[bf_col] = new int[bf_currentSize];
}
NodeListTreeNode nlNode = new NodeListTreeNode();
nlNode.support = 0;
nlNode.NLStartinBf = bf_cursor;
nlNode.NLCol = bf_col;
nlNode.NLLength = 0;
int cursor_i = ni.NLStartinBf;
int cursor_j = nj.NLStartinBf;
int col_i = ni.NLCol;
int col_j = nj.NLCol;
int last_cur = -1;
while (cursor_i < ni.NLStartinBf + ni.NLLength * 3
&& cursor_j < nj.NLStartinBf + nj.NLLength * 3) {
if (bf[col_i][cursor_i] > bf[col_j][cursor_j]
&& bf[col_i][cursor_i + 1] < bf[col_j][cursor_j + 1]) {
if (last_cur == cursor_j) {
bf[bf_col][bf_cursor - 1] += bf[col_i][cursor_i + 2];
} else {
bf[bf_col][bf_cursor++] = bf[col_j][cursor_j];
bf[bf_col][bf_cursor++] = bf[col_j][cursor_j + 1];
bf[bf_col][bf_cursor++] = bf[col_i][cursor_i + 2];
nlNode.NLLength++;
}
nlNode.support += bf[col_i][cursor_i + 2];
last_cur = cursor_j;
cursor_i += 3;
} else if (bf[col_i][cursor_i] < bf[col_j][cursor_j]) {
cursor_i += 3;
} else if (bf[col_i][cursor_i + 1] > bf[col_j][cursor_j + 1]) {
cursor_j += 3;
}
}
if (nlNode.support >= minSupport) {
if (ni.support == nlNode.support && (usePrePostPlus || nlNode.NLLength == 1)) {
sameItems[sameCountRef.count++] = nj.label;
bf_cursor = nlNode.NLStartinBf;
if (nlNode != null) {
nlNode = null;
}
} else {
nlNode.label = nj.label;
nlNode.firstChild = null;
nlNode.next = null;
if (ni.firstChild == null) {
ni.firstChild = nlNode;
lastChild = nlNode;
} else {
lastChild.next = nlNode;
lastChild = nlNode;
}
}
return lastChild;
} else {
bf_cursor = nlNode.NLStartinBf;
if (nlNode != null)
nlNode = null;
}
return lastChild;
}
/**
* Recursively traverse the tree to find frequent itemsets
* @param curNode
* @param curRoot
* @param level
* @param sameCount
* @throws IOException if error while writing itemsets to file
*/
public void traverse(NodeListTreeNode curNode, NodeListTreeNode curRoot,
int level, int sameCount) throws IOException {
MemoryLogger.getInstance().checkMemory();
// System.out.println("==== traverse(): " + curNode.label + " "+ level +
// " " + sameCount);
NodeListTreeNode sibling = curNode.next;
NodeListTreeNode lastChild = null;
while (sibling != null) {
if (level > 1
|| (level == 1 && itemsetCount[(curNode.label - 1)
* curNode.label / 2 + sibling.label] >= minSupport)) {
// tangible.RefObject<Integer> tempRef_sameCount = new
// tangible.RefObject<Integer>(
// sameCount);
// int sameCountTemp = sameCount;
IntegerByRef sameCountTemp = new IntegerByRef();
sameCountTemp.count = sameCount;
lastChild = iskItemSetFreq(curNode, sibling, level, lastChild,
sameCountTemp);
sameCount = sameCountTemp.count;
}
sibling = sibling.next;
}
resultCount += Math.pow(2.0, sameCount);
nlLenSum += Math.pow(2.0, sameCount) * curNode.NLLength;
result[resultLen++] = curNode.label;
// ============= Write itemset(s) to file ===========
writeItemsetsToFile(curNode, sameCount);
// ======== end of write to file
nlNodeCount++;
int from_cursor = bf_cursor;
int from_col = bf_col;
int from_size = bf_currentSize;
NodeListTreeNode child = curNode.firstChild;
NodeListTreeNode next = null;
while (child != null) {
next = child.next;
traverse(child, curNode, level + 1, sameCount);
for (int c = bf_col; c > from_col; c--) {
bf[c] = null;
}
bf_col = from_col;
bf_cursor = from_cursor;
bf_currentSize = from_size;
child = next;
}
resultLen--;
}
/**
* This method write an itemset to file + all itemsets that can be made
* using its node list.
*
* @param curNode
* the current node
* @param sameCount
* the same count
* @throws IOException
* exception if error reading/writting to file
*/
private void writeItemsetsToFile(NodeListTreeNode curNode, int sameCount)
throws IOException {
// create a stringuffer
StringBuilder buffer = new StringBuilder();
if(curNode.support >= minSupport) {
outputCount++;
// append items from the itemset to the StringBuilder
for (int i = 0; i < resultLen; i++) {
buffer.append(item[result[i]].index);
buffer.append(' ');
}
// append the support of the itemset
buffer.append("#SUP: ");
buffer.append(curNode.support);
buffer.append("\n");
}
// === Write all combination that can be made using the node list of
// this itemset
if (sameCount > 0) {
// generate all subsets of the node list except the empty set
for (long i = 1, max = 1 << sameCount; i < max; i++) {
for (int k = 0; k < resultLen; k++) {
buffer.append(item[result[k]].index);
buffer.append(' ');
}
// we create a new subset
for (int j = 0; j < sameCount; j++) {
// check if the j bit is set to 1
int isSet = (int) i & (1 << j);
if (isSet > 0) {
// if yes, add it to the set
buffer.append(item[sameItems[j]].index);
buffer.append(' ');
// newSet.add(item[sameItems[j]].index);
}
}
buffer.append("#SUP: ");
buffer.append(curNode.support);
buffer.append("\n");
outputCount++;
}
}
// write the strinbuffer to file and create a new line
// so that we are ready for writing the next itemset.
writer.write(buffer.toString());
}
/**
* Print statistics about the latest execution of the algorithm to
* System.out.
*/
public void printStats() {
String prePost = usePrePostPlus ? "PrePost+" : "PrePost";
System.out.println("========== " + prePost + " - STATS ============");
System.out.println(" Minsup = " + minSupport
+ "\n Number of transactions: " + numOfTrans);
System.out.println(" Number of frequent itemsets: " + outputCount);
System.out.println(" Total time ~: " + (endTimestamp - startTimestamp)
+ " ms");
System.out.println(" Max memory:"
+ MemoryLogger.getInstance().getMaxMemory() + " MB");
System.out.println("=====================================");
}
/**
* Class to pass an integer by reference as in C++
*/
class IntegerByRef {
int count;
}
class Item {
public int index;
public int num;
}
class NodeListTreeNode {
public int label;
public NodeListTreeNode firstChild;
public NodeListTreeNode next;
public int support;
public int NLStartinBf;
public int NLLength;
public int NLCol;
}
class PPCTreeNode {
public int label;
public PPCTreeNode firstChild;
public PPCTreeNode rightSibling;
public PPCTreeNode labelSibling;
public PPCTreeNode father;
public int count;
public int foreIndex;
public int backIndex;
}
}