package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.multidimensionalpatterns;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ca.pfv.spmf.algorithms.frequentpatterns.aprioriTID.AlgoAprioriTID;
import ca.pfv.spmf.algorithms.frequentpatterns.aprioriTIDClose.AlgoAprioriTIDClose;
import ca.pfv.spmf.algorithms.frequentpatterns.charm.AlgoCharm_Bitset;
import ca.pfv.spmf.input.transaction_database_list_integers.TransactionDatabase;
import ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemsets;
/**
* Implementation of the DIM algorithm by Pinto et al. (2001) to extract frequent MD-Patterns
* (multi-dimensional patterns) from a MD-Database. The algorithm is described in:
* <br/><br/>
*
* Pinto, H., Han, J., Pei, J., Wang, K., Chen, Q., & Dayal, U. (2001, October).
* Multi-dimensional sequential pattern mining. In
* Proceedings of the tenth international conference on Information and knowledge management (pp. 81-88).
* ACM.
* <br/><br/>
*
* This implementation use the Apriori, AprioriClose or CHARM algorithms depending on what the user prefers.
* This allow to find all frequent MD-Patterns or just those that are closed.
* <br/><br/>
*
* The idea of closed MD sequential pattern mining is described in (Songram, 2006):
* <br/><br/>
*
* P. Songram, V. Boonjing, S. Intakosum: Closed Multi-dimensional Sequential-Pattern Minin. Proc. of ITNG 2006.
* <br/><br/>
*
* This algorithm implementation proceeds as follow, it
* (1) convert MD-Patterns into itemsets,
* (2) mine frequent (closed) itemsets from the md-patterns generated
* in Step 1
* and (3) convert frequent (closed) itemsets back in MD-Patterns.
*
* @see MDPattern
* @see MDPatterns
* @see MDPatternsDatabase
* @author Philippe Fournier-Viger
*/
public class AlgoDim{
// the list of MDpatterns found
private MDPatterns patterns = new MDPatterns("Frequent MD Patterns");
// the number of dimensions in each pattern
private int dimensionsCount;
// if true, the algorithm finds closed patterns
private boolean findClosedPatterns;
// if true, the algorithm finds closed patterns with Charm instead
// of AprioriClose
private boolean findClosedPatternsWithCharm;
// The following structure are used to convert
// from a dimension value used by MD-Patterns to an item ID
// used by Apriori and CHarm and vice-versa.
// The identifier of a mdpattern is a String of the form "val-i"
// where val is a dimension value for the i-th dimension.
// Key: item id Value: mdpattern identifier
private Map<Integer, String> mapItemIdIdentifier = new HashMap<Integer,String>();
// Value: item id key: mdpattern identifier
private Map<String, Integer> mapIdentifierItemId = new HashMap<String, Integer>();
// the largest Item id that was used when converting from a mdpattern
// to an itemset
int lastUniqueItemIdGiven=0;
/**
* @param findClosedPatterns Indicates if this class has to find respectively frequent itemsets
* or frequent closed itemsets.
* @param findClosedPatternsWithCharm if true, the algorithm finds closed patterns with Charm instead
// of AprioriClose
*/
public AlgoDim(boolean findClosedPatterns,
boolean findClosedPatternsWithCharm){
this.findClosedPatterns = findClosedPatterns;
this.findClosedPatternsWithCharm = findClosedPatternsWithCharm;
}
/**
* Run the DIM algorithm
* @param mdPatDatabase an md-pattern database
* @param minsupp a minimum support threshold as a percentage (double)
* @return the md-patterns found
* @throws IOException exception if error reading/writing file
*/
public MDPatterns runAlgorithm(MDPatternsDatabase mdPatDatabase, double minsupp) throws IOException {
// initialize the set of MDpatterns for storing the result
patterns = new MDPatterns("FREQUENT MD Patterns");
// get the number of dimensions from the first pattern
// in the mdpattern database
this.dimensionsCount = mdPatDatabase.getMDPatterns().get(0).size();
// if the user wants to use the CHARM algorithm
if(findClosedPatternsWithCharm){
// create the transaction database by converting from
// a mdpattern database
TransactionDatabase contextCharm = new TransactionDatabase();
for(MDPattern pattern : mdPatDatabase.getMDPatterns()){
contextCharm.addTransaction(convertPatternToItemset(pattern));
}
// run the charm algorithm to get closed patterns
AlgoCharm_Bitset charm = new AlgoCharm_Bitset();
Itemsets frequentPatterns = charm.runAlgorithm(null, contextCharm, minsupp, true, 10000);
int maxSupport = 0;
// Convert patterns found by Charm into MDPatterns
// for each level
for(List<ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset> itemsets : frequentPatterns.getLevels()){
// for each pattern found by charm
for(ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset itemset : itemsets){
// convert to a md-pattern
MDPattern pattern = convertItemsetCharmToPattern(itemset);
// add to the set of patterns found
patterns.addPattern(pattern, pattern.size());
// if the support is highest seen until
// now, update the maximum support seen.
if(itemset.getAbsoluteSupport() > maxSupport){
maxSupport = itemset.getAbsoluteSupport();
}
}
}
// add the empty set to the list of patterns if necessary
// if the maximum support is smaller than the number
// of transactions in the transaction database for charm
// (it means that the empty set is a closed itemset)
if(maxSupport < contextCharm.size()){
patterns.addPattern(convertItemsetCharmToPattern(new ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset()), 0);
}
// if the user wants to use the APRIORI-CLOSE algorithm
}else if(findClosedPatterns){
// (1) create the transaction database by converting from
// a mdpattern database
TransactionDatabase database = new TransactionDatabase();
for(MDPattern pattern : mdPatDatabase.getMDPatterns()){
database.addTransaction(convertPatternToItemset(pattern));
}
// run the APRIORI-TID-CLOSE algorithm to get closed patterns
AlgoAprioriTIDClose apriori = new AlgoAprioriTIDClose();
ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemsets closedItemsets = apriori.runAlgorithm(database,minsupp, null);
// Convert patterns found by AprioriClose into MDPatterns
// for each level
for(List<ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset> itemsets : closedItemsets.getLevels()){
// for each pattern of that level
for(ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset itemset : itemsets){
// convert to a md-pattern
MDPattern pattern = convertItemsetToPattern(itemset);
// add to the set of patterns found
patterns.addPattern(pattern, pattern.size());
}
}
}else{ // otherwise, if the user want to use APRIORI-TID
// (1)create the transaction database by converting from
// a mdpattern database to a transaction database
TransactionDatabase database = new TransactionDatabase();
for(MDPattern pattern : mdPatDatabase.getMDPatterns()){
database.addTransaction(convertPatternToItemset(pattern));
}
// Apply the APRIORI-TID algorithm
AlgoAprioriTID apriori = new AlgoAprioriTID();
ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemsets closedItemsets = apriori.runAlgorithm(database,minsupp);
apriori.setEmptySetIsRequired(true);
// Convert patterns found by AprioriClose into MDPatterns
// for each level
for(List<ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset> itemsets : closedItemsets.getLevels()){
// for each pattern of that level
for(ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset itemset : itemsets){
// convert to a md-pattern
MDPattern pattern = convertItemsetToPattern(itemset);
// add to the set of patterns found
patterns.addPattern(pattern, pattern.size());
}
}
// add the empty set
patterns.addPattern(convertItemsetCharmToPattern(new ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset()), 0);
}
// return the set of patterns found
return patterns;
}
/**
* Convert from an item ID to a dimension value.
* @param itemID an item ID
* @return the dimension value corresponding to this item ID
*/
private Integer getValueForItemId(int itemID){
// convert to an identifier
String identifier = mapItemIdIdentifier.get(itemID);
int index = identifier.indexOf("-");
// return only the part before the "-"
return Integer.valueOf(identifier.substring(0, index));
}
/**
* Convert from an item ID to a dimension position.
* @param value an item ID
* @return the dimension position in the list of dimensions corresponding
* to that item.
*/
private Integer getDimensionForItemId(int value){
// convert to an identifier
String identifier = mapItemIdIdentifier.get(value);
int index = identifier.indexOf('-');
// return only the part after the "-"
return Integer.valueOf(identifier.substring(index+1, identifier.length()));
}
/**
* Convert dimension value to an item ID.
* @param indexDimension the position of the dimension in the list of dimensions
* @param value the value for the dimension
* @return the item ID
*/
private int convertDimensionValueToItemId(int indexDimension, Integer value){
// get the item ID by using the map
Integer itemId = mapIdentifierItemId.get("" + value + "-" + indexDimension);
// if there is no item ID for this dimension value yet
if(itemId == null){
// we create a new item ID
itemId = lastUniqueItemIdGiven++;
// we create the corresponding identifier for the
// dimension value
StringBuilder identifier = new StringBuilder();
identifier.append(value);
identifier.append('-');
identifier.append(indexDimension);
// we update the map so that we can convert
// from item ID to dimension value and vice versa
mapIdentifierItemId.put(identifier.toString(), itemId);
mapItemIdIdentifier.put(itemId, identifier.toString());
}
return itemId;
}
/**
* Convert an MD-pattern to an itemset
* @param pattern an MD-pattern
* @return an itemset as a list of integers
*/
private List<Integer> convertPatternToItemset(MDPattern pattern) {
// create the itemset
List<Integer> itemset = new ArrayList<Integer>();
// for each dimension value in the pattern
for(int i=0; i < pattern.values.size(); i++){
// convert to an item ID and add it to the itemset
itemset.add(convertDimensionValueToItemId(i, pattern.values.get(i)));
}
// return the itemset
return itemset;
}
/**
* Convert from an itemset to an MD-Pattern
* @param itemset an itemset
* @return an MD-pattern
*/
private MDPattern convertItemsetToPattern(ca.pfv.spmf.patterns.itemset_array_integers_with_tids.Itemset itemset) {
// create the md-pattern
MDPattern mdpattern = new MDPattern(0);
// for each dimension i
for(int i=0; i< dimensionsCount; i++){
// for each item j
for(int j=0; j<itemset.size(); j++){
// get the dimension corresponding to the item ID
int dimension = getDimensionForItemId(itemset.get(j));
// get the dimension value corresponding to the item ID
int value = getValueForItemId(itemset.get(j));
// if it is the dimension i
if(dimension == i){
// add the dimension value to the MD pattern
mdpattern.addInteger(value);
}
}
// if the dimension value was not found,
// add the value *.
if(mdpattern.size() == i){
mdpattern.addWildCard();
}
}
//We also need to set the tidset of the mdpattern
mdpattern.setPatternsIDList(itemset.getTransactionsIds());
// we return the mdpattern.
return mdpattern;
}
/**
* Convert from an itemset used by CHARM to an MD-Pattern
* @param itemset
* @return
*/
private MDPattern convertItemsetCharmToPattern(ca.pfv.spmf.patterns.itemset_array_integers_with_tids_bitset.Itemset itemset) {
// create the mdpattern
MDPattern mdpattern = new MDPattern(0);
// for each dimension i
for(int i=0; i< dimensionsCount; i++){
// for each item j
for(int j=0; j<itemset.size(); j++){
// create array
int[] objects = itemset.getItems();
// get the dimension corresponding to the item ID
int dimension = getDimensionForItemId(objects[j]);
// get the dimension value corresponding to the item ID
int value = getValueForItemId(objects[j]);
if(dimension == i){
// if it is the dimension i
// add the dimension value to the MD pattern
mdpattern.addInteger(value);
}
}
// if the dimension value was not found,
// add the value *.
if(mdpattern.size() == i){
mdpattern.addWildCard();
}
}
// HERE WE CONVERT FROM A BITSET TO A SET OF INTEGER
// NOTE: THIS MAY BE COSTLY AND IT WOULD BE BETTER
// IF WE DON'T HAVE TO DO THAT AND WE JUST USE A SET OF INTEGER
// OR A BITSET EVERYWHERE.
// BUT SINCE THE CODE FOR THE FOURNIER-VIGER 2008 ALGORITHM
// IS QUITE COMPLEX, I WILL NOT FIX THIS ISSUE NOW.
Set<Integer> tidset = new HashSet<Integer>();
for (int tid = itemset.getTransactionsIds().nextSetBit(0); tid >= 0; tid = itemset.getTransactionsIds().nextSetBit(tid+1)) {
// make the sum
tidset.add(tid);
}
mdpattern.setPatternsIDList(tidset);
// return the md-pattern
return mdpattern;
}
/**
* Print statistics about this algorithm execution to System.out
* @param databaseSize the number of mdpattern in the md-pattern database.
*/
public void printStats(int databaseSize) {
System.out.println("============= DIM - STATS =============");
System.out.println(" Frequent patterns count : " + patterns.size());
patterns.printPatterns(databaseSize);
System.out.println("===================================================");
}
}