package ca.pfv.spmf.tools.dataset_generator; /* This file is copyright (c) 2008-2012 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Random; /** * This class is a random sequence database generator such that * the user provides some parameters and this class generate a sequence database * that is written to the disk. * * @author Philippe Fournier-Viger */ public class SequenceDatabaseGenerator { // a random number generator private static Random random = new Random(System.currentTimeMillis()); /** * This method randomly generates a sequence database according to parameters provided. * @param sequenceCount the number of sequences required * @param maxDistinctItems the maximum number of distinct items * @param itemCountByItemset the number of items by itemset * @param itemsetCountBySequence the number of itemsets by sequence * @param output the file path for writting the generated database * @param withTimestamps if true, this database will contain timestamps, otherwise not * @throws IOException */ public void generateDatabase(int sequenceCount, int maxDistinctItems, int itemCountByItemset, int itemsetCountBySequence, String output, boolean withTimestamps) throws IOException { // We create a BufferedWriter to write the database to disk BufferedWriter writer = new BufferedWriter(new FileWriter(output)); // For the number of sequences to be generated for (int i = 0; i < sequenceCount; i++) { // if it is not the first one, we write on a new line if(i != 0){ writer.newLine(); } // for the number of itemsets to be generated for (int j = 0; j < itemsetCountBySequence; j++) { // This hashset will be used to remember which items have // already been added to this itemset. HashSet<Integer> alreadyAdded = new HashSet<Integer>(); // if the user asked for timestamps, we write the timestamp if(withTimestamps){ writer.write("<" + j + "> "); } // create an arraylist to store items from the itemset that will be generated List<Integer> itemset = new ArrayList<Integer>(); // for the number of items by itemset for (int k = 0; k < itemCountByItemset; k++) { // we generate the item randomly and write it to disk int item = random.nextInt(maxDistinctItems) + 1; // if we already added this item to this itemset // we choose another one while(alreadyAdded.contains(item)){ item = random.nextInt(maxDistinctItems) + 1; } alreadyAdded.add(item); itemset.add(item); } // sort the itemset Collections.sort(itemset); // write the itemset for(Integer item : itemset){ writer.write(item + " "); } // we write the itemset separator writer.write("-1 "); } // we write the end of line writer.write("-2 "); } writer.close(); // we close the file } }