package tr.gov.ulakbim.jDenetX.streams.generators.multilabel; /* * MetaMultilabelGenerator.java * Copyright (C) 2010 University of Waikato, Hamilton, New Zealand * @author Jesse Read (jmr30@cs.waikato.ac.nz) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ import tr.gov.ulakbim.jDenetX.core.EuclideanSimilarityDiscoverer.MultilabelInstancesHeader; import tr.gov.ulakbim.jDenetX.core.InstancesHeader; import tr.gov.ulakbim.jDenetX.core.ObjectRepository; import tr.gov.ulakbim.jDenetX.options.AbstractOptionHandler; import tr.gov.ulakbim.jDenetX.options.ClassOption; import tr.gov.ulakbim.jDenetX.options.FloatOption; import tr.gov.ulakbim.jDenetX.options.IntOption; import tr.gov.ulakbim.jDenetX.streams.InstanceStream; import tr.gov.ulakbim.jDenetX.tasks.TaskMonitor; import weka.core.*; import java.util.*; public class MetaMultilabelGenerator extends AbstractOptionHandler implements InstanceStream { private static final long serialVersionUID = 1L; public ClassOption binaryGeneratorOption = new ClassOption( "binaryGenerator", 's', "Binary Generator (use thihs option to specify the number of attributes, but specify two classes only).", InstanceStream.class, "generators.RandomTreeGenerator"); public IntOption metaRandomSeedOption = new IntOption( "metaRandomSeed", 'm', "Random seed (for the meta process).", 1); public IntOption numLabelsOption = new IntOption( "numLabels", 'c', "Number of labels.", 1); public IntOption skewOption = new IntOption( "skew", 'k', "Skewed label distribution: 1 (default) = yes; 0 = no (relatively uniform).", 1, 0, 1); public FloatOption labelCardinalityOption = new FloatOption( "labelCardinality", 'z', "Target label cardinality of resulting set", 1.5, 0.0, Integer.MAX_VALUE); protected MultilabelInstancesHeader m_MultilabelInstancesHeader = null; protected InstanceStream m_BinaryGenerator = null; protected Instances multilabelStreamTemplate = null; protected Random m_MetaRandom = null; protected int m_N = 0, m_A = 0; protected double m_Z = 0.0; protected double skew[] = null, skew_n[] = null; protected double matrix[][] = null; protected ArrayList m_FeatureEffects[] = null; @Override public void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) { this.restart(); } @Override public void restart() { // Extract option 'c' (number of classes(labels)) this.m_N = numLabelsOption.getValue(); // Binary generator this.m_BinaryGenerator = (InstanceStream) getPreparedClassOption(this.binaryGeneratorOption); this.m_BinaryGenerator.restart(); // Extract number of attributes (minus class-attribute) this.m_A = this.m_BinaryGenerator.getHeader().numAttributes() - 1; // Random seed this.m_MetaRandom = new Random(this.metaRandomSeedOption.getValue()); // Setup queue system (so that generated binary instances aren't 'wasted') this.queue = new LinkedList[2]; for (int i = 0; i < this.queue.length; i++) { this.queue[i] = new LinkedList<Instance>(); } // Generate the multi-label header this.m_MultilabelInstancesHeader = generateMultilabelHeader(this.m_BinaryGenerator.getHeader()); // Determine Z : label cardinality as a percentage of |L| (m_N) m_Z = labelCardinalityOption.getValue(); double z = m_Z; // Chceck that the label sets we generate fit the label cardinality we specified while (true) { // Create the label skew this.skew = fillSkew(m_MetaRandom, z); // Create a normalised version of the skew (for wwhen we choose at least one label) this.skew_n = Arrays.copyOf(skew, skew.length); Utils.normalize(this.skew_n); // Create a matrix from the label skew this.matrix = fillMatrix(skew, m_Z / (double) m_N, m_MetaRandom); double total = 0.0; for (int i = 0; i < 10000; i++) { total += (generateSet(discreteRandomIndex(this.skew_n))).size(); } total /= 10000.0; if (total - m_Z < -0.1) z += 0.1; else if (total - m_Z > 0.1) z -= 0.1; else break; } // Create the feature-label mappings m_FeatureEffects = getTopCombinations(m_N * 2); } /** * GenerateMultilabelHeader. */ protected MultilabelInstancesHeader generateMultilabelHeader(Instances si) { Instances mi = new Instances(si, 0, 0); mi.setClassIndex(-1); mi.deleteAttributeAt(mi.numAttributes() - 1); FastVector bfv = new FastVector(); bfv.addElement("0"); bfv.addElement("1"); for (int i = 0; i < this.m_N; i++) { mi.insertAttributeAt(new Attribute("class" + i, bfv), i); } this.multilabelStreamTemplate = mi; this.multilabelStreamTemplate.setRelationName("SYN_Z" + this.labelCardinalityOption.getValue() + "L" + this.m_N + "X" + m_A + "S" + metaRandomSeedOption.getValue() + ": -C " + this.m_N); this.multilabelStreamTemplate.setClassIndex(this.m_N); return new MultilabelInstancesHeader(multilabelStreamTemplate, m_N); } /** * GenSkew. * Generate a label skew (given desired lcard z) * * @param z desired label cardinality * @param r random generator */ private double[] fillSkew(Random r, double z) { double d[] = new double[m_N]; for (int i = 0; i < m_N; i++) { if (skewOption.getValue() >= 1) d[i] = m_MetaRandom.nextDouble(); else d[i] = 1.0; } Utils.normalize(d, Utils.sum(d) / z); for (int i = 0; i < m_N; i++) { if (Double.isNaN(d[i])) d[i] = 0.01; } return d; } /** * GetNextWithBinary. * Get the next instance with binary class i * * @param i the class to generate (0,1) */ LinkedList<Instance> queue[] = null; private Instance getNextWithBinary(int i) { int lim = 1000; if (queue[i].size() <= 0) { int c = -1; while (lim-- > 0) { Instance tinst = this.m_BinaryGenerator.nextInstance(); //System.err.println("next binary : "+tinst); c = (int) Math.round(tinst.classValue()); if (i == c) return tinst; else if (queue[c].size() < 100) queue[c].add(tinst); } System.err.println("[Overflow] The binary stream is too skewed, could not get an example of class " + i + ""); System.exit(1); return null; } else return queue[i].remove(); } /** * LabelCorrelation. * * @param lbls existing labels (indices) in the set * @return a random label (index) to be associated with these labels (-1 if none) */ private int labelCorrelation(ArrayList<Integer> lbls) { double r[] = new double[m_N]; Arrays.fill(r, 1.0); for (int l : lbls) { //get row for (int j = 0; j < matrix[l].length; j++) { // *= P(j|l) (probability of label 'j', given that label 'l' is in the set r[j] = (j == l) ? 0.0 : r[j] * matrix[j][l]; } } return discreteRandomIndex(r); } /** * GenerateML. * Generates a multi-label example. */ @Override public Instance nextInstance() { try { return generateMLInstance(generateSet(discreteRandomIndex(this.skew_n))); } catch (Exception e) { e.printStackTrace(); System.exit(1); } return null; } private ArrayList generateSet(int l) { ArrayList<Integer> lbls = new ArrayList<Integer>(); while (l >= 0) { lbls.add(l); l = labelCorrelation(lbls); } return lbls; } /** * GenerateMLInstance. */ private Instance generateMLInstance(ArrayList<Integer> lbls) throws Exception { // create a multi-label instance : Instance ml_x = new SparseInstance(this.multilabelStreamTemplate.numAttributes()); ml_x.setDataset(this.multilabelStreamTemplate); // set classes for (int i = 0; i < m_N; i++) ml_x.setValue(i, 0.0); for (int i = 0; i < lbls.size(); i++) { ml_x.setValue(lbls.get(i), 1.0); } // generate binary instances Instance binary0 = getNextWithBinary(0); Instance binary1 = getNextWithBinary(1); // Loop through each feature attribute @warning: assumes class is last index for (int a = 0; a < m_A; a++) { // The combination is present: use a positive value if (lbls.containsAll(m_FeatureEffects[a % m_FeatureEffects.length])) { ml_x.setValue(m_N + a, binary1.value(a)); } // The combination is absent: use a negative value else { ml_x.setValue(m_N + a, binary0.value(a)); } } return ml_x; } /** * DiscreteRandomIndex. * Pick a random index i of p, based on the weight of the doubles each p[i] contains * * @note: expecting data to be normalised first */ private int discreteRandomIndex(double p[]) { double r = m_MetaRandom.nextDouble(); if (Utils.sum(p) <= r || Double.isNaN(Utils.sum(p))) return -1; //m_MetaRandom.nextInt(p.length); int i = 0; double sum = 0.0; while (r > sum) { // won't be selecting anything if (i >= p.length) return -1; sum += p[i++]; } //System.out.println("i="+i); return i - 1; } protected static double genE(int i, double L) { return L * Math.pow(Math.E, -L * i); } /** * genMatrix. * P(i) = matrix[i][i] * P(i|j) = matrix[i][j] * * @param skew the matrix with skew stored along the diagonal * @param Z goal label cardinality * @param r random seed */ protected double[][] fillMatrix(double skew[], double Z, Random r) { this.matrix = new double[skew.length][skew.length]; //System.out.println("skew "+Arrays.toString(skew)); for (int i = 0; i < skew.length; i++) { matrix[i][i] = Utils.roundDouble(skew[i], 3); } for (int i = 0; i < matrix.length; i++) { for (int j = i + 1; j < matrix[i].length; j++) { // label-dependence factors if (r.nextDouble() <= (Z * 2.0)) { matrix[i][j] = randFromRange(min(P(i), P(j)), max(P(i), P(j))); matrix[j][i] = (matrix[i][j] * matrix[i][i]) / matrix[j][j]; // Bayes Rule } // label-exclusivity factors else { matrix[i][j] = min(P(i), P(j)); matrix[j][i] = (matrix[i][j] * matrix[j][j]) / matrix[i][i]; // Bayes Rule } // this is just rounding matrix[i][j] = Utils.roundDouble(matrix[i][j], 3); matrix[j][i] = Utils.roundDouble(matrix[j][i], 3); } } return matrix; } protected double randFromRange(double min, double max) { return min + genE(m_MetaRandom.nextInt(5), (max - min)); } // P(i) protected double P(int i) { return matrix[i][i]; } // P(i|j) protected double P(int i, int j) { return matrix[i][j]; } // the highest possible prob. of P(A|B) given A and B protected double max(double A, double B) { return Math.min(1.0, (B / A)); } // the lowest possible prob. of P(A|B) given A and B protected double min(double A, double B) { return Math.max(0.0, (-1.0 + A + B)); } /** * GetTopCombinations. * Return the top n occurring combinations (we just measure the 10000 for this) */ private ArrayList[] getTopCombinations(int n) { HashMap<String, Integer> top = new HashMap<String, Integer>(); for (int i = 0; i < 10000; i++) { String s = arrayToString(generateSet(discreteRandomIndex(this.skew_n)), m_N); top.put(s, top.get(s) != null ? top.get(s) + 1 : 1); } HashMap<String, Integer> rating = getAsReverseSortedHashMap(top); ArrayList al[] = new ArrayList[rating.size()]; int i = 0; for (String s : rating.keySet()) { al[i++] = stringToArray(s); } return al; } // auxilliary functions follow private static HashMap<String, Integer> getAsReverseSortedHashMap(HashMap<String, Integer> c) { Map<String, Integer> tempMap = new HashMap<String, Integer>(); for (String wsState : c.keySet()) { tempMap.put(wsState, c.get(wsState)); } List<String> mapKeys = new ArrayList<String>(tempMap.keySet()); List<Integer> mapValues = new ArrayList<Integer>(tempMap.values()); HashMap<String, Integer> sortedMap = new LinkedHashMap<String, Integer>(); TreeSet<Integer> sortedSet = new TreeSet<Integer>(mapValues); Object[] sortedArray = sortedSet.toArray(); int size = sortedArray.length; for (int i = 0; i < size; i++) { sortedMap.put(mapKeys.get(mapValues.indexOf(sortedArray[size - 1 - i])), (Integer) sortedArray[size - 1 - i]); } return sortedMap; } private static ArrayList stringToArray(String s) { ArrayList al = new ArrayList(); for (int i = 0; i < s.length(); i++) { if (s.charAt(i) == '1') al.add(i); } return al; } private static String arrayToString(ArrayList<Integer> lbls, int N) { StringBuilder sb = new StringBuilder(N); for (int i = 0; i < N; i++) { sb.append('0'); } for (int l : lbls) { sb.setCharAt(l, '1'); } return sb.toString(); } @Override public InstancesHeader getHeader() { return m_MultilabelInstancesHeader; } @Override public String getPurposeString() { return "Generates a multi-label stream using a binary generator."; } @Override public long estimatedRemainingInstances() { return -1; } @Override public boolean hasMoreInstances() { return true; } @Override public boolean isRestartable() { return true; } @Override public void getDescription(StringBuilder sb, int indent) { // TODO Auto-generated method stub } }