ClusteringExtractor.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    ClusteringExtractor.java
 *    Copyright (C) 2003 Mikhail Bilenko
 *
 */

package weka.extraction;

import weka.core.*;
import weka.clusterers.*;
import java.util.*;

/** An extractor that combines a clusterer with a baseline extractor.
 * The intended design (see the step comments in {@link #trainExtractor}
 * and {@link #testExtractor}) is to cluster either whole documents or text
 * segments and to train and apply a separate extractor for each cluster.
 * The training and extraction steps themselves are still unimplemented
 * stubs; the option handling and the accessors are functional.
 *
 * @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
 * @version $Revision: 1.1 $
 */
public class ClusteringExtractor extends Extractor implements  OptionHandler {

  /** The baseline extractor that is used; null until one is configured */
  protected Extractor m_extractor = null;   // TODO:  fill in some basic extractor

  /** The clusterer */
  protected Clusterer m_clusterer = new SeededKMeans();

  /** Two fundamental modes.
   * We can either cluster documents, and train separate extractors
   * depending on what the document is like.
   * Or, we can cluster text segments and train separate extractors
   * for different segments.
   * Or, we could mix, but we're not touching this for now...
   */
  public static final int MODE_DOCUMENT_CLUSTERS = 1;
  public static final int MODE_SEGMENT_CLUSTERS = 2;
  public static final int MODE_MIXED = 4;
  public static final Tag[] TAGS_CLUSTERING_MODE = {
    new Tag(MODE_DOCUMENT_CLUSTERS, "Document clusters"),
    new Tag(MODE_SEGMENT_CLUSTERS, "Text segment clusters"),
    new Tag(MODE_MIXED, "Both document and segment clusters")
  };

  /** The current clustering mode; one of the MODE_* constants */
  protected int m_mode = MODE_DOCUMENT_CLUSTERS;

  /** Verbose? */
  protected boolean m_verbose = false;


  /** A default constructor */
  public ClusteringExtractor() {
  }

  /** Given training data, train the extractor.
   * NOTE: not implemented yet. The document and segment modes are empty
   * placeholders and the mixed mode only prints a message to System.err.
   *
   * @param labeledData a set of training data
   * @param unlabeledData we don't plan to use transduction here for now
   * @exception Exception declared for future implementations; nothing in
   * the current body throws
   */
  public void trainExtractor(Instances labeledData, Instances unlabeledData) throws Exception{
    switch(m_mode) {

    case MODE_DOCUMENT_CLUSTERS:
      //  1. cluster labeledData
      //  2. train an extractor for each cluster
      break;

    case MODE_SEGMENT_CLUSTERS:
      //  1. segment each document and populate an Instances object with segments
      //  2. train an extractor for each cluster
      break;

    case MODE_MIXED:
      System.err.println("Mixed mode not implemented for now");
      break;
    }
  }

  /** Perform extraction on a set of data.
   * NOTE: not implemented yet. The document and segment modes only iterate
   * over the test instances without producing any output, and the mixed
   * mode only prints a message to System.err.
   *
   * @param testData a set of instances on which to perform extraction
   * @param docFillerMap a map where the uniqueID of an instance (document) is mapped to a
   * HashMap, which maps fillers to a list of Integer positions
   * @exception Exception declared for future implementations
   */
  public void testExtractor(Instances testData, HashMap docFillerMap) throws Exception {
    switch(m_mode) {

    case MODE_DOCUMENT_CLUSTERS:
      for (int i = 0; i < testData.numInstances(); i++) {
        Instance instance = testData.instance(i);
        // 1. assign instance to a cluster
        // 2. apply that cluster's  extractor to get the result
      }
      break;

    case MODE_SEGMENT_CLUSTERS:
      for (int i = 0; i < testData.numInstances(); i++) {
        Instance instance = testData.instance(i);
        // 1. segment instance
        // 2. assign each segment to a cluster
        // 3. apply that cluster's extractor to get the result
      }
      break;

    case MODE_MIXED:
      System.err.println("Mixed mode not implemented for now");
      break;
    }
  }


  /** Set the clustering mode.
   * The request is silently ignored unless the tag was built from
   * TAGS_CLUSTERING_MODE (the tag arrays are compared by identity).
   *
   * @param mode a tag selecting one of the MODE_* constants
   */
  public void setMode(SelectedTag mode) {
    if (mode.getTags() == TAGS_CLUSTERING_MODE) {
      m_mode = mode.getSelectedTag().getID();
    }
  }

  /**
   * return the clustering mode
   * @return the current mode wrapped in a tag over TAGS_CLUSTERING_MODE
   */
  public SelectedTag getMode() {
    return new SelectedTag(m_mode, TAGS_CLUSTERING_MODE);
  }


  /** Set the clusterer
   * @param clusterer the clusterer to be used
   */
  public void setClusterer(Clusterer clusterer) {
    m_clusterer = clusterer;
  }


  /** Get the clusterer
   * @return the clusterer that is used
   */
  public Clusterer getClusterer() {
    return m_clusterer;
  }

  /** Set the extractor
   * @param extractor the extractor to be used
   */
  public void setExtractor(Extractor extractor) {
    m_extractor = extractor;
  }

  /** Get the extractor
   * @return the extractor that is used, or null if none has been set
   */
  public Extractor getExtractor() {
    return m_extractor;
  }

  /**
   * set the verbosity level of this extractor
   * @param verbose messages on(true) or off (false)
   */
  public void setVerbose (boolean verbose) {
    m_verbose = verbose;
  }

  /**
   * get the verbosity level of this extractor
   * @return messages on(true) or off (false)
   */
  public boolean getVerbose () {
    return m_verbose;
  }


  /**
   * Returns an enumeration describing the available options,
   * i.e. the ones parsed by {@link #setOptions(String[])}.
   *
   * @return an enumeration of all the available options
   **/
  public Enumeration listOptions() {

    Vector newVector = new Vector(4);

    newVector.addElement(new Option("\tDocument-clustering mode.",
                                    "D", 0, "-D"));
    newVector.addElement(new Option("\tSegment-clustering mode.",
                                    "S", 0, "-S"));
    newVector.addElement(new Option("\tExtractor and its options.",
                                    "E", 1, "-E <extractor-name extractor-options>"));
    newVector.addElement(new Option("\tClusterer and its options.",
                                    "C", 1, "-C <clusterer-name clusterer-options>"));

    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * Valid options are:<p>
   *
   * -D document-clustering mode
   * or
   * -S segment-clustering mode
   *
   * -E extractor-name extractor-options <br>
   * extractor and its options
   *
   * -C clusterer-name clusterer-options <br>
   * clusterer and its options <p>
   *
   * The value of -E and of -C is split with Utils.splitOptions; its first
   * token is the component name and the remaining tokens are handed to
   * that component as its options.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if neither -D nor -S is given, or if an
   * extractor/clusterer specification contains no name
   *
   **/
  public void setOptions(String[] options) throws Exception {
    String optionString;

    // get the mode
    if (Utils.getFlag('D', options)) {
      setMode(new SelectedTag(MODE_DOCUMENT_CLUSTERS, TAGS_CLUSTERING_MODE));
    } else if (Utils.getFlag('S', options)) {
      setMode(new SelectedTag(MODE_SEGMENT_CLUSTERS, TAGS_CLUSTERING_MODE));
    } else {
      throw new Exception("Must specify -D or -S for clustering mode");
    }

    // get the extractor specification
    optionString = Utils.getOption('E', options);
    if (optionString.length() != 0) {
      String[] extractorSpec = Utils.splitOptions(optionString);
      if (extractorSpec.length == 0) {
        throw new Exception("Invalid extractor specification string");
      }
      String extractorName = extractorSpec[0];
      // blank out the name so that only the extractor's own options remain
      extractorSpec[0] = "";
      if (m_verbose) {
        System.out.println("Extractor name: " + extractorName + "\nExtractor parameters: " + concatStringArray(extractorSpec));
      }
      setExtractor(Extractor.forName(extractorName, extractorSpec));
    }

    // get the clusterer specification; it lives under -C (reading -E here
    // would always yield an empty string because -E was consumed above)
    optionString = Utils.getOption('C', options);
    if (optionString.length() != 0) {
      String[] clustererSpec = Utils.splitOptions(optionString);
      if (clustererSpec.length == 0) {
        throw new Exception("Invalid clusterer specification string");
      }
      String clustererName = clustererSpec[0];
      // blank out the name so that only the clusterer's own options remain
      clustererSpec[0] = "";
      if (m_verbose) {
        System.out.println("Clusterer name: " + clustererName + "\nClusterer parameters: " + concatStringArray(clustererSpec));
      }
      setClusterer(Clusterer.forName(clustererName, clustererSpec));
    }
  }

  /** A little helper to create a single String from an array of Strings.
   * Every element is wrapped in double quotes and followed by one space,
   * e.g. {"a", "b"} becomes <code>"a" "b" </code> (with a trailing space).
   *
   * @param strings an array of strings; null is treated as an empty array
   * @return a single concatenated string of quoted, space-separated elements
   */
  public static String concatStringArray(String[] strings) {
    if (strings == null) {
      return "";
    }
    StringBuffer result = new StringBuffer();
    for (int i = 0; i < strings.length; i++) {
      result.append('"').append(strings[i]).append("\" ");
    }
    return result.toString();
  }

  /** Appends every non-null, non-empty element of an option array to a
   * vector. Blank entries are only padding and carry no information.
   *
   * @param target the vector that receives the options
   * @param source the option array to copy from; may be null
   */
  private static void appendNonBlank(Vector target, String[] source) {
    if (source == null) {
      return;
    }
    for (int i = 0; i < source.length; i++) {
      if (source[i] != null && source[i].length() != 0) {
        target.addElement(source[i]);
      }
    }
  }


  /**
   * Gets the current settings of this extractor: the mode flag, then the
   * extractor name and options, then the clusterer name and options.
   * A component that is null is left out. No mode flag is emitted for
   * MODE_MIXED. The returned array is sized exactly; blank padding coming
   * from the sub-components is dropped.
   *
   * NOTE(review): the sub-component options are emitted as separate array
   * elements after the name, whereas setOptions() reads the name and the
   * options from the single value that follows -E / -C. Confirm the intended
   * round-trip format before relying on setOptions(getOptions()).
   *
   * @return an array of strings describing the current settings
   */
  public String [] getOptions() {

    // collected dynamically: the number of sub-component options is unknown
    Vector collected = new Vector();

    if (m_mode == MODE_DOCUMENT_CLUSTERS) {
      collected.addElement("-D");
    } else if (m_mode == MODE_SEGMENT_CLUSTERS) {
      collected.addElement("-S");
    }

    // the extractor name and options (m_extractor is null by default)
    if (m_extractor != null) {
      collected.addElement("-E");
      collected.addElement(Utils.removeSubstring(m_extractor.getClass().getName(), "weka.extraction."));
      if (m_extractor instanceof OptionHandler) {
        appendNonBlank(collected, ((OptionHandler)m_extractor).getOptions());
      }
    }

    // the clusterer name and options
    if (m_clusterer != null) {
      collected.addElement("-C");
      collected.addElement(Utils.removeSubstring(m_clusterer.getClass().getName(), "weka.clusterers."));
      if (m_clusterer instanceof OptionHandler) {
        appendNonBlank(collected, ((OptionHandler)m_clusterer).getOptions());
      }
    }

    String[] options = new String[collected.size()];
    collected.copyInto(options);
    return options;
  }


}