/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    ClusteringExtractor.java
 *    Copyright (C) 2003 Mikhail Bilenko
 *
 */

package weka.extraction;

import weka.core.*;
import weka.clusterers.*;
import java.util.*;

/**
 * An extractor that pairs a clusterer with a baseline extractor. The
 * intent, per the mode constants below, is to cluster either whole
 * documents or text segments and to train a separate extractor per cluster.
 *
 * NOTE: the bodies of {@link #trainExtractor} and {@link #testExtractor}
 * are still placeholders; only the option handling and the accessors are
 * functional.
 *
 * @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
 * @version $Revision: 1.1 $
 */
public class ClusteringExtractor extends Extractor implements OptionHandler {

  /**
   * Minimum length of the array returned by {@link #getOptions()}; unused
   * trailing slots are filled with empty strings.
   */
  private static final int MIN_OPTIONS_LENGTH = 70;

  /** The baseline extractor that is used; null until one is set */
  protected Extractor m_extractor = null;  // TODO: fill in some basic extractor

  /** The clusterer */
  protected Clusterer m_clusterer = new SeededKMeans();

  /**
   * Fundamental modes.
   * We can either cluster documents and train separate extractors
   * depending on what the document is like,
   * or we can cluster text segments and train separate extractors
   * for different segments,
   * or we could mix the two, but that is not implemented for now.
   */
  public static final int MODE_DOCUMENT_CLUSTERS = 1;
  public static final int MODE_SEGMENT_CLUSTERS = 2;
  public static final int MODE_MIXED = 4;
  public static final Tag[] TAGS_CLUSTERING_MODE = {
    new Tag(MODE_DOCUMENT_CLUSTERS, "Document clusters"),
    new Tag(MODE_SEGMENT_CLUSTERS, "Text segment clusters"),
    new Tag(MODE_MIXED, "Both document and segment clusters")
  };

  /** The current clustering mode; one of the MODE_* constants */
  protected int m_mode = MODE_DOCUMENT_CLUSTERS;

  /** Verbose? */
  protected boolean m_verbose = false;

  /** A default constructor */
  public ClusteringExtractor() {
  }

  /**
   * Given training data, train the extractor.
   * Currently a placeholder: no training is performed in any mode, and
   * mixed mode only prints a message to stderr.
   *
   * @param labeledData a set of training data
   * @param unlabeledData we don't plan to use transduction here for now
   * @exception Exception declared for future implementations
   */
  public void trainExtractor(Instances labeledData, Instances unlabeledData) throws Exception {
    switch (m_mode) {
    case MODE_DOCUMENT_CLUSTERS:
      // 1. cluster labeledData
      // 2. train an extractor for each cluster
      break;
    case MODE_SEGMENT_CLUSTERS:
      // 1. segment each document and populate an Instances object with segments
      // 2. train an extractor for each cluster
      break;
    case MODE_MIXED:
      System.err.println("Mixed mode not implemented for now");
      break;
    }
  }

  /**
   * Perform extraction on a set of data.
   * Currently a placeholder: instances are iterated over but nothing is
   * extracted and docFillerMap is not modified.
   *
   * @param testData a set of instances on which to perform extraction
   * @param docFillerMap a map where the uniqueID of an instance (document) is mapped to a
   * HashMap, which maps fillers to a list of Integer positions
   * @exception Exception declared for future implementations
   */
  public void testExtractor(Instances testData, HashMap docFillerMap) throws Exception {
    switch (m_mode) {
    case MODE_DOCUMENT_CLUSTERS:
      for (int i = 0; i < testData.numInstances(); i++) {
        Instance instance = testData.instance(i);
        // 1. assign instance to a cluster
        // 2. apply that cluster's extractor to get the result
      }
      break;
    case MODE_SEGMENT_CLUSTERS:
      for (int i = 0; i < testData.numInstances(); i++) {
        Instance instance = testData.instance(i);
        // 1. segment instance
        // 2. assign each segment to a cluster
        // 3. apply that cluster's extractor to get the result
      }
      break;
    case MODE_MIXED:
      System.err.println("Mixed mode not implemented for now");
      break;
    }
  }

  /**
   * Set the clustering mode. The request is silently ignored unless the
   * SelectedTag was built from TAGS_CLUSTERING_MODE.
   *
   * @param mode one of MODE_DOCUMENT_CLUSTERS or MODE_SEGMENT_CLUSTERS
   */
  public void setMode(SelectedTag mode) {
    // Identity comparison is intentional: only tags created from this
    // class's own tag array are accepted.
    if (mode.getTags() == TAGS_CLUSTERING_MODE) {
      m_mode = mode.getSelectedTag().getID();
    }
  }

  /**
   * Return the clustering mode.
   *
   * @return one of MODE_DOCUMENT_CLUSTERS or MODE_SEGMENT_CLUSTERS
   */
  public SelectedTag getMode() {
    return new SelectedTag(m_mode, TAGS_CLUSTERING_MODE);
  }

  /**
   * Set the clusterer.
   *
   * @param clusterer the clusterer to be used
   */
  public void setClusterer(Clusterer clusterer) {
    m_clusterer = clusterer;
  }

  /**
   * Get the clusterer.
   *
   * @return the clusterer that is used
   */
  public Clusterer getClusterer() {
    return m_clusterer;
  }

  /**
   * Set the extractor.
   *
   * @param extractor the extractor to be used
   */
  public void setExtractor(Extractor extractor) {
    m_extractor = extractor;
  }

  /**
   * Get the extractor.
   *
   * @return the extractor that is used, or null if none has been set
   */
  public Extractor getExtractor() {
    return m_extractor;
  }

  /**
   * Set the verbosity level.
   *
   * @param verbose messages on (true) or off (false)
   */
  public void setVerbose(boolean verbose) {
    m_verbose = verbose;
  }

  /**
   * Get the verbosity level.
   *
   * @return messages on (true) or off (false)
   */
  public boolean getVerbose() {
    return m_verbose;
  }

  /**
   * Returns an enumeration describing the available options.
   * Currently the enumeration is empty.
   *
   * @return an enumeration of all the available options
   **/
  public Enumeration listOptions() {
    Vector newVector = new Vector(0);
    // TODO: list options... last thing we care about for now
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * Valid options are:<p>
   *
   * -D <br>
   * document-clustering mode, or <br>
   * -S <br>
   * segment-clustering mode (one of the two is required; -D wins if both
   * are given) <p>
   *
   * -E "extractor-name extractor-options" <br>
   * extractor and its options <p>
   *
   * -C "clusterer-name clusterer-options" <br>
   * clusterer and its options <p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if neither -D nor -S is given, if a specification
   * string is blank, or if the named extractor/clusterer cannot be created
   **/
  public void setOptions(String[] options) throws Exception {
    String optionString;

    // get the mode
    if (Utils.getFlag('D', options)) {
      setMode(new SelectedTag(MODE_DOCUMENT_CLUSTERS, TAGS_CLUSTERING_MODE));
    } else if (Utils.getFlag('S', options)) {
      setMode(new SelectedTag(MODE_SEGMENT_CLUSTERS, TAGS_CLUSTERING_MODE));
    } else {
      throw new Exception("Must specify -D or -S for clustering mode");
    }

    // get the extractor specification
    optionString = Utils.getOption('E', options);
    if (optionString.length() != 0) {
      String[] extractorSpec = Utils.splitOptions(optionString);
      // A whitespace-only specification splits into nothing; fail with a
      // clear message instead of an index error.
      if (extractorSpec.length == 0) {
        throw new Exception("Invalid extractor specification string");
      }
      String extractorName = extractorSpec[0];
      // Blank out the name so that only the extractor's own options remain.
      extractorSpec[0] = "";
      if (m_verbose) {
        System.out.println("Extractor name: " + extractorName
                           + "\nExtractor parameters: " + concatStringArray(extractorSpec));
      }
      setExtractor(Extractor.forName(extractorName, extractorSpec));
    }

    // get the clusterer specification (option -C; this previously re-read
    // -E, so the clusterer could never be set from the options)
    optionString = Utils.getOption('C', options);
    if (optionString.length() != 0) {
      String[] clustererSpec = Utils.splitOptions(optionString);
      if (clustererSpec.length == 0) {
        throw new Exception("Invalid clusterer specification string");
      }
      String clustererName = clustererSpec[0];
      // Blank out the name so that only the clusterer's own options remain.
      clustererSpec[0] = "";
      if (m_verbose) {
        System.out.println("Clusterer name: " + clustererName
                           + "\nClusterer parameters: " + concatStringArray(clustererSpec));
      }
      setClusterer(Clusterer.forName(clustererName, clustererSpec));
    }
  }

  /**
   * A little helper to create a single String from an array of Strings.
   * Every element is wrapped in double quotes and followed by one space.
   *
   * @param strings an array of strings; may be null
   * @return a single concatenated string, or an empty string when the
   * array is null or empty
   */
  public static String concatStringArray(String[] strings) {
    if (strings == null) {
      return "";
    }
    StringBuffer result = new StringBuffer();
    for (int i = 0; i < strings.length; i++) {
      result.append('"').append(strings[i]).append("\" ");
    }
    return result.toString();
  }

  /**
   * Gets the current settings of the clustering extractor.
   *
   * The -E and -C entries are omitted when no extractor or clusterer is
   * set. The returned array has at least 70 entries; unused trailing slots
   * hold empty strings.
   *
   * NOTE(review): the name and the sub-options are emitted as separate
   * tokens, while setOptions() expects a single specification string after
   * -E / -C. Confirm whether the output is meant to round-trip.
   *
   * @return an array of strings describing the current settings
   */
  public String[] getOptions() {
    // Collect nested options first so that the result array can be sized
    // to fit them, rather than overflowing a fixed-length buffer.
    String[] extractorOptions = new String[0];
    if (m_extractor instanceof OptionHandler) {
      extractorOptions = ((OptionHandler) m_extractor).getOptions();
    }
    String[] clustererOptions = new String[0];
    if (m_clusterer instanceof OptionHandler) {
      clustererOptions = ((OptionHandler) m_clusterer).getOptions();
    }

    // mode flag + ("-E", name) + extractor options + ("-C", name) + clusterer options
    int needed = 1 + 2 + extractorOptions.length + 2 + clustererOptions.length;
    String[] options = new String[Math.max(MIN_OPTIONS_LENGTH, needed)];
    int current = 0;

    if (m_mode == MODE_DOCUMENT_CLUSTERS) {
      options[current++] = "-D";
    } else if (m_mode == MODE_SEGMENT_CLUSTERS) {
      options[current++] = "-S";
    }

    // the extractor name and options (skipped while no extractor is set)
    if (m_extractor != null) {
      options[current++] = "-E";
      options[current++] = Utils.removeSubstring(m_extractor.getClass().getName(),
                                                 "weka.extraction.");
      for (int i = 0; i < extractorOptions.length; i++) {
        options[current++] = extractorOptions[i];
      }
    }

    // the clusterer name and options (skipped if the clusterer was cleared)
    if (m_clusterer != null) {
      options[current++] = "-C";
      options[current++] = Utils.removeSubstring(m_clusterer.getClass().getName(),
                                                 "weka.clusterers.");
      for (int i = 0; i < clustererOptions.length; i++) {
        options[current++] = clustererOptions[i];
      }
    }

    // fill the rest with blanks
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }
}