/*
 * Machine Learning support for FindBugs
 * Copyright (C) 2004,2005 University of Maryland
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package edu.umd.cs.findbugs.ml;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import edu.umd.cs.findbugs.config.CommandLine;

/**
 * Convert a BugCollection into ARFF format. See Witten and Frank,
 * <em>Data Mining</em>, ISBN 1-55860-552-5.
 *
 * <p>
 * Usage is two-phase: every input document is first scanned so that each
 * attribute can learn its range, then the header is generated once, then the
 * instances of each document are generated.
 * </p>
 *
 * @see edu.umd.cs.findbugs.BugCollection
 * @see edu.umd.cs.findbugs.BugInstance
 * @author David Hovemeyer
 */
public class ConvertToARFF {
    // ------------------------------------------------------------
    // Helper classes
    // ------------------------------------------------------------

    /** A parsed input document paired with the application name used for its tuples. */
    private static class DataFile {
        private final Document document;

        private final String appName;

        public DataFile(Document document, String appName) {
            this.document = document;
            this.appName = appName;
        }

        public Document getDocument() {
            return document;
        }

        public String getAppName() {
            return appName;
        }
    }

    /**
     * Thrown when an attribute value cannot be extracted from an element.
     * {@link ConvertToARFF#generateInstances} turns it into the ARFF
     * missing-value marker "?".
     */
    private static class MissingNodeException extends Exception {
        private static final long serialVersionUID = -5042140832791541208L;

        public MissingNodeException(String msg) {
            super(msg);
        }
    }

    /** One column of the generated ARFF relation. */
    public interface Attribute {
        /** @return the attribute name written in the ARFF header */
        public String getName();

        /**
         * Observe one selected element before any output is generated, so the
         * attribute can accumulate whatever it needs to report its range.
         *
         * @param element
         *            a selected element
         * @param appName
         *            the application name of the document containing it
         */
        public void scan(Element element, String appName) throws MissingNodeException;

        /** @return the ARFF type/range text written after the attribute name */
        public String getRange();

        /**
         * @param element
         *            a selected element
         * @param appName
         *            the application name of the document containing it
         * @return the text to write for this attribute in the element's tuple
         * @throws MissingNodeException
         *             if no value can be determined for the element
         */
        public String getInstanceValue(Element element, String appName) throws MissingNodeException;
    }

    /** Base class for attributes whose value is obtained by evaluating an XPath on the element. */
    private abstract static class XPathAttribute implements Attribute {
        private final String name;

        private final String xpath;

        public XPathAttribute(String name, String xpath) {
            this.name = name;
            this.xpath = xpath;
        }

        public String getName() {
            return name;
        }

        /**
         * Evaluate the xpath against the element and render the result as
         * text. A list result contributes its first item; a Node contributes
         * its text; a Number is printed with any trailing ".0" removed.
         *
         * @throws MissingNodeException
         *             if the query yields null or an empty list
         * @throws IllegalStateException
         *             if the query yields an object of an unexpected type
         */
        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            Object value = element.selectObject(xpath);
            if (value == null) {
                throw new MissingNodeException("Could not get value from element (path=" + xpath + ")");
            }
            if (value instanceof List) {
                List<?> list = (List<?>) value;
                if (list.isEmpty()) {
                    throw new MissingNodeException("Could not get value from element (path=" + xpath + ")");
                }
                value = list.get(0);
            }

            if (value instanceof Node) {
                return ((Node) value).getText();
            } else if (value instanceof String) {
                return (String) value;
            } else if (value instanceof Number) {
                String s = value.toString();
                // Print whole numbers without the ".0" suffix
                if (s.endsWith(".0")) {
                    s = s.substring(0, s.length() - 2);
                }
                return s;
            } else {
                throw new IllegalStateException("Unexpected object returned from xpath query: " + value);
            }
        }
    }

    /** Nominal attribute whose range is the set of (quoted) values seen while scanning. */
    public static class NominalAttribute extends XPathAttribute {
        private final Set<String> possibleValueSet;

        public NominalAttribute(String name, String xpath) {
            super(name, xpath);
            this.possibleValueSet = new TreeSet<String>();
        }

        public void scan(Element element, String appName) {
            try {
                possibleValueSet.add(getInstanceValue(element, appName));
            } catch (MissingNodeException ignore) {
                // Ignore: we'll just use an n/a value for this instance
            }
        }

        public String getRange() {
            return collectionToRange(possibleValueSet);
        }

        /** @return the xpath result as a double-quoted, escaped string */
        @Override
        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            return quote(super.getInstanceValue(element, appName));
        }
    }

    /** Boolean attribute; a missing node is reported as false rather than as a missing value. */
    public static class BooleanAttribute extends XPathAttribute {
        public BooleanAttribute(String name, String xpath) {
            super(name, xpath);
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            // Nothing to do.
        }

        public String getRange() {
            return "{true, false}";
        }

        @Override
        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            try {
                String value = super.getInstanceValue(element, appName);
                return "\"" + Boolean.valueOf(value).toString() + "\"";
            } catch (MissingNodeException e) {
                return "\"false\"";
            }
        }
    }

    // Classification states. HARMLESS is a flag bit that is combined with BUG.
    private static final int UNCLASSIFIED = 0;

    private static final int BUG = 1;

    private static final int NOT_BUG = 2;

    private static final int HARMLESS = 4;

    private static final int HARMLESS_BUG = HARMLESS | BUG;

    /**
     * Base class for the "classification" attribute. The classification is
     * derived from the element's UserAnnotation text; subclasses decide how
     * each state is spelled in the output.
     */
    public static abstract class AbstractClassificationAttribute implements Attribute {
        public String getName() {
            return "classification";
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            // Nothing to do: the range is fixed.
        }

        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            String annotationText = element.valueOf("./UserAnnotation[text()]");
            int state = getBugClassification(annotationText);
            return bugToString(state);
        }

        /**
         * @param bugType
         *            one of the classification states computed from the
         *            annotation text
         * @return the nominal value for that state
         * @throws MissingNodeException
         *             if the state has no nominal value (unclassified)
         */
        protected abstract String bugToString(int bugType) throws MissingNodeException;
    }

    /** Three-valued classification: bug, not_bug, harmless_bug. */
    public static class ClassificationAttribute extends AbstractClassificationAttribute {
        public String getRange() {
            return "{bug,not_bug,harmless_bug}";
        }

        @Override
        protected String bugToString(int state) throws MissingNodeException {
            if (state == NOT_BUG) {
                return "not_bug";
            } else if (state == BUG) {
                return "bug";
            } else if (state == HARMLESS_BUG) {
                return "harmless_bug";
            } else {
                throw new MissingNodeException("Unclassified warning");
            }
        }
    }

    /** Two-valued classification: harmless bugs are reported as not_bug. */
    public static class BinaryClassificationAttribute extends AbstractClassificationAttribute {
        public String getRange() {
            return "{bug, not_bug}";
        }

        @Override
        protected String bugToString(int state) throws MissingNodeException {
            if (state == BUG) {
                return "bug";
            } else if (state == NOT_BUG || state == HARMLESS_BUG) {
                return "not_bug";
            } else {
                throw new MissingNodeException("unclassified warning");
            }
        }
    }

    /** Numeric attribute: the xpath result is written unquoted. */
    public static class NumericAttribute extends XPathAttribute {
        public NumericAttribute(String name, String xpath) {
            super(name, xpath);
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            // Nothing to do: the range is fixed.
        }

        public String getRange() {
            return "numeric";
        }
    }

    /** Maps the element's numeric "priority" XML attribute (1, 2, 3) to high, medium, low. */
    public static class PriorityAttribute implements Attribute {
        public String getName() {
            return "priority";
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            // Nothing to do: the range is fixed.
        }

        public String getRange() {
            return "{low,medium,high}";
        }

        /**
         * @return "high", "medium" or "low" for priorities 1, 2, 3; "?" for
         *         any other integer
         * @throws MissingNodeException
         *             if the priority attribute is absent or not an integer
         */
        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            org.dom4j.Attribute attribute = element.attribute("priority");
            if (attribute == null) {
                throw new MissingNodeException("Missing priority attribute");
            }
            String value = attribute.getValue();
            try {
                int prio = Integer.parseInt(value);
                switch (prio) {
                case 1:
                    return "high";
                case 2:
                    return "medium";
                case 3:
                    return "low";
                default:
                    return "?";
                }
            } catch (NumberFormatException e) {
                throw new MissingNodeException("Invalid priority value: " + value);
            }
        }
    }

    /**
     * An attribute that just gives each instance a unique id. The application
     * name is prepended, so each unique id is really unique, even across
     * applications. Obviously, this attribute shouldn't be used as input to a
     * learning algorithm.
     *
     * <p>
     * Uses the Element's uid attribute if it has one.
     * </p>
     */
    public static class IdAttribute implements Attribute {
        private final TreeSet<String> possibleValueSet = new TreeSet<String>();

        // True until the first instance value is requested.
        private boolean scanning = true;

        // Next sequence number handed to an element that has no uid attribute.
        private int count = 0;

        public String getName() {
            return "id";
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            possibleValueSet.add(instanceValue(element, appName));
        }

        public String getRange() {
            return collectionToRange(possibleValueSet);
        }

        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            if (scanning) {
                // Restart the counter so the numbers handed out while
                // generating instances repeat those handed out while scanning.
                count = 0;
                scanning = false;
            }
            return instanceValue(element, appName);
        }

        private String instanceValue(Element element, String appName) {
            String nextId;
            org.dom4j.Attribute uidAttr = element.attribute("uid");
            if (uidAttr != null) {
                nextId = uidAttr.getValue();
            } else {
                nextId = String.valueOf(count++);
            }
            return quote(appName + "-" + nextId);
        }
    }

    /**
     * Unique id attribute declared as an ARFF string rather than a nominal.
     * Uses the Element's uid attribute if it has one, otherwise a running
     * counter; the application name is prepended.
     */
    public static class IdStringAttribute implements Attribute {
        public String getName() {
            return "ids";
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            // Nothing to do: string attributes have no enumerated range.
        }

        public String getRange() {
            return "string";
        }

        int count = 0;

        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            String value;
            org.dom4j.Attribute uidAttr = element.attribute("uid");
            if (uidAttr == null) {
                value = String.valueOf(count++);
            } else {
                value = uidAttr.getStringValue();
            }
            return quote(appName + "-" + value);
        }
    }

    private static final String RANDOM_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    /**
     * Nominal id attribute whose values are random 20-letter strings, assigned
     * per element (by identity) during scanning.
     */
    public static class RandomIdAttribute implements Attribute {
        private final Random rng = new Random();

        private final IdentityHashMap<Element, String> idMap = new IdentityHashMap<Element, String>();

        public String getName() {
            return "idr";
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            idMap.put(element, generateId());
        }

        private String generateId() {
            StringBuilder buf = new StringBuilder();

            for (int i = 0; i < 20; ++i) {
                char c = RANDOM_CHARS.charAt(rng.nextInt(RANDOM_CHARS.length()));
                buf.append(c);
            }

            return buf.toString();
        }

        /**
         * @throws IllegalStateException
         *             if two scanned elements were assigned the same id
         */
        public String getRange() {
            TreeSet<String> range = new TreeSet<String>();
            range.addAll(idMap.values());
            if (range.size() != idMap.size()) {
                throw new IllegalStateException("id collision!");
            }
            return collectionToRange(range);
        }

        /**
         * @throws IllegalStateException
         *             if the element was never passed to {@link #scan}
         */
        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            String id = idMap.get(element);
            if (id == null) {
                throw new IllegalStateException("Element not scanned?");
            }
            return "\"" + id + "\"";
        }
    }

    /** Nominal attribute holding the application name of each tuple. */
    public static class AppNameAttribute implements Attribute {
        private final Set<String> appNameSet = new TreeSet<String>();

        public String getName() {
            return "appname";
        }

        public void scan(Element element, String appName) throws MissingNodeException {
            // Record the same quoted form that getInstanceValue() emits, so the
            // declared range stays well-formed for names containing spaces or
            // commas and matches the instance values.
            appNameSet.add(quote(String.valueOf(appName)));
        }

        public String getRange() {
            return collectionToRange(appNameSet);
        }

        public String getInstanceValue(Element element, String appName) throws MissingNodeException {
            return quote(String.valueOf(appName));
        }
    }

    /**
     * Format a collection of (already quoted, where necessary) values as a
     * nominal range: the values in iteration order, comma-separated, inside
     * braces.
     *
     * @param collection
     *            the values
     * @return the range text, "{}" for an empty collection
     */
    public static String collectionToRange(Collection<String> collection) {
        StringBuilder buf = new StringBuilder();
        buf.append("{");
        // An explicit flag (rather than checking the buffer length) keeps the
        // separator correct even when the first value is an empty string.
        boolean first = true;
        for (String value : collection) {
            if (!first) {
                buf.append(',');
            }
            first = false;
            buf.append(value);
        }
        buf.append("}");

        return buf.toString();
    }

    /**
     * Wrap a value in double quotes, backslash-escaping the characters that
     * would otherwise end or corrupt the quoted token (backslash, double
     * quote, newline, carriage return, tab).
     *
     * NOTE(review): the escape set assumes the ARFF consumer understands
     * C-style backslash escapes inside quoted tokens - confirm against the
     * reader in use.
     *
     * @param value
     *            the raw value; must not be null
     * @return the quoted, escaped value
     */
    private static String quote(String value) {
        StringBuilder buf = new StringBuilder(value.length() + 2);
        buf.append('"');
        for (int i = 0; i < value.length(); ++i) {
            char c = value.charAt(i);
            switch (c) {
            case '\\':
                buf.append("\\\\");
                break;
            case '"':
                buf.append("\\\"");
                break;
            case '\n':
                buf.append("\\n");
                break;
            case '\r':
                buf.append("\\r");
                break;
            case '\t':
                buf.append("\\t");
                break;
            default:
                buf.append(c);
                break;
            }
        }
        buf.append('"');
        return buf.toString();
    }

    /** Action applied to each configured attribute by {@link ConvertToARFF#scanAttributeList}. */
    public interface AttributeCallback {
        public void apply(Attribute attribute) throws MissingNodeException, IOException;
    }

    // ------------------------------------------------------------
    // Constants
    // ------------------------------------------------------------

    private static final String DEFAULT_NODE_SELECTION_XPATH = "/BugCollection/BugInstance";

    // ------------------------------------------------------------
    // Fields
    // ------------------------------------------------------------

    private final List<Attribute> attributeList;

    private String nodeSelectionXpath;

    private boolean dropUnclassifiedWarnings;

    // Explicit application name; when null, names are derived from file names.
    private String appName;

    // ------------------------------------------------------------
    // Public methods
    // ------------------------------------------------------------

    public ConvertToARFF() {
        this.attributeList = new LinkedList<Attribute>();
        this.nodeSelectionXpath = DEFAULT_NODE_SELECTION_XPATH;
        this.dropUnclassifiedWarnings = false;
    }

    /**
     * Set a fixed application name, overriding the name that
     * {@link #toAppName(String)} would derive from a file name.
     *
     * @param appName
     *            the application name
     */
    public void setAppName(String appName) {
        this.appName = appName;
    }

    /**
     * Set the xpath expression used to select BugInstance nodes.
     *
     * @param nodeSelectionXpath
     *            the node selection xpath expression
     */
    public void setNodeSelectionXpath(String nodeSelectionXpath) {
        this.nodeSelectionXpath = nodeSelectionXpath;
    }

    public int getNumAttributes() {
        return attributeList.size();
    }

    /** Exclude elements whose user annotation yields no classification. */
    public void dropUnclassifiedWarnings() {
        this.dropUnclassifiedWarnings = true;
    }

    /**
     * Append an attribute; attributes are emitted in the order they are added.
     *
     * @param attribute
     *            the attribute
     */
    public void addAttribute(Attribute attribute) {
        attributeList.add(attribute);
    }

    public void addNominalAttribute(String name, String xpath) {
        addAttribute(new NominalAttribute(name, xpath));
    }

    public void addBooleanAttribute(String name, String xpath) {
        addAttribute(new BooleanAttribute(name, xpath));
    }

    public void addClassificationAttribute() {
        addAttribute(new ClassificationAttribute());
    }

    public void addNumericAttribute(String name, String xpath) {
        addAttribute(new NumericAttribute(name, xpath));
    }

    public void addPriorityAttribute() {
        addAttribute(new PriorityAttribute());
    }

    public void addIdAttribute() {
        addAttribute(new IdAttribute());
    }

    public void addAppNameAttribute() {
        addAttribute(new AppNameAttribute());
    }

    /**
     * Convert a single Document to ARFF format.
     *
     * @param relationName
     *            the relation name
     * @param document
     *            the Document
     * @param appName
     *            the application name
     * @param out
     *            Writer to write the ARFF output to
     */
    public void convert(String relationName, Document document, String appName, final Writer out) throws IOException,
            MissingNodeException {
        scan(document, appName);
        generateHeader(relationName, out);
        generateInstances(document, appName, out);
    }

    /**
     * Scan a Document to find out the ranges of attributes. All Documents must
     * be scanned before generating the ARFF header and instances.
     *
     * @param document
     *            the Document
     * @param appName
     *            the application name
     */
    public void scan(Document document, final String appName) throws MissingNodeException, IOException {
        List<Element> bugInstanceList = getBugInstanceList(document);

        for (final Element element : bugInstanceList) {
            scanAttributeList(new AttributeCallback() {
                public void apply(Attribute attribute) throws MissingNodeException {
                    attribute.scan(element, appName);
                }
            });
        }
    }

    /**
     * Generate ARFF header. Documents must have already been scanned.
     *
     * @param relationName
     *            the relation name
     * @param out
     *            Writer to write the ARFF output to
     */
    public void generateHeader(String relationName, final Writer out) throws MissingNodeException, IOException {
        out.write("@relation ");
        out.write(relationName);
        out.write("\n\n");

        scanAttributeList(new AttributeCallback() {
            public void apply(Attribute attribute) throws IOException {
                out.write("@attribute ");
                out.write(attribute.getName());
                out.write(" ");
                out.write(attribute.getRange());
                out.write("\n");
            }
        });
        out.write("\n");

        out.write("@data\n");
    }

    /**
     * Generate instances from given Document. Document should already have been
     * scanned, and the ARFF header generated. One comma-separated line is
     * written per selected element; an attribute whose value is missing is
     * written as "?".
     *
     * @param document
     *            the Document
     * @param appName
     *            the application name
     * @param out
     *            Writer to write the ARFF output to
     */
    public void generateInstances(Document document, final String appName, final Writer out) throws MissingNodeException,
            IOException {
        List<Element> bugInstanceList = getBugInstanceList(document);

        for (final Element element : bugInstanceList) {
            scanAttributeList(new AttributeCallback() {
                boolean first = true;

                public void apply(Attribute attribute) throws IOException {
                    if (!first) {
                        out.write(",");
                    }
                    first = false;
                    String value;
                    try {
                        value = attribute.getInstanceValue(element, appName);
                    } catch (MissingNodeException e) {
                        value = "?";
                    }
                    out.write(value);
                }
            });
            out.write("\n");
        }
    }

    /**
     * Apply a callback to all Attributes.
     *
     * @param callback
     *            the callback
     */
    public void scanAttributeList(AttributeCallback callback) throws MissingNodeException, IOException {
        for (Attribute attribute : attributeList) {
            callback.apply(attribute);
        }
    }

    // ------------------------------------------------------------
    // Implementation
    // ------------------------------------------------------------

    /**
     * Derive a classification from user annotation text by looking for the
     * tokens BUG, NOT_BUG and HARMLESS. NOT_BUG takes precedence over BUG;
     * HARMLESS only has an effect in combination with BUG.
     *
     * @param annotationText
     *            the annotation text; null is treated as no annotation
     * @return NOT_BUG, BUG, HARMLESS_BUG or UNCLASSIFIED
     */
    private static int getBugClassification(String annotationText) {
        if (annotationText == null) {
            return UNCLASSIFIED;
        }

        StringTokenizer tok = new StringTokenizer(annotationText, " \t\r\n\f.,:;-");

        int state = UNCLASSIFIED;

        while (tok.hasMoreTokens()) {
            String s = tok.nextToken();
            if (s.equals("BUG")) {
                state |= BUG;
            } else if (s.equals("NOT_BUG")) {
                state |= NOT_BUG;
            } else if (s.equals("HARMLESS")) {
                state |= HARMLESS;
            }
        }

        if ((state & NOT_BUG) != 0) {
            return NOT_BUG;
        } else if ((state & BUG) != 0) {
            return ((state & HARMLESS) != 0) ? HARMLESS_BUG : BUG;
        } else {
            return UNCLASSIFIED;
        }
    }

    /**
     * Select the elements to convert from a document, dropping unclassified
     * ones when that option is enabled.
     *
     * @param document
     *            the Document
     * @return the selected elements, in selection order
     */
    @SuppressWarnings("unchecked")
    private List<Element> getBugInstanceList(Document document) {
        List<Element> selected = document.selectNodes(nodeSelectionXpath);
        if (!dropUnclassifiedWarnings) {
            return selected;
        }

        // Filter into a fresh list instead of removing in place, so this does
        // not depend on the list handed back by the XPath engine being mutable.
        List<Element> classified = new ArrayList<Element>(selected.size());
        for (Element element : selected) {
            String annotationText = element.valueOf("./UserAnnotation[text()]");
            if (getBugClassification(annotationText) != UNCLASSIFIED) {
                classified.add(element);
            }
        }
        return classified;
    }

    /** Command line handling: each option configures the embedded converter. */
    private static class C2ACommandLine extends CommandLine {
        private final ConvertToARFF converter = new ConvertToARFF();

        public C2ACommandLine() {
            addOption("-select", "xpath expression", "select BugInstance elements");
            addSwitch("-train", "drop unclassified warnings");
            addSwitch("-id", "add unique id attribute (as nominal)");
            addSwitch("-ids", "add unique id attribute (as string)");
            addSwitch("-idr", "add random unique id attribute (as nominal)");
            addSwitch("-app", "add application name attribute");
            addOption("-nominal", "attrName,xpath", "add a nominal attribute");
            addOption("-boolean", "attrName,xpath", "add a boolean attribute");
            addOption("-numeric", "attrName,xpath", "add a numeric attribute");
            addSwitch("-classification", "add bug classification attribute");
            addSwitch("-binclass", "add binary (bug/not_bug) classification attribute");
            addSwitch("-priority", "add priority attribute");
            addOption("-appname", "app name", "set application name of all tuples");
        }

        public ConvertToARFF getConverter() {
            return converter;
        }

        @Override
        protected void handleOption(String option, String optionExtraPart) throws IOException {
            if (option.equals("-train")) {
                converter.dropUnclassifiedWarnings();
            } else if (option.equals("-id")) {
                converter.addIdAttribute();
            } else if (option.equals("-ids")) {
                converter.addAttribute(new IdStringAttribute());
            } else if (option.equals("-idr")) {
                converter.addAttribute(new RandomIdAttribute());
            } else if (option.equals("-app")) {
                converter.addAppNameAttribute();
            } else if (option.equals("-classification")) {
                converter.addClassificationAttribute();
            } else if (option.equals("-binclass")) {
                converter.addAttribute(new BinaryClassificationAttribute());
            } else if (option.equals("-priority")) {
                converter.addPriorityAttribute();
            }
        }

        /** Factory for the concrete attribute type behind an "attrName,xpath" option. */
        private interface XPathAttributeCreator {
            public Attribute create(String name, String xpath);
        }

        @Override
        protected void handleOptionWithArgument(String option, String argument) throws IOException {
            if (option.equals("-select")) {
                converter.setNodeSelectionXpath(argument);
            } else if (option.equals("-nominal")) {
                addXPathAttribute(option, argument, new XPathAttributeCreator() {
                    public Attribute create(String name, String xpath) {
                        return new NominalAttribute(name, xpath);
                    }
                });
            } else if (option.equals("-boolean")) {
                addXPathAttribute(option, argument, new XPathAttributeCreator() {
                    public Attribute create(String name, String xpath) {
                        return new BooleanAttribute(name, xpath);
                    }
                });
            } else if (option.equals("-numeric")) {
                addXPathAttribute(option, argument, new XPathAttributeCreator() {
                    public Attribute create(String name, String xpath) {
                        return new NumericAttribute(name, xpath);
                    }
                });
            } else if (option.equals("-appname")) {
                converter.setAppName(argument);
            }
        }

        /**
         * Split an "attrName,xpath" argument at its first comma (the xpath may
         * itself contain commas) and add the resulting attribute.
         *
         * @throws IllegalArgumentException
         *             if the argument contains no comma
         */
        protected void addXPathAttribute(String option, String argument, XPathAttributeCreator creator) {
            int comma = argument.indexOf(',');
            if (comma < 0) {
                throw new IllegalArgumentException("Missing comma separating attribute name and xpath in " + option
                        + " option: " + argument);
            }
            String attrName = argument.substring(0, comma);
            String xpath = argument.substring(comma + 1);
            converter.addAttribute(creator.create(attrName, xpath));
        }

        public void printUsage(PrintStream out) {
            out.println("Usage: " + ConvertToARFF.class.getName()
                    + " [options] <relation name> <output file> <findbugs results> [<findbugs results>...]");
            super.printUsage(out);
        }
    }

    /**
     * Determine the application name for an input file: the explicitly set
     * name if there is one, otherwise the file name with its extension
     * removed.
     *
     * <p>
     * Only a dot inside the final path component counts as the start of an
     * extension, and a dot that begins that component does not. So
     * "dir.v2/results" and ".results" are returned unchanged.
     * </p>
     *
     * @param fileName
     *            the input file name
     * @return the application name
     */
    public String toAppName(String fileName) {
        if (appName != null) {
            return appName;
        }

        // Remove file extension, if any
        int lastSeparator = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
        int lastDot = fileName.lastIndexOf('.');
        if (lastDot > lastSeparator + 1) {
            fileName = fileName.substring(0, lastDot);
        }
        return fileName;
    }

    /**
     * Command line entry point: parse options, read and scan every input
     * document, then write the header and all instances to the output file.
     */
    public static void main(String[] argv) throws Exception {
        // Expand any option files
        C2ACommandLine commandLine = new C2ACommandLine();
        argv = commandLine.expandOptionFiles(argv, true, true);

        // Parse command line arguments
        int argCount = commandLine.parse(argv);
        if (argCount > argv.length - 3) {
            commandLine.printUsage(System.err);
            System.exit(1);
        }
        String relationName = argv[argCount++];
        String outputFileName = argv[argCount++];

        // Create the converter
        ConvertToARFF converter = commandLine.getConverter();
        if (converter.getNumAttributes() == 0) {
            throw new IllegalArgumentException("No attributes specified!");
        }

        // Read documents,
        // scan documents to find ranges of attributes.
        // This happens before the output file is opened, so an unreadable
        // input does not leave a truncated output file behind.
        List<DataFile> dataFileList = new ArrayList<DataFile>();
        while (argCount < argv.length) {
            String fileName = argv[argCount++];

            // Read input file as dom4j tree
            SAXReader reader = new SAXReader();
            Document document = reader.read(fileName);

            DataFile dataFile = new DataFile(document, converter.toAppName(fileName));
            dataFileList.add(dataFile);

            converter.scan(dataFile.getDocument(), dataFile.getAppName());
        }

        // Open output file
        Writer out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outputFileName)));
        try {
            // Generate ARFF header
            converter.generateHeader(relationName, out);

            // Generate instances from each document
            for (DataFile dataFile : dataFileList) {
                converter.generateInstances(dataFile.getDocument(), dataFile.getAppName(), out);
            }
        } finally {
            // Always flush and release the file, even if generation fails
            out.close();
        }
    }
}

// vim:ts=4