/*
*
* YAQP - Yet Another QSAR Project:
* Machine Learning algorithms designed for the prediction of toxicological
* features of chemical compounds become available on the Web. Yaqp is developed
* under OpenTox (http://opentox.org) which is an FP7-funded EU research project.
* This project was developed at the Automatic Control Lab in the Chemical Engineering
* School of National Technical University of Athens. Please read README for more
* information.
*
* Copyright (C) 2009-2010 Pantelis Sopasakis & Charalampos Chomenides
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact:
* Pantelis Sopasakis
* chvng@mail.ntua.gr
* Address: Iroon Politechniou St. 9, Zografou, Athens Greece
* tel. +30 210 7723236
*/
package org.opentox.ontology.data;
import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.SimpleSelector;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.vocabulary.RDF;
import java.net.URI;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.opentox.core.exceptions.Cause;
import org.opentox.core.processors.Pipeline;
import org.opentox.io.processors.InputProcessor;
import org.opentox.io.publishable.OntObject;
import org.opentox.io.publishable.RDFObject;
import org.opentox.ontology.exceptions.ImproperEntityException;
import org.opentox.ontology.exceptions.YaqpOntException;
import org.opentox.ontology.namespaces.OTClass;
import org.opentox.ontology.namespaces.OTDataTypeProperties;
import org.opentox.ontology.namespaces.OTObjectProperties;
import org.opentox.ontology.processors.InstancesProcessor;
import org.opentox.qsar.processors.filters.AbstractFilter;
import org.opentox.qsar.processors.filters.AttributeCleanup;
import org.opentox.qsar.processors.filters.AttributeCleanup.ATTRIBUTE_TYPE;
import org.opentox.qsar.processors.filters.SimpleMVHFilter;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
/**
*
* A set of data which can be used for training or testing a model.
* @author Pantelis Sopasakis
* @author Charalampos Chomenides
*/
@SuppressWarnings({"unchecked"}) public class Dataset {
private OntObject oo = null;
/**
* The name of the first attribute in the dataset, corresponding to a
* unique identifier for the compound.
*/
private static final String compound_uri = "compound_uri";
private enum WekaDataTypes {
string,
nominal,
numeric,
general;
}
/**
* A dataset is instantiated providing an OntObject, which is an ontological model.
* The class {@link OntObject } is in fact an extension of <code>OntModelImpl</code>
* of jena. Such an object (OntObject) can be retrieved from a remote dataset server,
* or from a local resource (e.g. file) using the <code>InputProcessor</code>.
* @param oo An ontological object holding a representation of a dataset. If an improper
* ontological entity is provided to construct the Dataset, methods like
* {@link Dataset#getInstances() getInstances()} are not likely to work, so you have to
* chack that the resource you provided is a dataset resource.
* @see DatasetBuilder
*/
public Dataset(OntObject oo) {
this.oo = oo;
}
/**
* The dataset as <code>Instances</code>. These objects are used by weka as
* input/output object to most algorithms (training, data preprocessing etc).
* The Instances equivalent of the dataset may contain three different types of
* <code>attributes</code>: numeric, nominal and/or string ones. The first attribute
* is always a string one corresponding to the compound of the dataentry while
* acting as an identifier for it. The name of this attribute is <code>compound_uri</code>
* and is unique among all data entries.
* @return Instances object for the dataset.
* @throws YaqpOntException In case something goes wrong with the provided
* representation (e.g. it does not correspond to a valid dataset).
*/
public Instances getInstances() throws YaqpOntException {
// SOME INITIAL DEFINITIONS:
Resource _DATAENTRY = OTClass.DataEntry.getOntClass(oo),
_DATASET = OTClass.Dataset.getOntClass(oo),
_FEATURE = OTClass.Feature.getOntClass(oo),
_NUMERIC_FEATURE = OTClass.NumericFeature.getOntClass(oo),
_NOMINAL_FEATURE = OTClass.NominalFeature.getOntClass(oo),
_STRING_FEATURE = OTClass.StringFeature.getOntClass(oo);
FastVector attributes = null;
Instances data = null;
StmtIterator dataSetIterator = null,
featureIterator = null,
valuesIterator = null,
dataEntryIterator = null;
String relationName = null;
Map<Resource, WekaDataTypes> featureTypes = new HashMap<Resource, WekaDataTypes>();
Map<Resource, ArrayList<String>> featureNominalValues = new HashMap<Resource, ArrayList<String>>();
// CHECK IF THE RESOURCE IS A DATASET. IF YES, GET ITS IDENTIFIER AND SET
// THE RELATION NAME ACCORDINGLY. IF NOT THROW AN ImproperEntityException.
// ALSO CHECK IF THERE ARE MULTIPLE DATASETS AND IF YES THROW EXCEPTION.
dataSetIterator =
oo.listStatements(new SimpleSelector(null, RDF.type, _DATASET));
if (dataSetIterator.hasNext()) {
relationName = dataSetIterator.next().getSubject().getURI();
if (dataSetIterator.hasNext()) {
throw new YaqpOntException(Cause.XONT518, "More than one datasets found");
}
} else {
// this is not a dataset model
throw new ImproperEntityException(Cause.XIE2, "Not a dataset");
}
dataSetIterator.close();
// POPULATE THE MAP WHICH CORRELATES RESOURCES TO WEKA DATA TYPES
ArrayList<String> nominalValues = new ArrayList<String>();
featureIterator =
oo.listStatements(new SimpleSelector(null, RDF.type, _FEATURE));
while (featureIterator.hasNext()) {
Resource feature = featureIterator.next().getSubject().as(Resource.class);
StmtIterator featureTypeIterator =
oo.listStatements(new SimpleSelector(feature, RDF.type, (RDFNode) null));
Set<Resource> featureTypesSet = new HashSet<Resource>();
while (featureTypeIterator.hasNext()) {
Resource type = featureTypeIterator.next().getObject().as(Resource.class);
featureTypesSet.add(type);
}
if (featureTypesSet.contains(_NUMERIC_FEATURE)) {
featureTypes.put(feature, WekaDataTypes.numeric);
} else if (featureTypesSet.contains(_STRING_FEATURE)) {
featureTypes.put(feature, WekaDataTypes.string);
} else if (featureTypesSet.contains(_NOMINAL_FEATURE)) {
featureTypes.put(feature, WekaDataTypes.nominal);
StmtIterator acceptValueIterator = oo.listStatements(
new SimpleSelector(feature, OTDataTypeProperties.acceptValue.createProperty(oo), (RDFNode)null));
// GET THE RANGE OF THE FEATURE:
while (acceptValueIterator.hasNext()){
nominalValues.add(acceptValueIterator.next().getObject().as(Literal.class).getString());
}
featureNominalValues.put(feature, nominalValues);
nominalValues = new ArrayList<String>();
} else {
assert (featureTypesSet.contains(_FEATURE));
featureTypes.put(feature, WekaDataTypes.general);
}
}
// GET THE ATTRIBUTES FOR THE DATASET:
attributes = getAttributes(featureTypes, featureNominalValues);
data = new Instances(relationName, attributes, 0);
// ITERATE OVER ALL DATA ENTRIES IN THE DATASET:
dataEntryIterator =
oo.listStatements(new SimpleSelector(null, RDF.type, _DATAENTRY));
while (dataEntryIterator.hasNext()) {
Statement dataEntry = dataEntryIterator.next();
/**
* B2. For every dataEntry, iterate over all values nodes.
*/
Instance temp = null;
valuesIterator =
oo.listStatements(new SimpleSelector(dataEntry.getSubject(), OTObjectProperties.values.createProperty(oo), (Resource) null));
double[] vals = new double[data.numAttributes()];
for (int i = 0; i < data.numAttributes(); i++) {
vals[i] = Instance.missingValue();
}
StmtIterator compoundNamesIterator =
oo.listStatements(new SimpleSelector(dataEntry.getSubject(), OTObjectProperties.compound.createProperty(oo), (Resource) null));
String compoundName = null;
if (compoundNamesIterator.hasNext()) {
compoundName = compoundNamesIterator.next().getObject().as(Resource.class).getURI();
}
vals[data.attribute(compound_uri).index()] = data.attribute(compound_uri).addStringValue(compoundName);
while (valuesIterator.hasNext()) {
Statement values = valuesIterator.next();
/*
* A pair of the form (AttributeName, AttributeValue) is created.
* This will be registered in an Instance-type object which
* is turn will be used to update the dataset.
*/
// atVal is the value of the attribute
String atVal = values.getProperty(OTDataTypeProperties.value.createProperty(oo)).getObject().as(Literal.class).getValue().toString();
// and atName is the name of the corresponding attribute.
String atName = values.getProperty(OTObjectProperties.feature.createProperty(oo)).getObject().as(Resource.class).getURI();
if (featureTypes.get(oo.createResource(atName)).equals(WekaDataTypes.numeric)) {
try {
vals[data.attribute(atName).index()] = Double.parseDouble(atVal);
/**
* The following catch rule, handles cases where some values are declared
* as numeric (double, float etc) but their value cannot be cast as
* double.
*/
} catch (NumberFormatException ex) {
/* Just don't include this value in the dataset */
}
} else if (featureTypes.get(oo.createResource(atName)).equals(WekaDataTypes.string)) {
vals[data.attribute(atName).index()] = data.attribute(atName).addStringValue(atVal);
} else if (XSDDatatype.XSDdate.getURI().equals(atName)) {
try {
vals[data.attribute(atName).index()] = data.attribute(atName).parseDate(atVal);
} catch (ParseException ex) {
System.out.println(ex);
//Logger.getLogger(Dataset.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
temp = new Instance(1.0, vals);
// Add the Instance only if its compatible with the dataset!
if (data.checkInstance(temp)) {
data.add(temp);
} else {
System.err.println("Warning! The instance " + temp + " is not compatible with the dataset!");
}
}
dataEntryIterator.close();
return data;
}
private FastVector getAttributes(Map<Resource, WekaDataTypes> featureTypes, Map<Resource, ArrayList<String>> nominalValues) {
FastVector atts = new FastVector();
Set<Entry<Resource, WekaDataTypes>> entrySetDatatypes = featureTypes.entrySet();
// THE EXISTENCE OF THE (STRING) ATTRIBUTE 'COMPOUND_URI' IS MANDATORY FOR ALL
// DATASETS. THIS IS ALWAYS THE FIRST ATTRIBUTE IN THE LIST.
atts.addElement(new Attribute(compound_uri, (FastVector) null));
// ADD NUMERIC AND STRING ATTRIBUTES INTO THE FASTVECTOR:
for (Entry<Resource, WekaDataTypes> entry : entrySetDatatypes) {
WekaDataTypes dataType = entry.getValue();
if (dataType.equals(WekaDataTypes.numeric)) {
atts.addElement(new Attribute(entry.getKey().getURI()));
} else if (dataType.equals(WekaDataTypes.string) || dataType.equals(WekaDataTypes.general)) {
atts.addElement(new Attribute(entry.getKey().getURI(), (FastVector) null));
}
}
// COPE WITH NOMINAL VALUES:
Set<Entry<Resource, ArrayList<String>>> nominalAttsSet = nominalValues.entrySet();
for (Entry<Resource, ArrayList<String>> entry : nominalAttsSet){
FastVector nominalFVec = new FastVector(entry.getValue().size());
for (String nominalValue : entry.getValue()){
nominalFVec.addElement(nominalValue);
}
atts.addElement(new Attribute(entry.getKey().toString(), nominalFVec));
}
return atts;
}
public static void main(String[] args) throws Exception {
InputProcessor<OntObject> p1 = new InputProcessor<OntObject>();
DatasetBuilder p2 = new DatasetBuilder();
InstancesProcessor p3 = new InstancesProcessor();
AbstractFilter filter1 = new AttributeCleanup(new ATTRIBUTE_TYPE[] {ATTRIBUTE_TYPE.string});
AbstractFilter filter = new SimpleMVHFilter();
Pipeline pipe = new Pipeline();
pipe.add(p1);
pipe.add(p2);
pipe.add(p3);
pipe.add(filter1);
pipe.add(filter);
Instances data = (Instances) pipe.process(new URI("http://localhost/9"));
}
public RDFObject getRDF(){
return new RDFObject(oo);
}
}