Dataset.java example

Explorer

yaqp-turbo-master
- src
  - org
    - opentox
      - YAQP.java
      - config
        Configuration.java
        ServerFolders.java
      - core
        exceptions
        Cause.java
        ProcessorException.java
        YaqpException.java
        interfaces
        JBatchProcessor.java
        JEngine.java
        JMultiProcessor.java
        JMultiProcessorStatus.java
        JProcessor.java
        JTurboProcessor.java
        processors
        AbstractBatchProcessor.java
        AbstractMultiProcessor.java
        AbstractTurboProcessor.java
        BatchProcessor.java
        ParallelProcessor.java
        Pipeline.java
        Processor.java
        util
        MultiProcessorStatus.java
      - db
        DatabaseJanitor.java
        exceptions
        BadEmailException.java
        DbException.java
        DuplicateKeyException.java
        handlers
        ReaderHandler.java
        UpdateHandler.java
        WriterHandler.java
        package-info.java
        interfaces
        JDbConnector.java
        JDbProcessor.java
        JDbTable.java
        JDbTableColumn.java
        JHyperStatement.java
        JPrepStmt.java
        JQueryFood.java
        JQueryParam.java
        JQueryProcessor.java
        processors
        AbstractDbProcessor.java
        DbPipeline.java
        DbProcessor.java
        QueryProcessor.java
        package-info.java
        queries
        HyperResult.java
        HyperStatement.java
        QueryFood.java
        QueryParam.java
        package-info.java
        table
        AbstractTableProcessor.java
        StandardTables.java
        Table.java
        TableColumn.java
        TableCreator.java
        TableDropper.java
        collection
        AlgOntRelationTable.java
        AlgOntTable.java
        AlgorithmsTable.java
        FeaturesTable.java
        IndFeatRelationTable.java
        OmegaTable.java
        QSARModelsTable.java
        SupportVecTable.java
        TasksTable.java
        UserAuthTable.java
        UsersTable.java
        util
        EmailSupervisor.java
        Page.java
        PrepStmt.java
        PrepSwimmingPool.java
        QueryType.java
        SQLDataTypes.java
        TheDbConnector.java
      - io
        engines
        EngineFactory.java
        IOEngine.java
        RDFEngine.java
        TurtleEngine.java
        exceptions
        YaqpIOException.java
        interfaces
        JIOProcessor.java
        JOntModel.java
        JPublishable.java
        processors
        AbstractIOProcessor.java
        InputProcessor.java
        OutputProcessor.java
        Poster.java
        Publisher.java
        publishable
        JSONObject.java
        N3Object.java
        NTripleObject.java
        OntObject.java
        PDFObject.java
        RDFObject.java
        TurtleObject.java
        UriListObject.java
        util
        ServerList.java
        YaqpIOStream.java
      - ontology
        components
        Algorithm.java
        AlgorithmOntology.java
        ComponentList.java
        Feature.java
        OmegaModel.java
        QSARModel.java
        Task.java
        User.java
        UserGroup.java
        YaqpComponent.java
        package-info.java
        data
        Dataset.java
        DatasetBuilder.java
        DatasetFactory.java
        exceptions
        ImproperEntityException.java
        YaqpOntException.java
        interfaces
        JOntEntity.java
        JOntModel.java
        JOntProcessor.java
        namespaces
        OTAlgorithmTypes.java
        OTClass.java
        OTDataTypeProperties.java
        OTObjectProperties.java
        YaqpOntEntity.java
        processors
        AbstractOntProcessor.java
        InstancesProcessor.java
        util
        AlgorithmMeta.java
        AlgorithmParameter.java
        Meta.java
        YaqpAlgorithms.java
        vocabulary
        Audience.java
        ConstantParameters.java
      - qsar
        exceptions
        QSARException.java
        interfaces
        JTrainer.java
        JWekaTrainer.java
        processors
        QSARModelDBWriter.java
        filters
        AbstractFilter.java
        AttributeCleanup.java
        FCBSFilter.java
        InstancesFilter.java
        SimpleMVHFilter.java
        predictors
        SimplePredictor.java
        WekaPredictor.java
        trainers
        AbstractTrainer.java
        WekaTrainer.java
        classification
        NaiveBayesTrainer.java
        SVCTrainer.java
        WekaClassifier.java
        regression
        MLRTrainer.java
        SVMTrainer.java
        WekaRegressor.java
      - util
        logging
        YaqpLogger.java
        levels
        Debug.java
        Fatal.java
        Info.java
        ScrewedUp.java
        Trace.java
        Warning.java
        logobject
        AbstractLogObject.java
        LogObject.java
        processors
        AbstractLoggingProcessor.java
        ILoggingProcessor.java
        monitoring
        Jennifer.java
      - www
        rest
        Applecation.java
        YaqpServer.java
        components
        URITemplate.java
        YaqpApplication.java
        YaqpForm.java
        YaqpRepresentation.java
        YaqpResource.java
        YaqpResponse.java
        YaqpStatus.java
        resources
        AlgorithmResource.java
        AlgorithmsResource.java
        ModelMetaResource.java
        ModelResource.java
        ModelsResource.java
        services
        PredictionService.java
        Trainers.java
        TrainingService.java
- test
  - org
    - opentox
      - core
        processors
        BatchProcessorTest.java
        ParallelProcessorTest.java
        PipelineTest.java
      - db
        handlers
        ReaderHandlerTest.java
        UpdateHandlerTest.java
        WriterHandlerTest.java
        processors
        QueryProcessorTest.java
        queries
        HyperStatementTest.java
        table
        StandardTablesTest.java
        TableDropperTest.java
        util
        EmailSupervisorTest.java
        PrepSwimmingPoolTest.java
        TheDbConnectorTest.java
      - io
        processors
        InputProcessorTest.java
      - ontology
        ModelFactoryTest.java
        namespaces
        YaqpOntEntityTest.java
        util
        YaqpAlgorithmsTest.java
      - qsar
        processors
        trainers
        regression
        MLRTrainerTest.java
        TrainingPipeline.java
      - util
        logging
        YaqpLoggerTest.java

/*
 *
 * YAQP - Yet Another QSAR Project:
 * Machine Learning algorithms designed for the prediction of toxicological
 * features of chemical compounds become available on the Web. Yaqp is developed
 * under OpenTox (http://opentox.org) which is an FP7-funded EU research project.
 * This project was developed at the Automatic Control Lab in the Chemical Engineering
 * School of National Technical University of Athens. Please read README for more
 * information.
 *
 * Copyright (C) 2009-2010 Pantelis Sopasakis & Charalampos Chomenides
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact:
 * Pantelis Sopasakis
 * chvng@mail.ntua.gr
 * Address: Iroon Politechniou St. 9, Zografou, Athens Greece
 * tel. +30 210 7723236
 */
package org.opentox.ontology.data;

import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.SimpleSelector;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.vocabulary.RDF;
import java.net.URI;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.opentox.core.exceptions.Cause;
import org.opentox.core.processors.Pipeline;
import org.opentox.io.processors.InputProcessor;
import org.opentox.io.publishable.OntObject;
import org.opentox.io.publishable.RDFObject;
import org.opentox.ontology.exceptions.ImproperEntityException;
import org.opentox.ontology.exceptions.YaqpOntException;
import org.opentox.ontology.namespaces.OTClass;
import org.opentox.ontology.namespaces.OTDataTypeProperties;
import org.opentox.ontology.namespaces.OTObjectProperties;
import org.opentox.ontology.processors.InstancesProcessor;
import org.opentox.qsar.processors.filters.AbstractFilter;
import org.opentox.qsar.processors.filters.AttributeCleanup;
import org.opentox.qsar.processors.filters.AttributeCleanup.ATTRIBUTE_TYPE;
import org.opentox.qsar.processors.filters.SimpleMVHFilter;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

/**
 *
 * A set of data which can be used for training or testing a model. The dataset
 * wraps an ontological model ({@link OntObject}) and can be converted into the
 * <code>Instances</code> representation used by weka.
 * @author Pantelis Sopasakis
 * @author Charalampos Chomenides
 */
@SuppressWarnings({"unchecked"})
public class Dataset {

    /**
     * The ontological model holding the representation of the dataset.
     */
    private final OntObject oo;
    /**
     * The name of the first attribute in the dataset, corresponding to a
     * unique identifier for the compound.
     */
    private static final String compound_uri = "compound_uri";

    /**
     * The kinds of weka attribute a feature of the dataset is mapped to.
     * <code>general</code> stands for a resource that is declared as a feature
     * without being declared numeric, string or nominal.
     */
    private enum WekaDataTypes {
        string,
        nominal,
        numeric,
        general;
    }

    /**
     * A dataset is instantiated providing an OntObject, which is an ontological model.
     * The class {@link OntObject } is in fact an extension of <code>OntModelImpl</code>
     * of jena. Such an object (OntObject) can be retrieved from a remote dataset server,
     * or from a local resource (e.g. file) using the <code>InputProcessor</code>.
     * @param oo An ontological object holding a representation of a dataset. If an improper
     * ontological entity is provided to construct the Dataset, methods like
     * {@link Dataset#getInstances() getInstances()} are not likely to work, so you have to
     * check that the resource you provided is a dataset resource.
     * @see DatasetBuilder
     */
    public Dataset(OntObject oo) {
        this.oo = oo;
    }

    /**
     * The dataset as <code>Instances</code>. These objects are used by weka as
     * input/output object to most algorithms (training, data preprocessing etc).
     * The Instances equivalent of the dataset may contain three different types of
     * <code>attributes</code>: numeric, nominal and/or string ones. The first attribute
     * is always a string one corresponding to the compound of the dataentry while
     * acting as an identifier for it. The name of this attribute is <code>compound_uri</code>.
     * <p>
     * Values that cannot be represented are left as missing values: numeric values
     * that cannot be parsed as a double, nominal values which are not among the
     * accepted values of their feature, values pointing to a feature that is not
     * declared in the model, and the compound identifier of a data entry that has
     * no compound URI.
     * </p>
     * @return Instances object for the dataset.
     * @throws YaqpOntException In case something goes wrong with the provided
     * representation (e.g. it does not correspond to a valid dataset, or it
     * contains more than one dataset).
     */
    public Instances getInstances() throws YaqpOntException {

        // SOME INITIAL DEFINITIONS (resolved once, before any iteration starts):
        Resource dataEntryClass = OTClass.DataEntry.getOntClass(oo),
                datasetClass = OTClass.Dataset.getOntClass(oo),
                featureClass = OTClass.Feature.getOntClass(oo),
                numericFeatureClass = OTClass.NumericFeature.getOntClass(oo),
                nominalFeatureClass = OTClass.NominalFeature.getOntClass(oo),
                stringFeatureClass = OTClass.StringFeature.getOntClass(oo);
        Map<Resource, WekaDataTypes> featureTypes = new HashMap<Resource, WekaDataTypes>();
        Map<Resource, ArrayList<String>> featureNominalValues = new HashMap<Resource, ArrayList<String>>();

        // THE URI OF THE (UNIQUE) DATASET BECOMES THE RELATION NAME:
        String relationName = findDatasetUri(datasetClass);

        // POPULATE THE MAPS WHICH CORRELATE FEATURES TO WEKA DATA TYPES / NOMINAL RANGES:
        classifyFeatures(featureClass, numericFeatureClass, nominalFeatureClass, stringFeatureClass,
                featureTypes, featureNominalValues);

        // GET THE ATTRIBUTES FOR THE DATASET:
        FastVector attributes = getAttributes(featureTypes, featureNominalValues);
        Instances data = new Instances(relationName, attributes, 0);

        // ITERATE OVER ALL DATA ENTRIES IN THE DATASET:
        StmtIterator dataEntryIterator =
                oo.listStatements(new SimpleSelector(null, RDF.type, dataEntryClass));
        try {
            while (dataEntryIterator.hasNext()) {
                Instance temp = toInstance(dataEntryIterator.next().getSubject(), data, featureTypes);

                // Add the Instance only if its compatible with the dataset!
                if (data.checkInstance(temp)) {
                    data.add(temp);
                } else {
                    System.err.println("Warning! The instance " + temp + " is not compatible with the dataset!");
                }
            }
        } finally {
            dataEntryIterator.close();
        }

        return data;

    }

    /**
     * Looks up the single resource of type Dataset in the model.
     * @param datasetClass The ontological class of datasets.
     * @return The URI of the dataset resource.
     * @throws YaqpOntException If the model contains no dataset (an
     * {@link ImproperEntityException} is thrown) or more than one datasets.
     */
    private String findDatasetUri(Resource datasetClass) throws YaqpOntException {
        StmtIterator dataSetIterator =
                oo.listStatements(new SimpleSelector(null, RDF.type, datasetClass));
        try {
            if (!dataSetIterator.hasNext()) {
                // this is not a dataset model
                throw new ImproperEntityException(Cause.XIE2, "Not a dataset");
            }
            String datasetUri = dataSetIterator.next().getSubject().getURI();
            if (dataSetIterator.hasNext()) {
                throw new YaqpOntException(Cause.XONT518, "More than one datasets found");
            }
            return datasetUri;
        } finally {
            // the iterator is released on the exceptional paths as well
            dataSetIterator.close();
        }
    }

    /**
     * Assigns a weka data type to every resource declared as a Feature and, for
     * nominal features, collects their accepted values.
     * @param featureClass The ontological class of features.
     * @param numericFeatureClass The ontological class of numeric features.
     * @param nominalFeatureClass The ontological class of nominal features.
     * @param stringFeatureClass The ontological class of string features.
     * @param featureTypes Output map: feature to its weka data type.
     * @param featureNominalValues Output map: nominal feature to its accepted values.
     */
    private void classifyFeatures(Resource featureClass, Resource numericFeatureClass,
            Resource nominalFeatureClass, Resource stringFeatureClass,
            Map<Resource, WekaDataTypes> featureTypes,
            Map<Resource, ArrayList<String>> featureNominalValues) {
        StmtIterator featureIterator =
                oo.listStatements(new SimpleSelector(null, RDF.type, featureClass));
        try {
            while (featureIterator.hasNext()) {
                Resource feature = featureIterator.next().getSubject().as(Resource.class);
                Set<Resource> declaredTypes = declaredTypesOf(feature);
                // Precedence when several types are declared: numeric, string, nominal.
                if (declaredTypes.contains(numericFeatureClass)) {
                    featureTypes.put(feature, WekaDataTypes.numeric);
                } else if (declaredTypes.contains(stringFeatureClass)) {
                    featureTypes.put(feature, WekaDataTypes.string);
                } else if (declaredTypes.contains(nominalFeatureClass)) {
                    featureTypes.put(feature, WekaDataTypes.nominal);
                    // GET THE RANGE OF THE FEATURE:
                    featureNominalValues.put(feature, acceptedValuesOf(feature));
                } else {
                    assert (declaredTypes.contains(featureClass));
                    featureTypes.put(feature, WekaDataTypes.general);
                }
            }
        } finally {
            featureIterator.close();
        }
    }

    /**
     * Collects all objects of the <code>rdf:type</code> statements of a resource.
     * @param feature The resource whose declared types are requested.
     * @return The set of declared types.
     */
    private Set<Resource> declaredTypesOf(Resource feature) {
        Set<Resource> declaredTypes = new HashSet<Resource>();
        StmtIterator featureTypeIterator =
                oo.listStatements(new SimpleSelector(feature, RDF.type, (RDFNode) null));
        try {
            while (featureTypeIterator.hasNext()) {
                declaredTypes.add(featureTypeIterator.next().getObject().as(Resource.class));
            }
        } finally {
            featureTypeIterator.close();
        }
        return declaredTypes;
    }

    /**
     * Collects the accepted values (the range) of a nominal feature.
     * @param feature A nominal feature.
     * @return The string form of every <code>acceptValue</code> literal of the feature.
     */
    private ArrayList<String> acceptedValuesOf(Resource feature) {
        ArrayList<String> acceptedValues = new ArrayList<String>();
        StmtIterator acceptValueIterator = oo.listStatements(
                new SimpleSelector(feature, OTDataTypeProperties.acceptValue.createProperty(oo), (RDFNode) null));
        try {
            while (acceptValueIterator.hasNext()) {
                acceptedValues.add(acceptValueIterator.next().getObject().as(Literal.class).getString());
            }
        } finally {
            acceptValueIterator.close();
        }
        return acceptedValues;
    }

    /**
     * Converts one data entry into a weka instance. Every attribute starts as
     * a missing value and is overwritten by the compound identifier and by the
     * values found for the data entry.
     * @param dataEntry The data entry resource.
     * @param data The (header of the) dataset the instance is built for.
     * @param featureTypes The weka data type of every declared feature.
     * @return An instance of weight 1.0; it is not added to <code>data</code>.
     */
    private Instance toInstance(Resource dataEntry, Instances data, Map<Resource, WekaDataTypes> featureTypes) {
        double[] vals = new double[data.numAttributes()];
        for (int i = 0; i < vals.length; i++) {
            vals[i] = Instance.missingValue();
        }

        // THE COMPOUND OF THE DATA ENTRY IDENTIFIES THE INSTANCE:
        String compoundName = null;
        StmtIterator compoundNamesIterator =
                oo.listStatements(new SimpleSelector(dataEntry, OTObjectProperties.compound.createProperty(oo), (Resource) null));
        try {
            if (compoundNamesIterator.hasNext()) {
                compoundName = compoundNamesIterator.next().getObject().as(Resource.class).getURI();
            }
        } finally {
            compoundNamesIterator.close();
        }
        // A data entry without a compound URI keeps a missing identifier; a null
        // name cannot be registered as a string value of the attribute.
        if (compoundName != null) {
            Attribute identifier = data.attribute(compound_uri);
            vals[identifier.index()] = identifier.addStringValue(compoundName);
        }

        // FOR EVERY DATA ENTRY, ITERATE OVER ALL VALUES NODES:
        StmtIterator valuesIterator =
                oo.listStatements(new SimpleSelector(dataEntry, OTObjectProperties.values.createProperty(oo), (Resource) null));
        try {
            while (valuesIterator.hasNext()) {
                Statement values = valuesIterator.next();

                /*
                 * A pair of the form (AttributeName, AttributeValue) is created.
                 * This is registered in the array of values of the instance.
                 */

                // atVal is the value of the attribute
                Literal valueLiteral = values.getProperty(OTDataTypeProperties.value.createProperty(oo)).getObject().as(Literal.class);
                String atVal = valueLiteral.getValue().toString();
                // and atName is the name of the corresponding attribute.
                Resource feature = values.getProperty(OTObjectProperties.feature.createProperty(oo)).getObject().as(Resource.class);
                String atName = attributeName(feature);

                WekaDataTypes featureType = featureTypes.get(feature);
                Attribute attribute = data.attribute(atName);
                if (featureType == null || attribute == null) {
                    // The value refers to a feature which is not declared in the
                    // model, so there is no attribute to store it in.
                    continue;
                }

                if (featureType == WekaDataTypes.numeric) {
                    try {
                        vals[attribute.index()] = Double.parseDouble(atVal);
                        /**
                         * The following catch rule, handles cases where some values are declared
                         * as numeric (double, float etc) but their value cannot be cast as
                         * double.
                         */
                    } catch (NumberFormatException ignored) {
                        /* Just don't include this value in the dataset */
                    }
                } else if (featureType == WekaDataTypes.string) {
                    vals[attribute.index()] = attribute.addStringValue(atVal);
                } else if (XSDDatatype.XSDdate.getURI().equals(atName)) {
                    // NOTE(review): this compares the name of the feature with the URI of the
                    // xsd:date datatype; it was possibly meant to test the datatype of the
                    // value instead - confirm. The check is kept as it was.
                    try {
                        vals[attribute.index()] = attribute.parseDate(atVal);
                    } catch (ParseException ex) {
                        System.out.println(ex);
                        //Logger.getLogger(Dataset.class.getName()).log(Level.SEVERE, null, ex);
                    }
                } else if (featureType == WekaDataTypes.nominal) {
                    // The labels of the attribute were read with getString(), so the
                    // same form of the literal is used to look the value up. A value
                    // outside the accepted range stays missing.
                    int labelIndex = attribute.indexOfValue(valueLiteral.getString());
                    if (labelIndex >= 0) {
                        vals[attribute.index()] = labelIndex;
                    }
                }
                // NOTE(review): values of features without a specific type ('general')
                // are not copied into the instance - confirm whether this is intended.
            }
        } finally {
            valuesIterator.close();
        }
        return new Instance(1.0, vals);
    }

    /**
     * The name of the weka attribute which corresponds to a feature. This is the
     * URI of the feature; for a feature without a URI its string representation
     * is used so that the attribute never has a <code>null</code> name.
     * @param feature A feature of the dataset.
     * @return The name of the attribute for the feature.
     */
    private static String attributeName(Resource feature) {
        String uri = feature.getURI();
        return uri != null ? uri : feature.toString();
    }

    /**
     * Creates the attributes of the dataset: first the string attribute
     * <code>compound_uri</code>, then the numeric and string attributes (features
     * without a specific type are declared as string attributes) and finally the
     * nominal ones.
     * @param featureTypes The weka data type of every feature.
     * @param nominalValues The accepted values of every nominal feature.
     * @return The attributes as a <code>FastVector</code>.
     */
    private FastVector getAttributes(Map<Resource, WekaDataTypes> featureTypes, Map<Resource, ArrayList<String>> nominalValues) {
        FastVector atts = new FastVector();
        // THE EXISTENCE OF THE (STRING) ATTRIBUTE 'COMPOUND_URI' IS MANDATORY FOR ALL
        // DATASETS. THIS IS ALWAYS THE FIRST ATTRIBUTE IN THE LIST.
        atts.addElement(new Attribute(compound_uri, (FastVector) null));
        // ADD NUMERIC AND STRING ATTRIBUTES INTO THE FASTVECTOR:
        for (Entry<Resource, WekaDataTypes> entry : featureTypes.entrySet()) {
            WekaDataTypes dataType = entry.getValue();
            if (dataType == WekaDataTypes.numeric) {
                atts.addElement(new Attribute(attributeName(entry.getKey())));
            } else if (dataType == WekaDataTypes.string || dataType == WekaDataTypes.general) {
                // a null FastVector of values declares a string attribute
                atts.addElement(new Attribute(attributeName(entry.getKey()), (FastVector) null));
            }
        }
        // COPE WITH NOMINAL VALUES:
        for (Entry<Resource, ArrayList<String>> entry : nominalValues.entrySet()) {
            FastVector nominalFVec = new FastVector(entry.getValue().size());
            for (String nominalValue : entry.getValue()) {
                nominalFVec.addElement(nominalValue);
            }
            atts.addElement(new Attribute(attributeName(entry.getKey()), nominalFVec));
        }
        return atts;
    }

    /**
     * Ad-hoc demonstration: a pipeline which reads a dataset from a URI, converts
     * it into Instances and applies two filters on it.
     * @param args Not used.
     * @throws Exception If any step of the pipeline fails.
     */
    public static void main(String[] args) throws Exception {

        InputProcessor<OntObject> p1 = new InputProcessor<OntObject>();
        DatasetBuilder p2 = new DatasetBuilder();
        InstancesProcessor p3 = new InstancesProcessor();
        AbstractFilter filter1 = new AttributeCleanup(new ATTRIBUTE_TYPE[] {ATTRIBUTE_TYPE.string});
        AbstractFilter filter = new SimpleMVHFilter();

        Pipeline pipe = new Pipeline();
        pipe.add(p1);
        pipe.add(p2);
        pipe.add(p3);
        pipe.add(filter1);
        pipe.add(filter);

        Instances data = (Instances) pipe.process(new URI("http://localhost/9"));

    }

    /**
     * Wraps the underlying ontological model in an {@link RDFObject}.
     * @return A new RDFObject constructed from the model of this dataset.
     */
    public RDFObject getRDF(){
        return new RDFObject(oo);
    }
}