/*
* Copyright 2013 SciFY NPO <info@scify.org>.
*
* This product is part of the NewSum Free Software.
* For more information about NewSum visit
*
* http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* If this code or its output is used, extended, re-engineered, integrated,
* or embedded to any extent in another software or hardware, there MUST be
* an explicit attribution to this work in the resulting source code,
* the packaging (where such packaging exists), or user interface
* (where such an interface exists).
* The attribution must be of the form "Powered by NewSum, SciFY"
*/
package org.scify.NewSumServer.Server.MachineLearning;
import org.scify.NewSumServer.Server.MachineLearning.util;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramSymWinGraph;
import gr.demokritos.iit.jinsect.storage.INSECTDB;
import gr.demokritos.iit.jinsect.structs.GraphSimilarity;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
/**
* Provides the methods that create the similarity vector used for
* {@link dataSets#labelingSet(gr.demokritos.iit.jinsect.storage.INSECTDB, java.lang.String, java.lang.String) labeling}
* or
* {@link dataSets#trainingSet(gr.demokritos.iit.jinsect.storage.INSECTDB, java.lang.String) training}.
*
* @author panagiotis giotis
*/
public class vector {

    /** INSECTDB object category under which the class (category) graphs are stored. */
    private static final String CLASS_GRAPH_CATEGORY = "cg";

    /** INSECTDB object category under which the instance graphs are stored. */
    private static final String INSTANCE_GRAPH_CATEGORY = "ig";

    /**
     * Computes one similarity score between the given text and every class
     * graph stored in the database, and joins the scores into a single string.
     *
     * @param text The text to label; it is converted to a graph with
     * {@code setDataString}
     * @param file The INSECTDB holding the class graphs (category "cg")
     * @return The scores, each followed by a comma, in the iteration order of
     * the set of class graph names (for example {@code 0.2,0.1,0.0,1.0,});
     * an empty string when the database lists no class graphs
     */
    public static String labellingVector(String text, INSECTDB file) {
        NGramCachedGraphComparator ngc = new NGramCachedGraphComparator();
        DocumentNGramSymWinGraph textGraph = new DocumentNGramSymWinGraph();
        textGraph.setDataString(text); // convert text to graph

        StringBuilder features = new StringBuilder();
        for (String categoryName : distinctNames(file, CLASS_GRAPH_CATEGORY)) {
            DocumentNGramSymWinGraph categoryGraph =
                    (DocumentNGramSymWinGraph) file.loadObject(categoryName, CLASS_GRAPH_CATEGORY);
            // The trailing comma after the last score is part of the returned format.
            features.append(similarityRatio(ngc, textGraph, categoryGraph)).append(',');
        }
        return features.toString();
    }

    /**
     * Computes, for every instance graph stored in the database, its
     * similarity scores against all class graphs followed by the category
     * name of the instance.
     *
     * @param file The INSECTDB holding the class graphs (category "cg") and
     * the instance graphs (category "ig")
     * @return One string per instance graph: the comma-terminated scores (in
     * the iteration order of the set of class graph names) followed by the
     * category name taken from the instance's record line
     */
    public static ArrayList<String> trainingVector(INSECTDB file) {
        NGramCachedGraphComparator ngc = new NGramCachedGraphComparator();

        // Load every class graph once up front so that it is not read again
        // from the database for each instance.
        ArrayList<DocumentNGramSymWinGraph> categoryGraphs = new ArrayList<DocumentNGramSymWinGraph>();
        for (String categoryGraphName : distinctNames(file, CLASS_GRAPH_CATEGORY)) {
            categoryGraphs.add(
                    (DocumentNGramSymWinGraph) file.loadObject(categoryGraphName, CLASS_GRAPH_CATEGORY));
        }

        ArrayList<String> vectors = new ArrayList<String>();
        for (String instanceName : distinctNames(file, INSTANCE_GRAPH_CATEGORY)) {
            // NOTE(review): the category name is the second ':'-separated field of
            // the record line; a line without ':' raises
            // ArrayIndexOutOfBoundsException here - confirm the format produced
            // by util.recordLine.
            String[] recordFields = util.recordLine(instanceName).trim().split(":");
            String categoryName = recordFields[1];

            DocumentNGramSymWinGraph instanceGraph =
                    (DocumentNGramSymWinGraph) file.loadObject(instanceName, INSTANCE_GRAPH_CATEGORY);

            StringBuilder features = new StringBuilder();
            for (DocumentNGramSymWinGraph categoryGraph : categoryGraphs) {
                features.append(similarityRatio(ngc, instanceGraph, categoryGraph)).append(',');
            }
            features.append(categoryName);
            vectors.add(features.toString());
        }
        return vectors;
    }

    /**
     * Collects the names of all objects stored under the given category into a set.
     *
     * The set is deliberately built with the default constructor followed by
     * {@code addAll}: the order of the scores in the produced vectors is the
     * iteration order of this set, and a differently sized hash table could
     * iterate in a different order.
     */
    private static HashSet<String> distinctNames(INSECTDB file, String category) {
        HashSet<String> names = new HashSet<String>();
        names.addAll(Arrays.asList(file.getObjectList(category)));
        return names;
    }

    /**
     * Compares two graphs and returns value similarity divided by size
     * similarity, or 0.0 when the size similarity is 0.0.
     */
    private static double similarityRatio(NGramCachedGraphComparator comparator,
            DocumentNGramSymWinGraph first, DocumentNGramSymWinGraph second) {
        GraphSimilarity gs = comparator.getSimilarityBetween(first, second);
        return (gs.SizeSimilarity == 0.0) ? 0.0 : gs.ValueSimilarity / gs.SizeSimilarity;
    }
}