/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import java.util.*;
/**
* Class for Normalized Web Distance calculation
* @author Themistoklis Mavridis
*/
public class NWD_total {

    /**
     * Score given to candidate terms that already appear in the query list.
     * It is not computed through NWD_Analysis; the large value places such terms
     * at the end of the ascending ordering (same value as the literal
     * "10000000000000000" parsed as a double).
     */
    private static final double QUERY_TERM_SCORE = 1.0e16;

    /**
     * Method to get the NWD score.
     * <p>
     * Every candidate term is scored against the current query
     * ({@code queries.get(i)}) through {@code NWD_Analysis.NWD_score}; candidates
     * that equal (ignoring case) any entry of {@code queries} receive
     * {@link #QUERY_TERM_SCORE} instead. The candidates are then ordered by
     * ascending score and the ones whose score is less than or equal to the
     * threshold are selected.
     *
     * @param ngd_arr the array of terms combinations to compare to the original query
     * @param queries the original queries list
     * @param ngd_threshold the nwd threshold (inclusive upper bound for a selected score)
     * @param i the index of the current query
     * @param config_path the directory where all the configuration files are stored
     * @return the indexes into {@code ngd_arr} of the terms whose score does not exceed
     *         the threshold, ordered by ascending score (equal scores keep their
     *         original relative order); empty if no term qualifies
     */
    public int[] call(String[] ngd_arr,List<String> queries,Double ngd_threshold,int i, String config_path) {
        NWD_Analysis ngd = new NWD_Analysis();
        final double[] scores = new double[ngd_arr.length];
        System.out.println("into ngd total");

        // The value for the current query is obtained once and handed to every
        // NWD_score call instead of being recomputed per candidate term.
        String currentQuery = queries.get(i);
        double queryOriginalResults = ngd.logResults(currentQuery, config_path);

        for (int j = 0; j < scores.length; j++) {
            if (isQueryTerm(ngd_arr[j], queries)) {
                // a word that is already one of the queries gets no real score
                scores[j] = QUERY_TERM_SCORE;
            } else {
                scores[j] = ngd.NWD_score(currentQuery, ngd_arr[j], config_path, queryOriginalResults);
            }
        }

        // Sort the positions rather than the scores themselves, so the original
        // index of every score is known without any score-to-index lookup
        // (which would be ambiguous for equal scores).
        Integer[] order = new Integer[scores.length];
        for (int j = 0; j < order.length; j++) {
            order[j] = j;
        }
        // Arrays.sort on an object array is stable, so ties stay in input order.
        Arrays.sort(order, new Comparator<Integer>() {
            @Override
            public int compare(Integer left, Integer right) {
                return Double.compare(scores[left], scores[right]);
            }
        });

        // The ordering is ascending, so the qualifying scores form a prefix.
        int counter = 0;
        while (counter < order.length
                && Double.compare(scores[order[counter]], ngd_threshold) <= 0) {
            counter++;
        }

        // Copy every qualifying index, including the last one.
        int[] origIndex = new int[counter];
        for (int j = 0; j < counter; j++) {
            origIndex[j] = order[j];
        }
        return origIndex;
    }

    /** Tells whether the term equals, ignoring case, any entry of the query list. */
    private static boolean isQueryTerm(String term, List<String> queries) {
        for (String query : queries) {
            if (term.equalsIgnoreCase(query)) {
                return true;
            }
        }
        return false;
    }
}