/*
* MetricSpearman.java
*
* Copyright (c) 2005-2007 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.experiment;
import wikipedia.kleinberg.Article;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
/** Pair of zero-based positions of one word in two word lists.
 * A value of -1 means the position in that list has not been set.
 */
class WordPosition {

    /** Position of the word in the first list, -1 until assigned. */
    public int pos1 = -1;

    /** Position of the word in the second list, -1 until assigned. */
    public int pos2 = -1;

    /** Creates a record with both positions unset (-1). */
    public WordPosition() {
    }
}
/** Spearman's rank correlation coefficient (Spearman's Footrule) is a metric.
* It assesses how well an arbitrary monotonic function could describe
* the relationship between two variables, without making any assumptions
* about the frequency distribution of the variables.
* See http://en.wikipedia.org/wiki/Spearman's_rank_correlation_coefficient
*
*/
/** Spearman's footrule for comparing two ranked lists of words.
 * The footrule is the sum of absolute differences between the positions
 * that the same word occupies in the two lists. It is related to, but is
 * not the same quantity as, Spearman's rank correlation coefficient, see
 * http://en.wikipedia.org/wiki/Spearman's_rank_correlation_coefficient
 *
 * All methods are static; the lists are supposed to contain no duplicates.
 */
public class MetricSpearman {

    public MetricSpearman() {
    }

    /** Returns true if a is b with precision eps, i.e. |a - b| &lt; eps
     * (strict inequality, so the result is false when any argument is NaN).
     */
    public static boolean equals(double a, double b, double eps) {
        return Math.abs(a - b) < eps;
    }

    /** Compares titles of two lists of articles. Lists can have different
     * length. It is supposed that list1 (list2) has no duplicate values.
     *
     * @param list1 first list of articles
     * @param list2 second list of articles
     * @return -1 if either list is null or empty, otherwise the footrule
     *         distance of the titles (see {@link #compare(String[], String[])});
     *         bigger returned value means bigger difference.
     */
    public static int compare(List<Article> list1, List<Article> list2) {
        if (null == list1 || list1.isEmpty() ||
            null == list2 || list2.isEmpty()) {
            return -1;
        }
        return compare(titles(list1), titles(list2));
    }

    /** Copies page titles of the articles into an array, keeping the order. */
    private static String[] titles(List<Article> articles) {
        String[] result = new String[articles.size()];
        int i = 0;
        // Iterate instead of get(i): get(i) is linear on linked lists.
        for (Article a : articles) {
            result[i++] = a.page_title;
        }
        return result;
    }

    /** Compares two lists of words. Lists can have different length.
     * It is supposed that list1 (list2) has no duplicate values.
     *
     * For every word of the shorter list (list2 when the lengths are equal)
     * that is present in the longer list, the absolute difference of its
     * positions in the two lists is added to the distance. For every word
     * of the shorter list that is absent in the longer list, the length of
     * the longer list is added.
     *
     * @param list1 first list of words
     * @param list2 second list of words
     * @return -1 if either array is null or empty, otherwise the distance;
     *         bigger returned value means bigger difference. The value
     *         saturates at Integer.MAX_VALUE instead of wrapping around.
     */
    public static int compare(String[] list1, String[] list2) {
        if (isNullOrEmpty(list1) || isNullOrEmpty(list2)) {
            return -1;
        }
        long dist = footruleDistance(list1, list2);
        // A wrapped (negative) value could be mistaken for the -1 error code.
        return dist > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) dist;
    }

    /** Returns true if the array is null or has no elements. */
    private static boolean isNullOrEmpty(String[] list) {
        return null == list || 0 == list.length;
    }

    /** Calculates the distance described in {@link #compare(String[], String[])}
     * for two non-null arrays, accumulating in long to avoid int overflow.
     */
    private static long footruleDistance(String[] list1, String[] list2) {
        String[] listShort;
        String[] listLong;
        if (list1.length < list2.length) {
            listShort = list1;
            listLong = list2;
        } else {
            listShort = list2;
            listLong = list1;
        }

        // m<Word, Position_in_long_list>; for a repeated word the last position wins.
        Map<String, Integer> positionInLong = new HashMap<String, Integer>();
        for (int i = 0; i < listLong.length; i++) {
            positionInLong.put(listLong[i], i);
        }

        long dist = 0;
        for (int i = 0; i < listShort.length; i++) {
            // Stored values are never null, so null means "word is absent".
            Integer pos = positionInLong.get(listShort[i]);
            if (null != pos) {
                dist += Math.abs(pos.intValue() - i);
            } else {
                dist += listLong.length;
            }
        }
        return dist;
    }

    /** Maps every word of list1 to its positions in both lists.
     * pos1 is the position in list1; pos2 is the position in list2,
     * or -1 if the word is absent in list2. Words that are only in list2
     * are not added to the map.
     */
    private static Map<String, WordPosition> createMap(String[] list1, String[] list2) {
        Map<String, WordPosition> map = new HashMap<String, WordPosition>();
        for (int i = 0; i < list1.length; i++) {
            WordPosition wp = new WordPosition();
            wp.pos1 = i;
            map.put(list1[i], wp);
        }
        for (int i = 0; i < list2.length; i++) {
            WordPosition wp = map.get(list2[i]);
            if (null != wp) {
                wp.pos2 = i;
            }
        }
        return map;
    }

    /** Calculates the similarity of two lists of words
     * (1 - similar, 0 - unrelated lists),
     * (S1, S2 - reranked lists with common elements) by the formula:
     * <pre>
     * Fr = 1 - sum|S1 - S2| / MaxFr
     * MaxFr = (|S|^2)/2         if |S| - even;
     * MaxFr = (|S|+1)*(|S|-1)/2 if |S| - odd.</pre>
     *
     * Only words present in both lists take part in the calculation.
     * Lists can have different length.
     * It is supposed that list1 (list2) has no duplicate values; with
     * duplicates the result is not meaningful.
     *
     * Special cases: 0 if the lists have no common words; 1 if they have
     * exactly one common word. Two one-element lists are compared ignoring
     * case, while longer lists are matched case-sensitively.
     *
     * @param list1 first list of words
     * @param list2 second list of words
     * @return -1 if either array is null or empty, otherwise the similarity.
     */
    public static double calcSpearmanFootrule(String[] list1, String[] list2) {
        if (isNullOrEmpty(list1) || isNullOrEmpty(list2)) {
            return -1;
        }

        if (1 == list1.length && 1 == list2.length) {
            String a = list1[0];
            String b = list2[0];
            if (null == a || null == b) {
                // Avoid NullPointerException; two nulls count as the same word.
                return a == b ? 1 : 0;
            }
            // Case-insensitive comparison is kept for backward compatibility.
            return a.equalsIgnoreCase(b) ? 1 : 0;
        }

        Map<String, WordPosition> map = createMap(list1, list2);

        // s1: common words in the order of list1.
        List<String> s1 = new ArrayList<String>();
        for (String s : list1) {
            if (-1 != map.get(s).pos2) {
                s1.add(s);
            }
        }

        // s2: common words in the order of list2.
        List<String> s2 = new ArrayList<String>();
        for (String s : list2) {
            WordPosition wp = map.get(s);
            if (null != wp && -1 != wp.pos1) {
                s2.add(s);
            }
        }
        assert(s1.size() == s2.size());

        if (s1.isEmpty() || s2.isEmpty()) {
            return 0;
        }
        if (1 == s1.size() && 1 == s2.size()) {
            // The single common word is the same string in both lists;
            // MaxFr would be 0 here, so the formula cannot be used.
            return 1;
        }

        long dist = footruleDistance(s1.toArray(new String[0]),
                                     s2.toArray(new String[0]));

        // long arithmetic: b*b overflows int for b > 46340.
        long b = s1.size();
        long maxFr;
        if (0 == b % 2) {   // even
            maxFr = b * b / 2;
        } else {            // odd
            maxFr = (b + 1) * (b - 1) / 2;
        }
        return 1 - (double) dist / maxFr;
    }

    /** Finds elements of small array in big one. Concatenates these elements
     * in one string using 'token', add position of these elements in big array.
     * Each found element is immediately followed by its zero-based index
     * in 'big', e.g. big = {a, b, c}, small = {c, a}, token = "," gives "a0,c2".
     *
     * @param big   array to scan; defines the order and the reported positions
     * @param small array of words to look for
     * @param token separator inserted between the found elements
     * @return the concatenated string, or an empty string if nothing is found
     *         or if 'big' or 'small' is null.
     */
    public static String findStringWithPosition(String[] big, String[] small, String token) {
        if (null == big || null == small) {
            return "";
        }
        Map<String, WordPosition> map = createMap(big, small);

        StringBuilder res = new StringBuilder();
        for (int i = 0; i < big.length; i++) {
            if (-1 != map.get(big[i]).pos2) {
                if (0 < res.length()) {
                    res.append(token);
                }
                res.append(big[i]).append(i);
            }
        }
        return res.toString();
    }
}
//class WordPosition {
// Position of word in the first list
// public int pos1;
/** Position of word in the second list */
//public int pos2;
//public int dist;
//};