package ivory.bloomir.util;
import java.util.List;
import java.util.SortedMap;
import java.util.Map;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import ivory.core.data.stat.SpamPercentileScore;
/**
* @author Nima Asadi
*/
public class DocumentUtility {
  /**
   * Computes a renumbering of document ids ordered by spam/quality score.
   *
   * <p>Original ids {@code 1..spamScores.getDocCount()} are grouped by raw score
   * and then numbered consecutively starting at 1, visiting the groups from the
   * largest raw score to the smallest. Documents sharing a raw score keep their
   * relative order (ascending original id). Slot 0 of the result is never
   * written and therefore stays 0.
   *
   * @param spamScores Spam percentile scores
   * @return Array of length {@code getDocCount() + 1} in which the value at
   *         index i is the new document id assigned to original document i.
   * @throws NullPointerException if {@code spamScores} is null
   */
  public static int[] spamSortDocids(SpamPercentileScore spamScores) {
    Preconditions.checkNotNull(spamScores);

    // One extra slot because document ids are used directly as indices from 1.
    int[] remapped = new int[spamScores.getDocCount() + 1];

    // Keys are negated raw scores, so the map's ascending key order walks the
    // documents from highest raw score to lowest.
    SortedMap<Integer, List<Integer>> docidsByNegatedScore = Maps.newTreeMap();
    for (int docid = 1; docid < remapped.length; docid++) {
      int negatedScore = -spamScores.getRawScore(docid);
      List<Integer> bucket = docidsByNegatedScore.get(negatedScore);
      if (bucket == null) {
        // First document seen with this score: open a new bucket for it.
        bucket = Lists.newArrayList();
        docidsByNegatedScore.put(negatedScore, bucket);
      }
      bucket.add(docid);
    }

    // Hand out consecutive new ids, bucket by bucket, in key order.
    int nextDocid = 1;
    for (Map.Entry<Integer, List<Integer>> scoreGroup : docidsByNegatedScore.entrySet()) {
      List<Integer> members = scoreGroup.getValue();
      for (int originalDocid : members) {
        remapped[originalDocid] = nextDocid;
        nextDocid++;
      }
    }
    return remapped;
  }

  /**
   * Inverts a table produced by {@link #spamSortDocids}, yielding the mapping
   * from new (spam-sorted) document id back to the original document id.
   *
   * <p>Every position of the input, including position 0, is processed: for
   * each index i, the output slot {@code spamSortedDocidsMap[i]} is set to i.
   *
   * @param spamSortedDocidsMap Array of integers generated
   *        by {@link #spamSortDocids}
   * @return An array of the same length as the input where the value at
   *         index i is the old document id for new document id i
   * @throws NullPointerException if {@code spamSortedDocidsMap} is null
   */
  public static int[] reverseLookupSpamSortedDocids(int[] spamSortedDocidsMap) {
    Preconditions.checkNotNull(spamSortedDocidsMap);

    int[] originalDocids = new int[spamSortedDocidsMap.length];
    int oldDocid = 0;
    for (int newDocid : spamSortedDocidsMap) {
      // The value stored at position oldDocid is the new id; record the inverse.
      originalDocids[newDocid] = oldDocid;
      oldDocid++;
    }
    return originalDocids;
  }
}