package ivory.bloomir.util;

import java.util.List;
import java.util.SortedMap;
import java.util.Map;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import ivory.core.data.stat.SpamPercentileScore;

/**
 * Static helpers for renumbering document ids by spam/quality score.
 *
 * @author Nima Asadi
 */
public class DocumentUtility {
  /**
   * Generates new document ids based on spam/quality scores.
   *
   * <p>Documents are ranked by descending raw score: the document with the
   * highest raw score receives new id 1, the next one id 2, and so on.
   * Documents sharing the same raw score keep their original relative order.
   * Document ids are 1-based; index 0 of the returned array is never assigned
   * and therefore stays 0.
   *
   * @param spamScores Spam percentile scores
   * @return Array of integers of length {@code getDocCount() + 1} where the
   *         value at index i is the new document id for document i.
   * @throws NullPointerException if {@code spamScores} is null
   */
  public static int[] spamSortDocids(SpamPercentileScore spamScores) {
    Preconditions.checkNotNull(spamScores);

    int[] newDocids = new int[spamScores.getDocCount() + 1];

    // Buckets of original docids keyed by the NEGATED raw score, so that the
    // map's ascending key order visits the highest raw scores first.
    SortedMap<Integer, List<Integer>> spamSortedDocids = Maps.newTreeMap();
    for (int i = 1; i < newDocids.length; i++) {
      int score = -spamScores.getRawScore(i);

      // Single lookup; create the bucket lazily on first use of this score.
      List<Integer> bucket = spamSortedDocids.get(score);
      if (bucket == null) {
        bucket = Lists.newArrayList();
        spamSortedDocids.put(score, bucket);
      }
      // Docids are appended in increasing order, which keeps ties stable.
      bucket.add(i);
    }

    // Hand out consecutive new ids, bucket by bucket, in sorted key order.
    int index = 1;
    for (List<Integer> bucket : spamSortedDocids.values()) {
      for (int docno : bucket) {
        newDocids[docno] = index++;
      }
    }
    return newDocids;
  }

  /**
   * Retrieves old docid for spam sorted docids generated by {@link #spamSortDocids},
   * thereby providing a reverse-look up table.
   *
   * <p>The input is expected to be a permutation of {@code 0 .. length - 1}
   * (as produced by {@link #spamSortDocids}, where index 0 maps to 0). A value
   * outside that range causes an {@link ArrayIndexOutOfBoundsException}; if a
   * value occurs more than once, the later index overwrites the earlier one.
   *
   * @param spamSortedDocidsMap Array of integers generated
   *        by {@link #spamSortDocids}
   * @return An array of integers, of the same length as the input, where the
   *         value at index i is the old document id for (new) document i
   * @throws NullPointerException if {@code spamSortedDocidsMap} is null
   */
  public static int[] reverseLookupSpamSortedDocids(int[] spamSortedDocidsMap) {
    Preconditions.checkNotNull(spamSortedDocidsMap);

    int[] buffer = new int[spamSortedDocidsMap.length];
    for (int i = 0; i < buffer.length; i++) {
      // Invert the mapping: old id i was renumbered to spamSortedDocidsMap[i].
      buffer[spamSortedDocidsMap[i]] = i;
    }
    return buffer;
  }
}