package focusedCrawler.seedfinder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.PaginaURL;
import focusedCrawler.util.string.StopListFile;
/**
 * Maintains a score per term, built from pages labelled as relevant or
 * irrelevant, and answers "which terms currently score best" queries.
 *
 * <p>Each added document contributes {@code tf / docLength} to the score of
 * every distinct term it contains: added for relevant documents, subtracted
 * for irrelevant ones. Terms that are {@code null} or shorter than two
 * characters after trimming are ignored.
 *
 * <p>This class performs no synchronization.
 */
public class RelevanceModel {

    /** Score given by {@link #reweightScore} to a term that has no score yet. */
    private static final double DEFAULT_TERM_SCORE = 1d / 100;

    /** Minimum trimmed length a term must have to be scored. */
    private static final int MIN_TERM_LENGTH = 2;

    private final Map<String, Double> termScores = new HashMap<>();

    /**
     * Parses the page, stores the resulting {@link ParsedData} on it, and
     * feeds the page's usable words into the model.
     *
     * @param isRelevant whether the page counts as a positive example
     * @param page       page to parse; its parsed data is overwritten
     */
    public void addPage(boolean isRelevant, Page page) {
        PaginaURL pageParser = new PaginaURL(page.getURL(), page.getContentAsString(), StopListFile.DEFAULT);
        page.setParsedData(new ParsedData(pageParser));
        String[] terms = page.getParsedData().getWords();
        List<String> words = new ArrayList<String>();
        if (terms != null) {
            for (String term : terms) {
                if (isUsableTerm(term)) {
                    words.add(term);
                }
            }
        }
        this.addPage(isRelevant, words.toArray(new String[words.size()]));
    }

    /**
     * Updates the term scores with one document.
     *
     * <p>Every distinct usable term moves by {@code tf / docTerms.length},
     * where {@code tf} is the number of occurrences of the term in
     * {@code docTerms}. The denominator is the full array length, including
     * entries that are ignored as terms.
     *
     * @param isRelevant {@code true} to add the contribution, {@code false} to subtract it
     * @param docTerms   terms of the document; {@code null} or empty is a no-op
     */
    public void addPage(boolean isRelevant, String[] docTerms) {
        if (docTerms == null || docTerms.length == 0) {
            return;
        }
        Map<String, Integer> termFrequencies = countTerms(docTerms);
        // Iterate over distinct terms: looping over every occurrence would
        // apply tf / length once per occurrence, i.e. tf^2 / length in total.
        for (Entry<String, Integer> frequency : termFrequencies.entrySet()) {
            String term = frequency.getKey();
            double contribution = ((double) frequency.getValue()) / docTerms.length;
            Double current = termScores.get(term);
            double oldScore = (current == null) ? 0d : current;
            double newScore = isRelevant ? oldScore + contribution : oldScore - contribution;
            termScores.put(term, newScore);
        }
    }

    /** Returns whether a term is non-null and long enough to be scored. */
    private static boolean isUsableTerm(String term) {
        return term != null && term.trim().length() >= MIN_TERM_LENGTH;
    }

    /** Counts the occurrences of each usable term in the given array. */
    private Map<String, Integer> countTerms(String[] docTerms) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String term : docTerms) {
            if (!isUsableTerm(term)) {
                continue;
            }
            Integer count = counts.get(term);
            counts.put(term, (count == null) ? 1 : count + 1);
        }
        return counts;
    }

    /**
     * Returns the current score of a term.
     *
     * @param term term to look up
     * @return the stored score, or {@code 0} when the term has no score yet
     *         (the same baseline {@code addPage} starts from)
     */
    double getTermScore(String term) {
        Double score = termScores.get(term);
        return (score == null) ? 0d : score;
    }

    /**
     * Multiplies the score of a term by the given factor and stores the result.
     * A term without a score starts from {@code 0.01} before the multiplication.
     *
     * @param term           term whose score is rescaled
     * @param queryPrecision multiplier applied to the current score
     * @return the new score stored for the term
     */
    public double reweightScore(String term, double queryPrecision) {
        Double current = termScores.get(term);
        double oldScore = (current == null) ? DEFAULT_TERM_SCORE : current;
        double newScore = oldScore * queryPrecision;
        termScores.put(term, newScore);
        return newScore;
    }

    /**
     * Returns the highest-scoring terms, best first.
     *
     * @param size maximum number of terms to return
     * @return at most {@code size} terms; empty when {@code size <= 0} or the
     *         model has no terms
     */
    public List<QueryTerm> getTermsWithBestScores(int size) {
        List<QueryTerm> terms = new ArrayList<QueryTerm>();
        if (size <= 0) {
            return terms;
        }
        for (Entry<String, Double> termScore : sortByValueInDescendingOrder(termScores).entrySet()) {
            terms.add(new QueryTerm(termScore.getKey(), termScore.getValue()));
            // Stop as soon as the requested number of terms is reached.
            if (terms.size() >= size) {
                break;
            }
        }
        return terms;
    }

    /**
     * Returns the single highest-scoring term.
     *
     * @return the best term, or {@code null} when the model has no terms
     */
    public QueryTerm getTermsWithBestScore() {
        if (termScores.isEmpty()) {
            return null;
        }
        Entry<String, Double> first = sortByValueInDescendingOrder(termScores).entrySet().iterator().next();
        return new QueryTerm(first.getKey(), first.getValue());
    }

    /**
     * Returns the highest-scoring term that is not in the exclusion set.
     *
     * @param exceptions terms to skip; {@code null} is treated as "skip nothing"
     * @return the best non-excluded term, or {@code null} when there is none
     */
    public QueryTerm getTermWithBestScoreExcept(Set<String> exceptions) {
        for (Entry<String, Double> ts : sortByValueInDescendingOrder(termScores).entrySet()) {
            if (exceptions == null || !exceptions.contains(ts.getKey())) {
                return new QueryTerm(ts.getKey(), ts.getValue());
            }
        }
        return null;
    }

    /**
     * Returns a new map with the entries of {@code map} ordered from the
     * largest to the smallest value. The input map is not modified.
     *
     * @param map map to sort
     * @return a {@link LinkedHashMap} whose iteration order is descending by value
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValueInDescendingOrder(Map<K, V> map) {
        List<Map.Entry<K, V>> list = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Reversed operands give descending order.
                return (o2.getValue()).compareTo(o1.getValue());
            }
        });
        Map<K, V> result = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> entry : list) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}