package org.wikipedia.miner.web.service; import gnu.trove.iterator.TIntIntIterator; import gnu.trove.iterator.TIntObjectIterator; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.set.hash.TIntHashSet; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.TreeSet; import javax.servlet.ServletConfig; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import org.simpleframework.xml.Attribute; import org.simpleframework.xml.ElementList; import org.wikipedia.miner.comparison.ArticleComparer; import org.wikipedia.miner.db.struct.DbLinkLocation; import org.wikipedia.miner.db.struct.DbLinkLocationList; import org.wikipedia.miner.model.Article; import org.wikipedia.miner.model.Page; import org.wikipedia.miner.model.Page.PageType; import org.wikipedia.miner.model.Wikipedia; import org.wikipedia.miner.util.RelatednessCache; import org.dmilne.xjsf.Service; import org.dmilne.xjsf.UtilityMessages.ErrorMessage; import org.dmilne.xjsf.UtilityMessages.ParameterMissingMessage; import org.dmilne.xjsf.param.FloatParameter; import org.dmilne.xjsf.param.IntListParameter; import org.dmilne.xjsf.param.IntParameter; import com.google.gson.annotations.Expose; public class SuggestService extends WMService { private static final long serialVersionUID = 2890788121538938947L; private IntListParameter prmQueryTopics ; private IntParameter prmSuggestionLimit ; private IntParameter prmCategoryLimit ; private IntParameter prmSearchSpace ; private FloatParameter prmMinIndividualRelatedness ; private FloatParameter prmMinAverageRelatedness ; public SuggestService() { super("query","Suggests alternative topics that are related to a set of seed topics", "<p>This service takes a set of seed topics, and suggests articles that relate to them. These suggestions are weighted by thier relatedness to the query, and organized by the categories they belong to.</p>" + "<p>It is designed to be used in conjunction with the <a href='services.html?search'>search</a> service</p>", false); } public void init(ServletConfig config) throws ServletException { super.init(config); prmQueryTopics = new IntListParameter("queryTopics", "A set of topic ids that suggestons should relate to", null) ; addGlobalParameter(prmQueryTopics) ; prmSuggestionLimit = new IntParameter("maxSuggestions", "Maximum number of suggested topics to return", 250) ; addGlobalParameter(prmSuggestionLimit) ; prmCategoryLimit = new IntParameter("maxCategories", "Maximum number of categories to organize suggestions under", 25) ; addGlobalParameter(prmCategoryLimit) ; prmSearchSpace = new IntParameter("searchSpace", "Maximum number of rough suggestions to search. Increasing this will likely provide better suggestions, but slower responses", 100000) ; addGlobalParameter(prmSearchSpace) ; prmMinIndividualRelatedness = new FloatParameter("minIndividualRelatedness", "Minimum relatedness a suggestion must have to each query topic", 0.2F) ; addGlobalParameter(prmMinIndividualRelatedness) ; prmMinAverageRelatedness = new FloatParameter("minAverageRelatedness", "Minimum average relatedness a suggestion must have to all query topics", 0.3F) ; addGlobalParameter(prmMinAverageRelatedness) ; Integer[] topics = {147313, 4913064} ; addExample( new ExampleBuilder("To see suggestions for <i>hiking new zealand</i>") .addParam(prmQueryTopics, topics) .build() ) ; } public Service.Message buildWrappedResponse(HttpServletRequest request) throws Exception { Integer[] queryTopicIds = prmQueryTopics.getValue(request) ; if (queryTopicIds == null || queryTopicIds.length == 0) return new ParameterMissingMessage(request) ; Wikipedia wikipedia = this.getWikipedia(request) ; Message msg = new Message(request) ; //identify query topics HashMap<Integer,Article> queryTopics = new HashMap<Integer,Article>() ; for (int id:queryTopicIds) { Page page = wikipedia.getPageById(id) ; if (page.exists() && page.getType() == PageType.article) queryTopics.put(id, (Article)page) ; } if (queryTopics.isEmpty()) return new ErrorMessage(request, "no valid query topic ids specified") ; //gather roughly weighted suggestions TreeSet<Article> roughSuggestions = getRoughSuggestions(queryTopics, wikipedia) ; //refine suggestions List<Article> refinedSuggestions = getRefinedSuggestions(roughSuggestions, queryTopics, wikipedia, request) ; // gather categories TIntObjectHashMap<SuggestionCategory> categoriesById = getCategoriesById(refinedSuggestions) ; // refine and sort categories, identify categorized topics TIntHashSet categorizedIds = new TIntHashSet() ; ArrayList<SuggestionCategory> refinedCategories = getSortedCategories(categoriesById, categorizedIds, request) ; //build xml response for (SuggestionCategory cat: refinedCategories) msg.addCategory(cat) ; msg.setUncategorizedSuggestions(refinedSuggestions, categorizedIds) ; return msg; } private TreeSet<Article> getRoughSuggestions(HashMap<Integer,Article> queryTopics, Wikipedia wikipedia) { // get a rough ranking of suggestions just from overlaps of ids TIntIntHashMap roughWeights = new TIntIntHashMap() ; for (Article topic:queryTopics.values()){ //gather rough suggestions from all out links DbLinkLocationList outLinks = wikipedia.getEnvironment().getDbPageLinkOut().retrieve(topic.getId()) ; if (outLinks != null && outLinks.getLinkLocations() != null) { for (DbLinkLocation outLink:outLinks.getLinkLocations()) { Integer weight = roughWeights.get(outLink.getLinkId()) ; if (weight == null) weight = 0 ; roughWeights.put(outLink.getLinkId(), weight+outLink.getSentenceIndexes().size()) ; } } //gather rough suggestions from all in links DbLinkLocationList inLinks = wikipedia.getEnvironment().getDbPageLinkIn().retrieve(topic.getId()) ; if (inLinks != null && inLinks.getLinkLocations() != null) { for (DbLinkLocation inLink:inLinks.getLinkLocations()) { Integer weight = roughWeights.get(inLink.getLinkId()) ; if (weight == null) weight = 0 ; roughWeights.put(inLink.getLinkId(), weight+inLink.getSentenceIndexes().size()) ; } } } //sort rough suggestions TreeSet<Article> roughSuggestions = new TreeSet<Article>() ; TIntIntIterator iter = roughWeights.iterator() ; while (iter.hasNext()) { iter.advance() ; if (!queryTopics.containsKey(iter.key())) { Article rs = new Article(wikipedia.getEnvironment(), iter.key()) ; //System.out.println("rs:" + rs + ", " + iter.value()) ; rs.setWeight((double)iter.value()) ; roughSuggestions.add(rs) ; } } //for (Article art:roughSuggestions) { // System.out.println("gatheredRs: " + art + ", " + getHub().format(art.getWeight())) ; //} System.out.println(roughSuggestions.size() + " rough suggestions") ; return roughSuggestions ; } private List<Article> getRefinedSuggestions(TreeSet<Article> roughSuggestions, HashMap<Integer, Article> queryTopics, Wikipedia wikipedia, HttpServletRequest request) throws Exception { ArrayList<Article> refinedSuggestions = new ArrayList<Article>() ; int searchSpace = prmSearchSpace.getValue(request) ; float minIndividualRelatedness = prmMinIndividualRelatedness.getValue(request) ; float minAvgRelatedness = prmMinAverageRelatedness.getValue(request) ; RelatednessCache rc = new RelatednessCache(new ArticleComparer(wikipedia)) ; int c=0 ; for (Article suggestion:roughSuggestions) { if (c++ > searchSpace) break ; try { if (suggestion.getType() != PageType.article) continue ; double relatedness = 0 ; for (Article topic:queryTopics.values()) { double r = rc.getRelatedness(topic, suggestion) ; if (r < minIndividualRelatedness) { suggestion = null ; break ; } else { relatedness = relatedness + r ; } } if (suggestion == null) continue ; relatedness = relatedness/queryTopics.size() ; if (relatedness < minAvgRelatedness) continue ; suggestion.setWeight(relatedness) ; refinedSuggestions.add(suggestion) ; } catch (Exception e) { System.out.println(e.getMessage()) ; e.printStackTrace() ; }; } Collections.sort(refinedSuggestions) ; return refinedSuggestions.subList(0, Math.min(refinedSuggestions.size(), prmSuggestionLimit.getValue(request))) ; } private TIntObjectHashMap<SuggestionCategory> getCategoriesById(List<Article> suggestions) { TIntObjectHashMap<SuggestionCategory> categoriesById = new TIntObjectHashMap<SuggestionCategory>() ; for (Article suggestion:suggestions) { for (org.wikipedia.miner.model.Category cat : suggestion.getParentCategories()) { SuggestionCategory category = categoriesById.get(cat.getId()) ; if (category == null) category = new SuggestionCategory(cat) ; category.addSuggestion(suggestion) ; categoriesById.put(cat.getId(), category) ; } } return categoriesById ; } private ArrayList<SuggestionCategory> getSortedCategories(TIntObjectHashMap<SuggestionCategory> categoriesById, TIntHashSet categorizedIds, HttpServletRequest request) { //sort categories according to the weights of the articles they contain, discarding those that are too small ArrayList<SuggestionCategory> weightedCategories = new ArrayList<SuggestionCategory>() ; TIntObjectIterator<SuggestionCategory> iter = categoriesById.iterator() ; while (iter.hasNext()) { iter.advance() ; SuggestionCategory cat = iter.value() ; cat.recalculateWeight() ; //TODO: make configurable? if (cat.getSuggestions().size() > 3 && cat.getWeight() > 1) { weightedCategories.add(cat) ; } } Collections.sort(weightedCategories) ; //reweight categories to ignore weights of duplicate articles TIntHashSet seenIds = new TIntHashSet() ; Iterator<SuggestionCategory> iter2 = weightedCategories.iterator() ; while (iter2.hasNext()) { SuggestionCategory cat = iter2.next() ; for(Article art:cat.getSuggestions()) { if (seenIds.contains(art.getId())) { //this was seen in a higher-ranked category, so don't count it as part of the category weight. cat.ignore(art.getId()) ; } else { seenIds.add(art.getId()) ; } } cat.recalculateWeight() ; } //re-sort refined categories Collections.sort(weightedCategories) ; int maxCategories = prmCategoryLimit.getValue(request) ; ArrayList<SuggestionCategory> refinedCategories = new ArrayList<SuggestionCategory>() ; for (SuggestionCategory cat: weightedCategories){ if (refinedCategories.size() >= maxCategories) break ; if (cat.getWeight() > 1.5 && cat.getNonIgnoredSize() > 2) { // keep this category refinedCategories.add(cat) ; for (Article art:cat.getSuggestions()) categorizedIds.add(art.getId()) ; } } return refinedCategories ; } public class SuggestionCategory extends org.wikipedia.miner.model.Category { private ArrayList<Article> suggestions ; private TIntHashSet idsToIgnore ; public SuggestionCategory(org.wikipedia.miner.model.Category cat) { super(cat.getEnvironment(), cat.getId()) ; suggestions = new ArrayList<Article>() ; idsToIgnore = new TIntHashSet() ; setWeight(0.0) ; } public void addSuggestion(Article article) { suggestions.add(article) ; //weight = weight + article.getWeight() ; } public ArrayList<Article> getSuggestions() { return suggestions ; } public void ignore(int id) { idsToIgnore.add(id) ; } public boolean isIgnored(int id) { return idsToIgnore.contains(id) ; } public int getNonIgnoredSize() { return suggestions.size() - idsToIgnore.size() ; } public double recalculateWeight() { setWeight(0.0) ; int c = 0 ; for (Article a: suggestions) { if (!idsToIgnore.contains(id)) { setWeight(getWeight() + a.getWeight()) ; //only consider weight of top 3 articles? if (c++ >= 3) break ; } } return weight ; } } public static class Message extends Service.Message { @Expose @ElementList(entry="suggestionCategory") private ArrayList<Category> suggestionCategories = new ArrayList<Category>() ; @Expose @ElementList(entry="suggestion") private ArrayList<Suggestion> uncategorizedSuggestions = new ArrayList<Suggestion>(); private Message(HttpServletRequest request) { super(request) ; } private void addCategory(SuggestionCategory cat) { suggestionCategories.add(new Category(cat)) ; } private void setUncategorizedSuggestions(List<Article>allSuggestions, TIntHashSet categorizedIds) { for (Article art:allSuggestions) { if (!categorizedIds.contains(art.getId())) uncategorizedSuggestions.add(new Suggestion(art)) ; } } public List<Category> getSuggestionCategories() { return Collections.unmodifiableList(suggestionCategories); } public List<Suggestion> getUncategorizedSuggestions() { return Collections.unmodifiableList(uncategorizedSuggestions); } } public static class Category { @Expose @Attribute private int id ; @Expose @Attribute private String title ; @Expose @Attribute private double weight ; @Expose @ElementList(entry="suggestion", inline=true) private ArrayList<Suggestion> suggestions ; private Category(SuggestionCategory cat) { id = cat.getId() ; title = cat.getTitle() ; weight = cat.getWeight() ; suggestions = new ArrayList<Suggestion>() ; for (Article art:cat.getSuggestions()) { suggestions.add(new Suggestion(art)) ; } } public int getId() { return id; } public String getTitle() { return title; } public double getWeight() { return weight; } public List<Suggestion> getSuggestions() { return Collections.unmodifiableList(suggestions); } } public static class Suggestion { @Expose @Attribute private int id ; @Expose @Attribute private String title ; @Expose @Attribute private double weight ; private Suggestion(Article art) { id = art.getId() ; title = art.getTitle() ; weight = art.getWeight() ; } public int getId() { return id; } public String getTitle() { return title; } public double getWeight() { return weight; } } }