package org.wikipedia.miner.web.service;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.Text;
import org.wikipedia.miner.comparison.ArticleComparer;
import org.wikipedia.miner.model.Article;
import org.wikipedia.miner.model.Category;
import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.web.util.ImageRetriever;
import org.wikipedia.miner.web.util.UtilityMessages.InvalidIdMessage;
import org.wikipedia.miner.web.util.UtilityMessages.InvalidTitleMessage;
import org.apache.log4j.Logger;
import org.dmilne.xjsf.Service;
import org.dmilne.xjsf.UtilityMessages.ErrorMessage;
import org.dmilne.xjsf.UtilityMessages.ParameterMissingMessage;
import org.dmilne.xjsf.param.BooleanParameter;
import org.dmilne.xjsf.param.EnumParameter;
import org.dmilne.xjsf.param.IntParameter;
import org.dmilne.xjsf.param.ParameterGroup;
import org.dmilne.xjsf.param.StringParameter;
import com.google.gson.annotations.Expose;
import org.dmilne.xjsf.param.IntListParameter;

/**
 * Service that returns details of one or more articles.
 *
 * <p>Articles are selected by exactly one of three parameter groups: a single
 * {@code id}, a list of {@code ids}, or a {@code title}. Optional boolean
 * parameters switch on additional sections of the response (definition,
 * labels, translations, images, parent categories, in-links, out-links and
 * link relatedness).
 *
 * <p>Ids that cannot be resolved to a page, or that resolve to something that
 * is not an article, are reported back in the {@code nullList} and
 * {@code invalidList} of the returned {@link MessageList} instead of failing
 * the whole request.
 */
@SuppressWarnings("serial")
public class ExploreArticleService extends WMService {

    //TODO: modify freebase image request to use article titles rather than ids
    //TODO: if lang is not en, use languageLinks to translate article title to english.

    /** Names of the mutually exclusive parameter groups that select the article(s). */
    private enum GroupName {
        id, title, ids
    }

    /** Available lengths for the definition snippet. */
    public enum DefinitionLength {
        LONG, SHORT
    }

    private static final Logger logger = Logger.getLogger(ExploreArticleService.class);

    private ImageRetriever imageRetriever;

    private ParameterGroup grpId;
    private IntParameter prmId;

    private ParameterGroup grpIds;
    private IntListParameter prmIds;

    private ParameterGroup grpTitle;
    private StringParameter prmTitle;

    private BooleanParameter prmDefinition;
    private EnumParameter<DefinitionLength> prmDefinitionLength;

    private BooleanParameter prmLabels;
    private BooleanParameter prmTranslations;

    private BooleanParameter prmImages;
    private IntParameter prmImageWidth;
    private IntParameter prmImageHeight;

    private BooleanParameter prmParentCategories;

    private BooleanParameter prmInLinks;
    private IntParameter prmInLinkMax;
    private IntParameter prmInLinkStart;

    private BooleanParameter prmOutLinks;
    private IntParameter prmOutLinkMax;
    private IntParameter prmOutLinkStart;

    private BooleanParameter prmLinkRelatedness;

    public ExploreArticleService() {
        super("core", "Provides details of individual articles", "<p></p>", false);
    }

    /**
     * Registers the parameter groups and global parameters of this service and
     * creates the image retriever.
     *
     * @param config the servlet configuration, passed on to the superclass
     * @throws ServletException if the superclass initialisation fails
     */
    @Override
    public void init(ServletConfig config) throws ServletException {
        super.init(config);

        grpId = new ParameterGroup(GroupName.id.name(), "To retrieve an article by id");
        prmId = new IntParameter("id", "The unique identifier of the article to explore", null);
        grpId.addParameter(prmId);
        addParameterGroup(grpId);

        grpIds = new ParameterGroup(GroupName.ids.name(), "To retrieve articles by id");
        prmIds = new IntListParameter("ids", "The list of unique identifiers for the articles to explore", null);
        grpIds.addParameter(prmIds);
        addParameterGroup(grpIds);

        grpTitle = new ParameterGroup(GroupName.title.name(), "To retrieve article by title");
        prmTitle = new StringParameter("title", "The (case sensitive) title of the article to explore", null);
        grpTitle.addParameter(prmTitle);
        addParameterGroup(grpTitle);

        prmDefinition = new BooleanParameter("definition", "<b>true</b> if a snippet definition should be returned, otherwise <b>false</b>", false);
        addGlobalParameter(prmDefinition);

        // Descriptions are given in the same order as DefinitionLength.values(): LONG, SHORT.
        String[] descLength = {"first paragraph", "first sentence"};
        prmDefinitionLength = new EnumParameter<DefinitionLength>("definitionLength", "The required length of the definition", DefinitionLength.SHORT, DefinitionLength.values(), descLength);
        addGlobalParameter(prmDefinitionLength);

        addGlobalParameter(getWMHub().getFormatter().getEmphasisFormatParam());
        addGlobalParameter(getWMHub().getFormatter().getLinkFormatParam());

        prmLabels = new BooleanParameter("labels", "<b>true</b> if labels (synonyms, etc) for this topic are to be returned, otherwise <b>false</b>", false);
        addGlobalParameter(prmLabels);

        prmTranslations = new BooleanParameter("translations", "<b>true</b> if translations (language links) for this topic are to be returned, otherwise <b>false</b>", false);
        addGlobalParameter(prmTranslations);

        prmImages = new BooleanParameter("images", "Whether or not to retrieve relevant image urls from freebase", false);
        addGlobalParameter(prmImages);

        prmImageWidth = new IntParameter("maxImageWidth", "Images can be scaled. This defines their maximum width, in pixels", 150);
        addGlobalParameter(prmImageWidth);

        prmImageHeight = new IntParameter("maxImageHeight", "Images can be scaled. This defines their maximum height, in pixels", 150);
        addGlobalParameter(prmImageHeight);

        prmParentCategories = new BooleanParameter("parentCategories", "<b>true</b> if parent categories of this category should be returned, otherwise <b>false</b>", false);
        addGlobalParameter(prmParentCategories);

        prmInLinks = new BooleanParameter("inLinks", "<b>true</b> if articles that link to this article should be returned, otherwise <b>false</b>", false);
        addGlobalParameter(prmInLinks);

        prmInLinkMax = new IntParameter("inLinkMax", "the maximum number of in-links that should be returned. A max of <b>0</b> will result in all in-links being returned", 250);
        addGlobalParameter(prmInLinkMax);

        prmInLinkStart = new IntParameter("inLinkStart", "the index of the first in-link to return. Combined with <b>inLinkMax</b>, this parameter allows the user to page through large lists of in-links", 0);
        addGlobalParameter(prmInLinkStart);

        prmOutLinks = new BooleanParameter("outLinks", "<b>true</b> if articles that this article links to should be returned, otherwise <b>false</b>", false);
        addGlobalParameter(prmOutLinks);

        prmOutLinkMax = new IntParameter("outLinkMax", "the maximum number of out-links that should be returned. A max of <b>0</b> will result in all out-links being returned", 250);
        addGlobalParameter(prmOutLinkMax);

        prmOutLinkStart = new IntParameter("outLinkStart", "the index of the first out-link to return. Combined with <b>outLinkMax</b>, this parameter allows the user to page through large lists of out-links", 0);
        addGlobalParameter(prmOutLinkStart);

        prmLinkRelatedness = new BooleanParameter("linkRelatedness", "<b>true</b> if the relatedness of in- and out-links should be measured, otherwise <b>false</b>", false);
        addGlobalParameter(prmLinkRelatedness);

        imageRetriever = new ImageRetriever(getWMHub().getRetriever());
    }

    /**
     * Resolves the requested article(s) and builds the response message.
     *
     * @param request the incoming request carrying the service parameters
     * @return a {@link MessageList} with one {@link ArticleMsg} per resolved
     *         article; an {@code ErrorMessage} if relatedness was requested but
     *         no comparer is available; a {@code ParameterMissingMessage} if no
     *         parameter group was specified; or an {@code InvalidTitleMessage}
     *         if the requested title does not resolve to an article
     * @throws Exception if any of the underlying lookups fail
     */
    @Override
    public Service.Message buildWrappedResponse(HttpServletRequest request) throws Exception {

        Wikipedia wikipedia = getWikipedia(request);

        ArticleComparer artComparer = null;
        if (prmLinkRelatedness.getValue(request)) {
            artComparer = getWMHub().getArticleComparer(this.getWikipediaName(request));
            if (artComparer == null) {
                return new ErrorMessage(request, "Relatedness measures are unavailable for this instance of wikipedia");
            }
        }

        ParameterGroup grp = getSpecifiedParameterGroup(request);
        if (grp == null) {
            return new ParameterMissingMessage(request);
        }

        List<Article> articleList = new ArrayList<Article>();
        List<Integer> invalidList = new ArrayList<Integer>();
        List<Integer> nullList = new ArrayList<Integer>();

        switch (GroupName.valueOf(grp.getName())) {
            case id: {
                Integer id = prmId.getValue(request);
                org.wikipedia.miner.model.Page page = wikipedia.getPageById(id);
                if (page == null) {
                    // Unknown id: report it and do NOT touch the page any further.
                    nullList.add(id);
                } else if (hasArticleType(page) && page instanceof Article) {
                    articleList.add((Article) page);
                } else {
                    invalidList.add(id);
                }
                break;
            }
            case ids: {
                Integer[] ids = prmIds.getValue(request);
                for (Integer pageId : ids) {
                    org.wikipedia.miner.model.Page page = wikipedia.getPageById(pageId);
                    if (page == null) {
                        // Unknown id: report it and move on to the next one.
                        nullList.add(pageId);
                    } else if (hasArticleType(page)) {
                        // Only cast when the instance really is an Article; the
                        // page type alone does not guarantee the runtime class.
                        if (page instanceof Article) {
                            articleList.add((Article) page);
                        } else {
                            invalidList.add(pageId);
                        }
                    } else if (page.getType() == org.wikipedia.miner.model.Page.PageType.category) {
                        invalidList.add(pageId);
                    } else {
                        nullList.add(pageId);
                    }
                }
                break;
            }
            case title: {
                String title = prmTitle.getValue(request);
                Article arti = wikipedia.getArticleByTitle(title);
                if (arti == null) {
                    return new InvalidTitleMessage(request, title);
                }
                articleList.add(arti);
                break;
            }
        }

        MessageList msge = new MessageList(request, nullList, invalidList);
        for (Article art : articleList) {
            msge.addArticle(buildArticleMsg(art, request, wikipedia, artComparer));
        }
        return msge;
    }

    /** Returns true if the page's type is one of disambiguation, article or redirect. */
    private static boolean hasArticleType(org.wikipedia.miner.model.Page page) {
        switch (page.getType()) {
            case disambiguation:
            case article:
            case redirect:
                return true;
            default:
                return false;
        }
    }

    /** Builds the message for one article, filling only the sections the request asked for. */
    private ArticleMsg buildArticleMsg(Article art, HttpServletRequest request, Wikipedia wikipedia,
            ArticleComparer artComparer) throws Exception {

        ArticleMsg msg = new ArticleMsg(art);

        if (prmDefinition.getValue(request)) {
            String definition;
            if (prmDefinitionLength.getValue(request) == DefinitionLength.SHORT) {
                definition = art.getSentenceMarkup(0);
            } else {
                definition = art.getFirstParagraphMarkup();
            }
            msg.setDefinition(getWMHub().getFormatter().format(definition, request, wikipedia));
        }

        if (prmLabels.getValue(request)) {
            Article.Label[] labels = art.getLabels();

            // Accumulate in a long: the per-label counts are longs, and "+=" on
            // an int would silently narrow (and possibly overflow) the sum.
            long total = 0;
            for (Article.Label lbl : labels) {
                total += lbl.getLinkOccCount();
            }

            for (Article.Label lbl : labels) {
                if (lbl.getLinkOccCount() > 0) {
                    msg.addLabel(new Label(lbl, total));
                }
            }
        }

        if (prmTranslations.getValue(request)) {
            TreeMap<String, String> translations = art.getTranslations();
            for (Map.Entry<String, String> entry : translations.entrySet()) {
                msg.addTranslation(new Translation(entry.getKey(), entry.getValue()));
            }
        }

        if (prmImages.getValue(request)) {
            int width = prmImageWidth.getValue(request);
            int height = prmImageHeight.getValue(request);
            try {
                for (String imgTitle : imageRetriever.getImageTitles(art.getId())) {
                    String imgUrl = imageRetriever.getImageUrl(imgTitle, width, height);
                    if (imgUrl != null) {
                        msg.addImage(new Image(imgUrl));
                    }
                }
            } catch (Exception e) {
                // Images are best-effort: a failed lookup must not fail the
                // whole response, but it should be visible in the log.
                logger.warn("Could not retrieve images for article " + art.getId(), e);
            }
        }

        if (prmParentCategories.getValue(request)) {
            Category[] parents = art.getParentCategories();
            logger.info("retrieving parents from " + parents.length + " total");
            msg.setTotalParentCategories(parents.length);
            for (Category parent : parents) {
                msg.addParentCategory(new Page(parent));
            }
        }

        if (prmOutLinks.getValue(request)) {
            int start = clampStart(prmOutLinkStart.getValue(request));
            int max = exclusiveUpperBound(start, prmOutLinkMax.getValue(request));

            Article[] linksOut = art.getLinksOut();
            logger.info("retrieving out links [" + start + "," + max + "] from " + linksOut.length + " total");
            msg.setTotalOutLinks(linksOut.length);

            for (Page p : linkPages(art, linksOut, start, max, artComparer)) {
                msg.addOutLink(p);
            }
        }

        if (prmInLinks.getValue(request)) {
            int start = clampStart(prmInLinkStart.getValue(request));
            int max = exclusiveUpperBound(start, prmInLinkMax.getValue(request));

            Article[] linksIn = art.getLinksIn();
            logger.info("retrieving in links [" + start + "," + max + "] from " + linksIn.length + " total");
            msg.setTotalInLinks(linksIn.length);

            for (Page p : linkPages(art, linksIn, start, max, artComparer)) {
                msg.addInLink(p);
            }
        }

        return msg;
    }

    /** Clamps a requested paging start to zero so it can never index before the array. */
    private static int clampStart(int start) {
        return Math.max(0, start);
    }

    /**
     * Computes the exclusive end index of a page of links. A {@code max} of zero
     * or less means "no limit". The sum is computed as a long and saturated so
     * that a large {@code start + max} cannot wrap around to a negative bound.
     */
    private static int exclusiveUpperBound(int start, int max) {
        if (max <= 0) {
            return Integer.MAX_VALUE;
        }
        long bound = (long) start + max;
        return bound > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) bound;
    }

    /**
     * Converts the links in {@code [start, upper)} (bounded by the array length)
     * into response pages, attaching relatedness to {@code art} when a comparer is given.
     */
    private static List<Page> linkPages(Article art, Article[] links, int start, int upper,
            ArticleComparer artComparer) throws Exception {

        List<Page> pages = new ArrayList<Page>();
        for (int i = start; i < upper && i < links.length; i++) {
            Page p = new Page(links[i]);
            if (artComparer != null) {
                p.setRelatedness(artComparer.getRelatedness(art, links[i]));
            }
            pages.add(p);
        }
        return pages;
    }

    /**
     * Response wrapper: the resolved articles plus the ids that could not be
     * resolved ({@code nullList}) or did not refer to an article ({@code invalidList}).
     */
    public static class MessageList extends Service.Message {

        @Expose
        @ElementList(required = true, entry = "invalidList")
        private List<Integer> invalidList = null;

        // NOTE: the entry name "articleLis" is misspelled but is part of the
        // emitted XML; it is kept as-is so existing clients keep working.
        @Expose
        @ElementList(required = true, entry = "articleLis")
        private List<ArticleMsg> articleList = null;

        @Expose
        @ElementList(required = true, entry = "nullList")
        private List<Integer> nullList = null;

        private MessageList(HttpServletRequest request, List<Integer> nullList, List<Integer> invalidList) {
            super(request);
            this.invalidList = invalidList;
            this.nullList = nullList;
            articleList = new ArrayList<ArticleMsg>();
        }

        private void addArticle(ArticleMsg arti) {
            articleList.add(arti);
        }
    }

    /**
     * Details of a single article. Optional sections stay {@code null} until
     * something is added to them; the list getters return an empty
     * unmodifiable list in that case.
     */
    public static class ArticleMsg {

        @Expose
        @Attribute
        private final int id;

        @Expose
        @Attribute
        private final String title;

        @Expose
        @Element(required = false, data = true)
        private String definition;

        @Expose
        @ElementList(required = false, entry = "image")
        private ArrayList<Image> images = null;

        @Expose
        @ElementList(required = false, entry = "label")
        private ArrayList<Label> labels = null;

        // NOTE: the entry name "tranlation" is misspelled but is part of the
        // emitted XML; it is kept as-is so existing clients keep working.
        @Expose
        @ElementList(required = false, entry = "tranlation")
        private ArrayList<Translation> translations = null;

        @Expose
        @ElementList(required = false, entry = "parentCategory")
        private ArrayList<Page> parentCategories = null;

        @Expose
        @Attribute(required = false)
        private Integer totalParentCategories;

        @Expose
        @ElementList(required = false, entry = "inLink")
        private ArrayList<Page> inLinks = null;

        @Expose
        @Attribute(required = false)
        private Integer totalInLinks;

        @Expose
        @ElementList(required = false, entry = "outLink")
        private ArrayList<Page> outLinks = null;

        @Expose
        @Attribute(required = false)
        private Integer totalOutLinks;

        private ArticleMsg(Article art) {
            this.id = art.getId();
            this.title = art.getTitle();
        }

        private void setDefinition(String markup) {
            this.definition = markup;
        }

        private void addImage(Image image) {
            if (images == null) {
                images = new ArrayList<Image>();
            }
            images.add(image);
        }

        private void addLabel(Label label) {
            if (labels == null) {
                labels = new ArrayList<Label>();
            }
            labels.add(label);
        }

        private void addTranslation(Translation t) {
            if (translations == null) {
                translations = new ArrayList<Translation>();
            }
            translations.add(t);
        }

        private void addParentCategory(Page p) {
            if (parentCategories == null) {
                parentCategories = new ArrayList<Page>();
            }
            parentCategories.add(p);
        }

        private void setTotalParentCategories(int total) {
            totalParentCategories = total;
        }

        private void addInLink(Page p) {
            if (inLinks == null) {
                inLinks = new ArrayList<Page>();
            }
            inLinks.add(p);
        }

        private void setTotalInLinks(int total) {
            totalInLinks = total;
        }

        private void addOutLink(Page p) {
            if (outLinks == null) {
                outLinks = new ArrayList<Page>();
            }
            outLinks.add(p);
        }

        private void setTotalOutLinks(int total) {
            totalOutLinks = total;
        }

        /** Read-only view of a possibly unset list; never returns null. */
        private static <T> List<T> readOnly(List<T> list) {
            if (list == null) {
                return Collections.<T>emptyList();
            }
            return Collections.unmodifiableList(list);
        }

        public int getId() {
            return id;
        }

        public String getTitle() {
            return title;
        }

        public String getDefinition() {
            return definition;
        }

        public List<Image> getImages() {
            return readOnly(images);
        }

        public List<Label> getLabels() {
            return readOnly(labels);
        }

        public List<Translation> getTranslations() {
            return readOnly(translations);
        }

        public List<Page> getParentCategories() {
            return readOnly(parentCategories);
        }

        public Integer getTotalParentCategories() {
            return totalParentCategories;
        }

        public List<Page> getInLinks() {
            return readOnly(inLinks);
        }

        public Integer getTotalInLinks() {
            return totalInLinks;
        }

        public List<Page> getOutLinks() {
            return readOnly(outLinks);
        }

        public Integer getTotalOutLinks() {
            return totalOutLinks;
        }
    }

    /** An image, identified by its url. */
    public static class Image {

        @Expose
        @Attribute
        private final String url;

        private Image(String url) {
            this.url = url;
        }

        public String getUrl() {
            return url;
        }
    }

    /**
     * A label of an article, with its link occurrence count and that count as
     * a proportion of the total passed to the constructor.
     */
    public static class Label {

        @Expose
        @Attribute
        private final String text;

        @Expose
        @Attribute
        private final long occurrances;

        @Expose
        @Attribute
        private final double proportion;

        @Expose
        @Attribute
        private final boolean isPrimary;

        @Expose
        @Attribute
        private final boolean fromRedirect;

        @Expose
        @Attribute
        private final boolean fromTitle;

        private Label(Article.Label lbl, long totalOccurrances) {
            text = lbl.getText();
            occurrances = lbl.getLinkOccCount();
            proportion = (double) occurrances / totalOccurrances;
            isPrimary = lbl.isPrimary();
            fromRedirect = lbl.isFromRedirect();
            fromTitle = lbl.isFromTitle();
        }

        public String getText() {
            return text;
        }

        public long getOccurrances() {
            return occurrances;
        }

        public double getProportion() {
            return proportion;
        }

        public boolean isPrimary() {
            return isPrimary;
        }

        public boolean isFromRedirect() {
            return fromRedirect;
        }

        public boolean isFromTitle() {
            return fromTitle;
        }
    }

    /** A translation of an article: a language key and the translated text. */
    public static class Translation {

        @Expose
        @Attribute
        private String lang;

        @Expose
        @Text(data = true)
        private String text;

        private Translation(String lang, String text) {
            this.lang = lang;
            this.text = text;
        }

        public String getLang() {
            return lang;
        }

        public String getText() {
            return text;
        }
    }

    /** A reference to a page (id and title), optionally with a relatedness score. */
    public static class Page {

        @Expose
        @Attribute
        private final int id;

        @Expose
        @Attribute
        private final String title;

        @Expose
        @Attribute(required = false)
        private Double relatedness;

        protected Page(org.wikipedia.miner.model.Page p) {
            this.id = p.getId();
            this.title = p.getTitle();
        }

        protected void setRelatedness(double relatedness) {
            this.relatedness = relatedness;
        }

        public int getId() {
            return id;
        }

        public String getTitle() {
            return title;
        }

        public Double getRelatedness() {
            return relatedness;
        }
    }
}