package org.wikibrain.webapi; import gnu.trove.map.TIntDoubleMap; import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import org.apache.commons.cli.*; import org.eclipse.jetty.server.Connector; import org.eclipse.jetty.server.Request; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.ServerConnector; import org.eclipse.jetty.server.handler.AbstractHandler; import org.eclipse.jetty.util.thread.QueuedThreadPool; import org.jooq.tools.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.DefaultOptionBuilder; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.EnvBuilder; import org.wikibrain.core.dao.*; import org.wikibrain.core.lang.Language; import org.wikibrain.core.model.LocalLink; import org.wikibrain.core.model.LocalPage; import org.wikibrain.core.model.NameSpace; import org.wikibrain.sr.SRMetric; import org.wikibrain.sr.SRResult; import org.wikibrain.sr.SRResultList; import org.wikibrain.sr.wikify.Wikifier; import org.wikibrain.utils.WpCollectionUtils; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import java.io.IOException; import java.util.*; /** * @author Shilad Sen */ public class WikiBrainServer extends AbstractHandler { private static final Logger LOG = LoggerFactory.getLogger(WikiBrainServer.class); private final Env env; private final LocalPageDao pageDao; private final LocalLinkDao linkDao; private final LocalCategoryMemberDao catDao; private WebEntityParser entityParser; public WikiBrainServer(Env env) throws ConfigurationException, DaoException { this.env = env; this.entityParser = new WebEntityParser(env); this.pageDao = env.getConfigurator().get(LocalPageDao.class); this.linkDao = env.getConfigurator().get(LocalLinkDao.class); this.catDao = env.getConfigurator().get(LocalCategoryMemberDao.class); // Warm up necessary components for (Language l : env.getLanguages()) { LOG.info("warming up components for language: " + l); getSr(l); env.getConfigurator().get(Wikifier.class, "websail", "language", l.getLangCode()); } LOG.info("warming up pagerank"); LocalPage p = pageDao.get(new DaoFilter().setLimit(1)).iterator().next(); linkDao.getPageRank(p.toLocalId()); } @Override public void handle(String target, Request request, HttpServletRequest httpServletRequest, HttpServletResponse httpServletResponse) throws IOException, ServletException { WikiBrainWebRequest req = new WikiBrainWebRequest(target, request, httpServletRequest, httpServletResponse); LOG.info("received request for {}, URL {}?{}", target, request.getRequestURL(), request.getQueryString()); try { // TODO: add logging if (target.equals("/languages")) { doLanguages(req); } else if (target.equals("/similarity")) { doSimilarity(req); } else if (target.equals("/cosimilarity")) { throw new UnsupportedOperationException(); } else if (target.equals("/mostSimilar")) { doMostSimilar(req); } else if (target.equals("/wikify")) { doWikify(req); } else if (target.equals("/pageRank")) { doPageRank(req); } else if (target.equals("/articlesInCategory")) { doArticlesInCategory(req); } else if (target.equals("/categoriesForArticle")) { doCategoriesForArticle(req); } } catch (WikiBrainWebException e) { req.writeError(e); } catch (ConfigurationException e) { req.writeError(e); } catch (DaoException e) { req.writeError(e); } } private void doLanguages(WikiBrainWebRequest req) { List<String> langs = new ArrayList<String>(); for (Language l : env.getLanguages()) { langs.add(l.getLangCode()); } Collections.sort(langs); req.writeJsonResponse("languages", langs); } private SRMetric getSr(Language lang) throws ConfigurationException { return env.getConfigurator().get(SRMetric.class, "simple-ensemble", "language", lang.getLangCode()); } private void doSimilarity(WikiBrainWebRequest req) throws ConfigurationException, DaoException { // TODO: support explanations Language lang = req.getLanguage(); List<WebEntity> entities = entityParser.extractEntityList(req); if (entities.size() != 2) { throw new WikiBrainWebException("Similarity requires exactly two entities"); } WebEntity entity1 = entities.get(0); WebEntity entity2 = entities.get(1); SRMetric sr = getSr(lang); SRResult r = null; switch (entity1.getType()) { case ARTICLE_ID: case TITLE: r = sr.similarity(entity1.getArticleId(), entity2.getArticleId(), false); break; case PHRASE: r = sr.similarity(entity1.getPhrase(), entity2.getPhrase(), false); break; default: throw new WikiBrainWebException("Unsupported entity type: " + entity1.getType()); } Double sim = (r != null && r.isValid()) ? r.getScore() : null; req.writeJsonResponse("score", sim, "entity1", entity1.toJson(), "entity2", entity2.toJson()); } private void doMostSimilar(WikiBrainWebRequest req) throws DaoException, ConfigurationException { Language lang = req.getLanguage(); WebEntity entity = entityParser.extractEntity(req); int n = Integer.valueOf(req.getParam("n", "10")); SRMetric sr = getSr(lang); SRResultList results; switch (entity.getType()) { case ARTICLE_ID: case TITLE: results = sr.mostSimilar(entity.getArticleId(), n); break; case PHRASE: results = sr.mostSimilar(entity.getPhrase(), n); break; default: throw new WikiBrainWebException("Unsupported entity type: " + entity.getType()); } List jsonResults = new ArrayList(); for (SRResult r : results) { LocalPage page = pageDao.getById(lang, r.getId()); Map obj = new HashMap(); obj.put("articleId", r.getId()); obj.put("score", r.getScore()); obj.put("lang", lang.getLangCode()); obj.put("title", page == null ? "Unknown" : page.getTitle().getCanonicalTitle()); jsonResults.add(obj); } req.writeJsonResponse("results", jsonResults); } private void doPageRank(WikiBrainWebRequest req) throws ConfigurationException, DaoException { Language lang = req.getLanguage(); WebEntity entity = entityParser.extractEntity(req); if (entity.getArticleId() < 0) { throw new WikiBrainWebException("articleId or title parameter required."); } int id = entity.getArticleId(); double pageRank = linkDao.getPageRank(lang, id); req.writeJsonResponse( "article", pageJson(lang, id), "pageRank", pageRank ); } private void doCategoriesForArticle(WikiBrainWebRequest req) throws ConfigurationException, DaoException { Language lang = req.getLanguage(); WebEntity entity = entityParser.extractEntity(req); if (entity.getArticleId() < 0) { throw new WikiBrainWebException("articleId or title parameter required."); } Set<LocalPage> candidates = extractCategories(req, lang); boolean weighted = Boolean.valueOf(req.getParam("weighted", "true")); TIntDoubleMap distances = catDao.getCategoryDistances(candidates, entity.getArticleId(), weighted); List distanceJson = new ArrayList(); for (int catId : WpCollectionUtils.sortMapKeys(distances, false)) { Map articleJson = pageJson(lang, catId); articleJson.put("distance", distances.get(catId)); distanceJson.add(articleJson); } req.writeJsonResponse( "article", entity.toJson(), "distances", distanceJson ); } private Set<LocalPage> extractCategories(WikiBrainWebRequest req, Language lang) throws DaoException { Set<LocalPage> candidates = new HashSet<LocalPage>(); if (req.hasParam("categoryIds")) { String ids[] = req.getParam("categoryIds").split("\\|"); for (String sid : ids) { LocalPage p = pageDao.getById(lang, Integer.valueOf(sid)); if (p == null) throw new WikiBrainWebException("No " + lang + " article loaded with id " + sid); candidates.add(p); } } else if (req.hasParam("categoryTitles")) { String titles[] = req.getParam("categoryTitles").split("\\|"); for (String t : titles) { LocalPage p = pageDao.getByTitle(lang, t); if (p == null) throw new WikiBrainWebException("No " + lang + " article loaded with title " + t); candidates.add(p); } } else { candidates = catDao.guessTopLevelCategories(lang); if (candidates == null || candidates.isEmpty()) { throw new WikiBrainWebException("No candidates specified and no top-level categories found."); } } return candidates; } private void doArticlesInCategory(WikiBrainWebRequest req) throws DaoException { Language lang = req.getLanguage(); TIntSet pageIds = null; LocalPage target; if (req.hasParam("targetCategoryId")) { target = pageDao.getById(lang, Integer.valueOf(req.getParam("targetCategoryId"))); } else if (req.hasParam("targetCategoryTitle")) { target = pageDao.getByTitle(lang, NameSpace.CATEGORY, req.getParam("targetCategoryTitle")); } else { throw new WikiBrainWebException("Either targetCategoryId or targetCategoryTitle must be specified"); } if (req.hasParam("titles")) { pageIds = new TIntHashSet(); for (String t : req.getParam("titles").split("\\|")) { int id = pageDao.getIdByTitle(t, lang, NameSpace.ARTICLE); if (id < 0) { throw new WikiBrainWebException("No " + lang + " article loaded with title " + t); } pageIds.add(id); } } else if (req.hasParam("articleIds")) { for (String id : req.getParam("articleIds").split("\\|")) { pageIds.add(Integer.valueOf(id)); } } Set<LocalPage> candidates = extractCategories(req, lang); if (!candidates.contains(target)) { throw new WikiBrainWebException("target category " + target + " not contained in [" + StringUtils.join(candidates) + "]"); } boolean weighted = Boolean.valueOf(req.getParam("weighted", "true")); Map<LocalPage, TIntDoubleMap> distances = catDao.getClosestCategories(candidates, pageIds, weighted); final List distanceJson = new ArrayList(); if (distances.containsKey(target)) { for (int pageId : WpCollectionUtils.sortMapKeys(distances.get(target), false)) { Map json = pageJson(lang, pageId); json.put("distance", distances.get(target).get(pageId)); distanceJson.add(json); } } req.writeJsonResponse( "category", pageJson(target), "distances", distanceJson ); } private Map pageJson(LocalPage p) { if (p == null) { return null; } Map json = new HashMap(); json.put("articleId", p.getLocalId()); json.put("title", p.getTitle().getCanonicalTitle()); json.put("lang", p.getLanguage().getLangCode()); return json; } private Map pageJson(Language lang, int pageId) throws DaoException { return pageJson(pageDao.getById(lang, pageId)); } private void doWikify(WikiBrainWebRequest req) throws ConfigurationException, DaoException { Language lang = req.getLanguage(); Wikifier wf = env.getConfigurator().get(Wikifier.class, "websail", "language", lang.getLangCode()); String text = req.getParamOrDie("text"); List jsonConcepts = new ArrayList(); for (LocalLink ll : wf.wikify(text)) { LocalPage page = pageDao.getById(lang, ll.getDestId()); Map obj = new HashMap(); obj.put("index", ll.getLocation()); obj.put("text", ll.getAnchorText()); obj.put("lang", lang.getLangCode()); obj.put("articleId", ll.getDestId()); obj.put("title", page == null ? "Unknown" : page.getTitle().getCanonicalTitle()); jsonConcepts.add(obj); } req.writeJsonResponse("text", text, "references", jsonConcepts); } public static void main(String args[]) throws Exception { Options options = new Options(); options.addOption( new DefaultOptionBuilder() .withLongOpt("port") .withDescription("Server port number") .create("p")); options.addOption( new DefaultOptionBuilder() .withLongOpt("listeners") .withDescription("Size of listener queue") .create("q")); EnvBuilder.addStandardOptions(options); CommandLineParser parser = new PosixParser(); CommandLine cmd; try { cmd = parser.parse(options, args); } catch (ParseException e) { System.err.println( "Invalid option usage: " + e.getMessage()); new HelpFormatter().printHelp("DumpLoader", options); return; } Env env = new EnvBuilder(cmd).build(); int port = Integer.valueOf(cmd.getOptionValue("p", "8000")); int queueSize = Integer.valueOf(cmd.getOptionValue("q", "100")); Server server = new Server(new QueuedThreadPool(queueSize, 20)); server.setHandler(new WikiBrainServer(env)); ServerConnector sc = new ServerConnector(server); sc.setPort(port); server.setConnectors(new Connector[]{sc}); server.start(); server.join(); } }