package org.wikibrain.core.dao.live; /** * utility class used by LiveAPI DAOs to query the wikipedia server and retrieve results as a list of QueryReply objects * author: derian */ import org.apache.commons.io.IOUtils; import org.wikibrain.core.WikiBrainException; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.lang.Language; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class LiveAPIQuery { private final Language lang; private final String outputFormat = "json"; //only JSON currently supported private final String queryAction; private final String queryType; private final String queryInfoPrefix; //prefix before params in URL string specifying what info should be returned private final String queryLimitPrefix; //prefix before "limit" and "continue" params in URL string //usually the same as queryInfoPrefix, but not in the case of prop and generator queries private final Boolean pluralPage; //if true, query URL string must contain plural version of pageid, title, etc as a param private final String queryResultDataSection; //section of the query result containing the data of interest private QueryParser parser = new QueryParser(); private Boolean redirects; private List<String> titles; private List<Integer> pageids; private String filterredir; private String from; private Integer namespace; private String prop = null; //only used in all-links queries, ensures that ids and titles of links will be returned private String queryUrl; private String queryResult = ""; //text representing the raw output of the query private LiveAPIQuery(LiveAPIQueryBuilder builder) { this.lang = builder.lang; if (builder.redirects != null) { this.redirects = builder.redirects; } if (builder.titles != null) { this.titles = builder.titles; } if (builder.pageids != null) { this.pageids = builder.pageids; } if (builder.filterredir != null) { this.filterredir = builder.filterredir; } if (builder.from != null) { this.from = builder.from; } if (builder.namespace != null) { this.namespace = builder.namespace; } // set parameters for the URL string according to the query type switch (builder.queryType) { case 0: //INFO: this.queryAction = "prop"; this.queryType = "info"; this.queryInfoPrefix = ""; this.queryLimitPrefix = "in"; this.pluralPage = true; this.queryResultDataSection = "pages"; break; case 1: //CATEGORYMEMBERS: this.queryAction = "list"; this.queryType = "categorymembers"; this.queryInfoPrefix = "cm"; this.queryLimitPrefix = "cm"; this.pluralPage = false; this.queryResultDataSection = "categorymembers"; break; case 2: //CATEGORIES: this.queryAction = "generator"; this.queryType = "categories"; this.queryInfoPrefix = ""; this.queryLimitPrefix = "gcl"; this.pluralPage = true; this.queryResultDataSection = "pages"; break; case 3: //LINKS: this.queryAction = "generator"; this.queryType = "links"; this.queryInfoPrefix = ""; this.queryLimitPrefix = "gpl"; this.pluralPage = true; this.queryResultDataSection = "pages"; break; case 4: //BACKLINKS: this.queryAction = "list"; this.queryType = "backlinks"; this.queryInfoPrefix = "bl"; this.queryLimitPrefix = "bl"; this.pluralPage = false; this.queryResultDataSection = "backlinks"; break; case 5: //ALLPAGES this.queryAction = "list"; this.queryType = "allpages"; this.queryInfoPrefix = "ap"; this.queryLimitPrefix = "ap"; this.pluralPage = false; this.queryResultDataSection = "allpages"; break; default: //ALLLINKS this.queryAction = "list"; this.queryType = "alllinks"; this.queryInfoPrefix = "al"; this.queryLimitPrefix = "al"; this.pluralPage = false; this.queryResultDataSection = "alllinks"; this.prop = "ids|title"; break; } constructQueryUrl(); } public void constructQueryUrl() { String http = "http://"; String host = ".wikipedia.org"; String queryUrl = http + lang.getLangCode() + host + "/w/api.php?action=query&format=" + outputFormat + "&" + queryAction + "=" + queryType + "&" + queryLimitPrefix + "limit=500"; if (!this.titles.isEmpty()) { queryUrl += "&" + queryInfoPrefix + "title" + (pluralPage ? "s" : "") + "=" + titles.get(0); for (int i = 1; i < titles.size(); i++) { queryUrl += "|" + titles.get(i); } } if (!this.pageids.isEmpty()) { queryUrl += "&" + queryInfoPrefix + "pageid" + (pluralPage ? "s" : "") + "=" + pageids.get(0); for (int i = 1; i < pageids.size(); i++) { queryUrl += "|" + pageids.get(i); } } //if redirects is true, resolve redirects in the query result if ((this.redirects != null) && this.redirects) { queryUrl += "&redirects="; } //specify whether to return redirects, non-redirects, or both in the query result //default is both if (this.filterredir != null) { queryUrl += "&" + queryInfoPrefix + "filterredir" + "=" + filterredir; } if (this.from != null) { queryUrl += "&" + queryInfoPrefix + "from" + "=" + from; } if (this.namespace != null) { queryUrl += "&" + queryInfoPrefix + "namespace" + "=" + namespace; } if (this.prop != null) { queryUrl += "&" + queryInfoPrefix + "prop" + "=" + prop; } this.queryUrl = queryUrl; } /** * method used by client DAOs to retrieve a list of QueryReplies representing the values of interest returned by the query * @return QueryReply list containing the values of interest * @throws DaoException */ public List<QueryReply> getValuesFromQueryResult() throws DaoException { List<QueryReply> values = new ArrayList<QueryReply>(); String queryContinue = ""; boolean hasContinue; do { //make query and set this.queryResult to the resulting text getRawQueryText(queryUrl + queryContinue); //parse the queryResult and add the resulting QueryReply objects to values parser.getQueryReturnValues(lang, queryResult, queryResultDataSection, values); /* * Determine whether or not the query result contained continue info, meaning there were too many * values to return in one query * If so, continue parsing by adding the continue info to the URL string */ queryContinue = parser.getContinue(queryResult, queryType, queryLimitPrefix); hasContinue = (!queryContinue.equals("")); queryContinue = "&" + queryLimitPrefix + "continue=" + queryContinue; } while (hasContinue); return values; } /** * queries the wikipedia server for text output that can be parsed to create a wikibrain data object * sets the class attribute queryResult to the value of this raw output * @return * @throws org.wikibrain.core.dao.DaoException */ private void getRawQueryText(String url) throws DaoException { String info = new String(); InputStream inputStr; try{ inputStr = new URL(url).openStream(); try { info = IOUtils.toString(inputStr); } catch(Exception e){ throw new DaoException("Error parsing LiveDao query URL"); } finally { IOUtils.closeQuietly(inputStr); } } catch(Exception e){ throw new DaoException("Error getting page from the Wikipedia Server (Check your internet connection) "); } queryResult = info; } //Builder used by client DAOs to create instances of LiveAPIQuery public static class LiveAPIQueryBuilder { private final Language lang; //private final QueryType queryType; private final Integer queryType; private Boolean redirects; private List<String> titles = new ArrayList<String>(); private List<Integer> pageids = new ArrayList<Integer>(); private String filterredir; private String from; private Integer namespace; private Map<String, Integer> queryTypeMap = new HashMap<String, Integer>(); public LiveAPIQueryBuilder(String queryType, Language lang) { initQueryTypeMap(); this.queryType = queryTypeMap.get(queryType); this.lang = lang; } private void initQueryTypeMap() { queryTypeMap.put("INFO", 0); queryTypeMap.put("CATEGORYMEMBERS", 1); queryTypeMap.put("CATEGORIES", 2); queryTypeMap.put("LINKS", 3); queryTypeMap.put("BACKLINKS", 4); queryTypeMap.put("ALLPAGES", 5); queryTypeMap.put("ALLLINKS", 6); } public LiveAPIQueryBuilder setRedirects(Boolean redirects) { this.redirects = redirects; return this; } public LiveAPIQueryBuilder setTitles(List<String> titles) { this.titles = titles; return this; } public LiveAPIQueryBuilder setPageids(List<Integer> pageids) { this.pageids = pageids; return this; } public LiveAPIQueryBuilder addTitle(String title) { this.titles.add(title); return this; } public LiveAPIQueryBuilder addPageid(int pageid) { this.pageids.add(pageid); return this; } public LiveAPIQueryBuilder setFilterredir(String filterredir) { this.filterredir = filterredir; return this; } public LiveAPIQueryBuilder setFrom(String from) { this.from = from; return this; } public LiveAPIQueryBuilder setNamespace(int namespace) { this.namespace = namespace; return this; } public LiveAPIQuery build() { return new LiveAPIQuery(this); } } }