package org.wikipedia.miner.web.util;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;

import org.wikipedia.miner.model.Article;
import org.xml.sax.SAXException;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

/**
 * Looks up the images attached to a Wikipedia page, and the URLs of those
 * images, by issuing {@code action=query} requests against the MediaWiki API
 * and decoding the JSON responses with Gson.
 *
 * <p>All HTTP access goes through the {@link WebContentRetriever} supplied at
 * construction time.
 */
public class ImageRetriever {

    /**
     * Endpoint every query is sent to.
     * NOTE(review): this is a plain-http URL; confirm that the retriever copes
     * with whatever redirect the server may answer with.
     */
    private static final String BASE_URL = "http://en.wikipedia.org/w/api.php";

    private final WebContentRetriever retriever;
    private final Gson gson = new Gson();

    /** Image titles that are never reported by {@link #getImageTitles(Integer)}. */
    private final Set<String> bannedImages = new HashSet<String>();

    /**
     * Creates a retriever that fetches API responses through the given
     * content retriever.
     *
     * @param retriever used to download the content of each API request URL
     */
    public ImageRetriever(WebContentRetriever retriever) {
        this.retriever = retriever;
        bannedImages.add("File:Commons-logo.svg");
    }

    /**
     * Returns the titles of the images listed for the page with the given id,
     * leaving out any title in the banned set.
     *
     * <p>NOTE(review): the request sets no {@code imlimit} and no continuation
     * is followed, so only the first batch the API returns is reported;
     * confirm whether that truncation is acceptable.
     *
     * @param articleId id of the page to query; {@code null} yields an empty
     *                  list without any request being made
     * @return the image titles in the order they were decoded; never
     *         {@code null}, empty when the response carries no pages or images
     * @throws UnsupportedEncodingException declared for callers; see {@link IOException}
     * @throws IOException if the request URL is malformed or the content
     *                     cannot be retrieved
     */
    public List<String> getImageTitles(Integer articleId) throws UnsupportedEncodingException, IOException {

        List<String> imageTitles = new ArrayList<String>();

        // Without this guard a null id would be concatenated into the query
        // as the literal text "null".
        if (articleId == null) {
            return imageTitles;
        }

        Map<Integer, Page> pages = fetchPages(
                BASE_URL + "?action=query&pageids=" + articleId + "&prop=images&format=json");
        if (pages == null) {
            return imageTitles;
        }

        for (Page page : pages.values()) {
            if (page.images == null) {
                continue;
            }
            for (Image image : page.images) {
                if (!bannedImages.contains(image.title)) {
                    imageTitles.add(image.title);
                }
            }
        }

        return imageTitles;
    }

    /**
     * Returns a URL for the image with the given title.
     *
     * <p>The first image-info entry that carries a thumbnail URL or a plain
     * URL decides the result; within one entry the thumbnail URL wins.
     *
     * @param imageTitle title of the image, e.g. as returned by
     *                   {@link #getImageTitles(Integer)}; {@code null} yields
     *                   {@code null} without any request being made
     * @param width      value sent as {@code iiurlwidth}, or {@code null} to
     *                   omit the parameter
     * @param height     value sent as {@code iiurlheight}, or {@code null} to
     *                   omit the parameter
     * @return the thumbnail URL or the image URL, or {@code null} when the
     *         response contains neither
     * @throws UnsupportedEncodingException if UTF-8 is not supported when
     *                                      encoding the title
     * @throws MalformedURLException        if the request URL is malformed
     * @throws IOException                  if the content cannot be retrieved
     */
    public String getImageUrl(String imageTitle, Integer width, Integer height)
            throws UnsupportedEncodingException, MalformedURLException, IOException {

        // URLEncoder.encode would fail with a NullPointerException on a null
        // title; treat it like any other title with no URL.
        if (imageTitle == null) {
            return null;
        }

        StringBuilder request = new StringBuilder(BASE_URL);
        request.append("?action=query&titles=").append(URLEncoder.encode(imageTitle, "UTF-8"));
        request.append("&prop=imageinfo&iiprop=url&format=json");
        if (width != null) {
            request.append("&iiurlwidth=").append(width);
        }
        if (height != null) {
            request.append("&iiurlheight=").append(height);
        }

        Map<Integer, Page> pages = fetchPages(request.toString());
        if (pages == null) {
            return null;
        }

        for (Page page : pages.values()) {
            if (page.imageinfo == null) {
                continue;
            }
            for (ImageInfo info : page.imageinfo) {
                if (info.thumburl != null) {
                    return info.thumburl;
                }
                if (info.url != null) {
                    return info.url;
                }
            }
        }

        return null;
    }

    /**
     * Downloads the given API request and decodes the {@code query.pages}
     * part of the JSON response.
     *
     * @param requestUrl complete request URL
     * @return the decoded pages, or {@code null} when the response, its
     *         {@code query} element or its {@code pages} element is absent
     * @throws IOException if the URL is malformed or the content cannot be
     *                     retrieved
     */
    private Map<Integer, Page> fetchPages(String requestUrl) throws IOException {
        String json = retriever.getWebContent(new URL(requestUrl));
        Response response = gson.fromJson(json, Response.class);
        if (response == null || response.query == null) {
            return null;
        }
        return response.query.pages;
    }

    /**
     * Manual smoke test: loads {@code ../configs/hub.xml}, then prints every
     * image title of page 852 together with its URL at width 100.
     */
    public static void main(String args[]) throws ParserConfigurationException, IOException,
            ClassNotFoundException, InstantiationException, IllegalAccessException, SAXException {

        File conf = new File("../configs/hub.xml");
        WebContentRetriever wcr = new WebContentRetriever(new HubConfiguration(conf));
        ImageRetriever ir = new ImageRetriever(wcr);

        for (String img : ir.getImageTitles(852)) {
            System.out.println(img + ": " + ir.getImageUrl(img, 100, null));
        }
    }

    // The classes below are populated reflectively by Gson: each field name
    // must stay identical to the JSON key it is bound to.

    /** Top level of an API response. */
    private static class Response {
        public Query query;
    }

    /** The {@code query} element of a response. */
    private static class Query {
        public Map<Integer, Page> pages;
    }

    /** One entry of {@code query.pages}. */
    private static class Page {
        public int pageid;
        public int ns;
        public String title;
        public List<Image> images;
        public List<ImageInfo> imageinfo;
    }

    /** One entry of a page's {@code images} list. */
    private static class Image {
        public int ns;
        public String title;
    }

    /** One entry of a page's {@code imageinfo} list. */
    private static class ImageInfo {
        public String thumburl;
        public int thumbwidth;
        public int thumbheight;
        public String url;
    }
}