package org.wikipedia.miner.web.util;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import org.wikipedia.miner.model.Article;
import org.xml.sax.SAXException;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
public class ImageRetriever {
private static String baseUrl = "http://en.wikipedia.org/w/api.php" ;
private WebContentRetriever retriever ;
private Gson gson = new Gson();
private Set<String> bannedImages = new HashSet<String>() ;
public ImageRetriever(WebContentRetriever retriever) {
this.retriever = retriever ;
bannedImages.add("File:Commons-logo.svg") ;
}
public List<String> getImageTitles(Integer articleId) throws UnsupportedEncodingException, IOException {
List<String> imageTitles = new ArrayList<String>() ;
URL url = new URL(baseUrl + "?action=query&pageids=" + articleId + "&prop=images&format=json") ;
String json = retriever.getWebContent(url) ;
Response response = gson.fromJson(json, Response.class) ;
if (response == null)
return imageTitles ;
if (response.query == null)
return imageTitles ;
if (response.query.pages == null)
return imageTitles ;
for (Page page:response.query.pages.values()) {
if (page.images == null)
continue ;
for (Image image:page.images) {
if (bannedImages.contains(image.title))
continue ;
imageTitles.add(image.title) ;
}
}
return imageTitles ;
}
public String getImageUrl(String imageTitle, Integer width, Integer height) throws UnsupportedEncodingException, MalformedURLException, IOException {
String url = baseUrl + "?action=query&titles=" + URLEncoder.encode(imageTitle, "UTF-8") + "&prop=imageinfo&iiprop=url&format=json" ;
if (width != null)
url = url + "&iiurlwidth=" + width ;
if (height != null)
url = url + "&iiurlheight=" + height ;
//System.out.println(url) ;
String json = retriever.getWebContent(new URL(url)) ;
//System.out.println(json);
Response response = gson.fromJson(json, Response.class) ;
if (response == null)
return null ;
if (response.query == null)
return null ;
if (response.query.pages == null)
return null ;
for (Page page:response.query.pages.values()) {
if (page.imageinfo == null)
continue ;
for (ImageInfo imageinfo:page.imageinfo) {
if (imageinfo.thumburl != null)
return imageinfo.thumburl ;
if (imageinfo.url != null)
return imageinfo.url ;
}
}
return null ;
}
public static void main(String args[]) throws ParserConfigurationException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, SAXException {
File conf = new File("../configs/hub.xml") ;
WebContentRetriever wcr = new WebContentRetriever(new HubConfiguration(conf)) ;
ImageRetriever ir = new ImageRetriever(wcr) ;
for(String img:ir.getImageTitles(852)) {
System.out.println(img + ": " + ir.getImageUrl(img, 100, null));
}
}
private static class Response {
public Query query ;
}
private static class Query {
public Map<Integer,Page> pages ;
}
private static class Page {
public int pageid ;
public int ns ;
public String title ;
public List<Image> images ;
public List<ImageInfo> imageinfo ;
}
private static class Image {
public int ns ;
public String title ;
}
private static class ImageInfo {
public String thumburl ;
public int thumbwidth ;
public int thumbheight ;
public String url ;
}
}