package org.gba.spritely.sitescrapers;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.ProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectStrategy;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.protocol.HttpContext;
import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class WikimediaCommonsScraper extends DefaultHandler
{
public static List<String> searchWC(String query, boolean removethumbs)
{
WikimediaCommonsScraper scraper = new WikimediaCommonsScraper();
try {
URL u = new URL("http://commons.wikimedia.org/wiki/" + query);
Parser p = new Parser();
// SAXParserFactory factory = SAXParserFactory.newInstance();
// SAXParser saxParser = factory.newSAXParser();
p.setContentHandler(scraper);
HttpClientBuilder clientBuilder = HttpClientBuilder.create();
clientBuilder.setRedirectStrategy(new LaxRedirectStrategy());
HttpClient client = clientBuilder.build();
HttpGet getRequest = new HttpGet(u.toURI());
HttpResponse response = client.execute(getRequest);
p.parse(new InputSource(response.getEntity().getContent()));
}
catch (Exception e) {
// e.printStackTrace();
return Collections.EMPTY_LIST;
}
List<String> res = new ArrayList<String>();
String url = "";
for (int i = 0; i < scraper.urls.size(); i++) {
url = "http:" + (String)scraper.urls.get(i);
if ((removethumbs) && (url.contains("/thumb/"))) {
url = url.replaceAll("/thumb", "");
url = url.substring(0, url.lastIndexOf("/"));
}
if (!url.endsWith(".svg"))
{
res.add(url);
}
}
return res;
}
private List<String> urls;
boolean insideImageStructure;
int imageStructureNestingLevel;
public WikimediaCommonsScraper()
{
this.urls = new ArrayList<String>();
;
insideImageStructure = false;
imageStructureNestingLevel = 0;
}
public boolean isTagWithAttribute(String ttag, String attr, String tval, String tag, Attributes attributes) {
return (tag.equalsIgnoreCase(ttag)) && (attributes.getIndex(attr) > -1) && (attributes.getValue(attributes.getIndex(attr)).equalsIgnoreCase(tval));
}
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if(insideImageStructure)
{
imageStructureNestingLevel++;
if ((qName.equalsIgnoreCase("img")))
this.urls.add(attributes.getValue("src"));
}
else
{
if(attributes.getIndex("class") > -1 && attributes.getValue(attributes.getIndex("class")).equalsIgnoreCase("gallerybox"))
{
insideImageStructure = true;
imageStructureNestingLevel = 1;
}
}
}
public void characters(char[] ch, int st, int len)
throws SAXException
{
}
public void endElement(String uri, String localName, String qName)
throws SAXException
{
if(insideImageStructure)
{
imageStructureNestingLevel--;
}
if(imageStructureNestingLevel <= 0)
{
insideImageStructure = false;
}
}
public void endDocument()
{
}
}
/* Location: /develop/libs/spritely/
* Qualified Name: org.gba.spritely.sitescrapers.WikimediaCommonsScraper
* JD-Core Version: 0.6.2
*/