/** * This file is part of General Entity Annotator Benchmark. * * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * General Entity Annotator Benchmark is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>. */ package org.aksw.gerbil.semantic.sameas.impl.wiki; import java.io.IOException; import java.nio.charset.Charset; import java.util.HashSet; import java.util.Set; import org.aksw.gerbil.http.AbstractHttpRequestEmitter; import org.aksw.gerbil.semantic.sameas.SingleUriSameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.SimpleDomainExtractor; import org.aksw.gerbil.utils.WikipediaHelper; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.escape.Escaper; import com.google.common.net.UrlEscapers; public class WikipediaApiBasedSingleUriSameAsRetriever extends AbstractHttpRequestEmitter implements SingleUriSameAsRetriever { private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaApiBasedSingleUriSameAsRetriever.class); private static final String URL_PROTOCOL_PART = "http://"; private static final String URL_QUERY_PART = "/w/api.php?format=xml&action=query&redirects=true&titles="; private static final String CHARSET_NAME = "UTF-8"; private static final Escaper TITLE_ESCAPER = UrlEscapers.urlFormParameterEscaper(); private Charset charset; private WikipediaXMLParser parser = new WikipediaXMLParser(); public WikipediaApiBasedSingleUriSameAsRetriever() { try { charset = Charset.forName(CHARSET_NAME); } catch (Exception e) { charset = Charset.defaultCharset(); } } @Override public Set<String> retrieveSameURIs(String uri) { return retrieveSameURIs(SimpleDomainExtractor.extractDomain(uri), uri); } @Override public Set<String> retrieveSameURIs(String domain, String uri) { if ((domain == null) || (uri == null)) { return null; } String title = WikipediaHelper.getWikipediaTitle(uri); if (title == null) { return null; } String redirectedTitle = queryRedirect(domain, title); if ((redirectedTitle != null) && (!title.equals(redirectedTitle))) { Set<String> uris = new HashSet<String>(); uris.add(uri); uris.add(WikipediaHelper.getWikipediaUri(domain, redirectedTitle)); return uris; } else { return null; } } public String queryRedirect(String domain, String title) { StringBuilder urlBuilder = new StringBuilder(150); urlBuilder.append(URL_PROTOCOL_PART); urlBuilder.append(domain); urlBuilder.append(URL_QUERY_PART); urlBuilder.append(TITLE_ESCAPER.escape(title)); HttpGet request = null; try { request = createGetRequest(urlBuilder.toString()); } catch (IllegalArgumentException e) { LOGGER.error("Got an exception while creating a request querying the wiki api of \"" + domain + "\". Returning null.", e); return null; } CloseableHttpResponse response = null; HttpEntity entity = null; try { response = sendRequest(request); entity = response.getEntity(); return parser.extractRedirect(IOUtils.toString(entity.getContent(), charset)); } catch (Exception e) { LOGGER.error("Got an exception while querying the wiki api of \"" + domain + "\". Returning null.", e); return null; } finally { if (entity != null) { try { EntityUtils.consume(entity); } catch (IOException e1) { } } IOUtils.closeQuietly(response); closeRequest(request); } } }