/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.datahub;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.aksw.gerbil.config.GerbilConfiguration;
import org.aksw.gerbil.dataset.datahub.model.Dataset;
import org.aksw.gerbil.dataset.datahub.model.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
 * Discovers NIF corpora on DataHub.io. On construction it (1) retrieves the
 * set of dataset names carrying all configured tags and (2) for each of those
 * datasets picks a downloadable Turtle resource below a size limit. The result
 * is exposed via {@link #getDataSets()} as a dataset-name → resource-URL map.
 */
public class DatahubNIFLoader {

    private static final Logger LOGGER = LoggerFactory.getLogger(DatahubNIFLoader.class);

    /** Property key of the URL prefix for dataset meta information requests. */
    private static final String DATAHUB_NIF_CORPUS_META_INF_URL_PROPERTY_NAME = "org.aksw.gerbil.datasets.DatahubNIFLoader.metaInfURL";
    /** Property key of the URL prefix for tag-based dataset listing requests. */
    private static final String DATAHUB_TAG_INF_URL_PROPERTY_NAME = "org.aksw.gerbil.datasets.DatahubNIFLoader.tagInfURL";
    /** Property key of the tags a dataset must carry to be considered. */
    private static final String DATAHUB_NEEDED_TAGS_ARRAY_PROPERTY_NAME = "org.aksw.gerbil.datasets.DatahubNIFLoader.corpusTags";

    // FIXME - put the magic number in application.properties
    /** Maximum accepted size (in bytes) of a downloadable corpus resource. */
    private static final long MAX_RESOURCE_CONTENT_LENGTH = 20_000_000L;

    private final RestTemplate rt;
    /** Tags a DataHub dataset must have; empty if the property is missing. */
    private String[] neededTags;
    /** Mapping from dataset name to the URL of its chosen Turtle resource. */
    private Map<String, String> datasets;

    /**
     * Creates the loader and immediately queries DataHub.io for NIF corpora.
     */
    public DatahubNIFLoader() {
        rt = new RestTemplate();
        init();
    }

    /**
     * Reads the configured tags and triggers the two-step discovery. If the
     * tag property is missing, discovery degrades to an empty result instead
     * of failing.
     */
    private void init() {
        neededTags = GerbilConfiguration.getInstance().getStringArray(
                DATAHUB_NEEDED_TAGS_ARRAY_PROPERTY_NAME);
        if (neededTags == null) {
            LOGGER.error("Couldn't load the needed property \"{}\".",
                    DATAHUB_NEEDED_TAGS_ARRAY_PROPERTY_NAME);
            neededTags = new String[0];
        }
        getNIFDataSetsMetaInformation(getNIFDataSets());
    }

    /**
     * Fills {@link #datasets} with one downloadable Turtle resource URL per
     * given dataset name. A resource is accepted if it is smaller than
     * {@link #MAX_RESOURCE_CONTENT_LENGTH}, ends with {@code .ttl} and is not
     * the {@code dataid.ttl} meta file.
     *
     * @param nifDataSets names of DataHub datasets to inspect
     */
    private void getNIFDataSetsMetaInformation(List<String> nifDataSets) {
        datasets = Maps.newHashMap();
        String nifCorpusMetaInfURL = GerbilConfiguration.getInstance().getString(
                DATAHUB_NIF_CORPUS_META_INF_URL_PROPERTY_NAME);
        if (nifCorpusMetaInfURL == null) {
            LOGGER.error("Couldn't load the needed property \"{}\". Aborting.",
                    DATAHUB_NIF_CORPUS_META_INF_URL_PROPERTY_NAME);
            return;
        }
        // go through all datasets tagged with nif
        for (String d : nifDataSets) {
            ResponseEntity<Dataset.Response> entity = rt.getForEntity(nifCorpusMetaInfURL + d,
                    Dataset.Response.class);
            if (!entity.getStatusCode().equals(HttpStatus.OK)) {
                continue;
            }
            Dataset.Response body = entity.getBody();
            if (body == null || body.getResult() == null) {
                // defensive: DataHub answered OK but without a usable payload
                LOGGER.warn("Got an empty meta information response for dataset \"{}\".", d);
                continue;
            }
            // go through the downloadable Resources
            for (Resource r : body.getResult().getResources()) {
                String url = r.getUrl();
                LOGGER.debug("checking {}", url);
                // HEAD request is enough to learn the size without downloading
                HttpHeaders headers = rt.headForHeaders(url);
                long contentLength = headers.getContentLength();
                LOGGER.debug("{} bytes", contentLength);
                // add if less than 20mb ends with ttl (turtle) but not with dataid.ttl (we aint gonna need it yet)
                if (contentLength < MAX_RESOURCE_CONTENT_LENGTH && url.endsWith(".ttl")
                        && !url.endsWith("dataid.ttl")) {
                    LOGGER.debug("{}: {} has less than 20mb and is turtle > add to Dataset", d, url);
                    datasets.put(d, url);
                }
            }
        }
    }

    /**
     * Queries DataHub for each configured tag and intersects the per-tag
     * dataset lists, so only datasets carrying ALL tags are returned.
     *
     * @return names of datasets tagged with every needed tag; empty if the
     *         tag-info URL property is missing or no request succeeded
     */
    private List<String> getNIFDataSets() {
        List<String> result = Lists.newArrayList();
        String taggedCorpusURL = GerbilConfiguration.getInstance().getString(
                DATAHUB_TAG_INF_URL_PROPERTY_NAME);
        if (taggedCorpusURL == null) {
            LOGGER.error("Couldn't load the needed property \"{}\". Aborting.",
                    DATAHUB_TAG_INF_URL_PROPERTY_NAME);
        } else {
            Set<String> requestResult, taggedCorpora = null;
            String[] body;
            for (int i = 0; i < neededTags.length; ++i) {
                try {
                    ResponseEntity<String[]> forEntity = rt.getForEntity(taggedCorpusURL + neededTags[i],
                            String[].class);
                    if (forEntity.getStatusCode().equals(HttpStatus.OK)) {
                        body = forEntity.getBody();
                        if (taggedCorpora == null) {
                            // first tag: start with its full result set
                            taggedCorpora = Sets.newHashSet(body);
                            LOGGER.debug("corpora with \"{}\" tag {}", neededTags[i], taggedCorpora);
                        } else {
                            // further tags: keep only datasets seen for every tag so far
                            requestResult = Sets.newHashSet(body);
                            LOGGER.debug("corpora with \"{}\" tag {}", neededTags[i], requestResult);
                            taggedCorpora = Sets.intersection(taggedCorpora, requestResult);
                        }
                    } else {
                        // BUGFIX: message was missing the {} placeholder, so the
                        // status code argument was silently dropped by SLF4J
                        LOGGER.warn("Couldn't get any datasets with the {} tag from DataHubIO. Status: {}",
                                neededTags[i], forEntity.getStatusCode());
                    }
                } catch (Exception e) {
                    LOGGER.warn("Couldn't get any datasets with the {} tag from DataHubIO. Exception: {}",
                            neededTags[i], e);
                }
            }
            // BUGFIX: the computed intersection was discarded and an always-empty
            // list was returned, leaving the loaded dataset map empty
            if (taggedCorpora != null) {
                result.addAll(taggedCorpora);
            }
        }
        return result;
    }

    /**
     * Small manual smoke test: loads the corpora and prints the result.
     */
    public static void main(String[] args) {
        DatahubNIFLoader d = new DatahubNIFLoader();
        for (Entry<String, String> e : d.datasets.entrySet()) {
            LOGGER.debug("{}: {}", e.getKey(), e.getValue());
        }
    }

    /**
     * @return mapping from dataset name to the URL of its Turtle resource;
     *         empty if discovery failed or nothing matched
     */
    public Map<String, String> getDataSets() {
        return datasets;
    }
}