package org.dbpedia.mappings.missingbot.label; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Iterable that list all titles of missing labels in the mappings wiki ("http://mappings.dbpedia.org") * of the dbpedia project. */ public class AllMissingLabelTitles implements Iterable<String> { /** * URL there the links for Articles with missing Labels are listed. */ private final String url; /** * Filter for specific titles. Options: OntologyClass, OntologyProperty and Datatype */ private final String filter; /** * Creates an iterator over all titles * * @param language language code for mapping wiki * @param filter filtering titles by category. Options: OntologyClass, OntologyProperty and Datatype */ public AllMissingLabelTitles(String language, String filter) { this.url = String.format("http://mappings.dbpedia.org/server/ontology/labels/missing/%s/", language); this.filter = filter; } /** * Parsing the links from the missing label url. * * @return list of links of articles where labels are missing. */ private ArrayList<String> getMissingLinks() { ArrayList<String> links = new ArrayList<String>(); String line; StringBuilder builder = new StringBuilder(); HttpClient httpclient = new DefaultHttpClient(); HttpGet httpget = new HttpGet(this.url); try { HttpResponse response = httpclient.execute(httpget); HttpEntity entity = response.getEntity(); if (entity != null) { InputStream instream = entity.getContent(); try { BufferedReader rd = new BufferedReader(new InputStreamReader(instream)); while ((line = rd.readLine()) != null) { builder.append(line); builder.append("\n"); } } finally { instream.close(); } } } catch (IOException e) { e.printStackTrace(); } String pattern = "href=\"(.*?)\""; Matcher m = Pattern.compile(pattern).matcher(builder.toString()); while(m.find()) { links.add(m.group(1)); } return links; } /** * Iterator over all missing label titles. * * @return list of titles */ public Iterator<String> iterator() { ArrayList<String> missingLinks = getMissingLinks(); Collection<String> missingTitles = new ArrayList<String>(); for (String link : missingLinks) { String[] splits = link.split("/"); String title = splits[splits.length -1]; if(!title.startsWith(this.filter)) { continue; } missingTitles.add(title); } return missingTitles.iterator(); } }