import com.freebase.api.Freebase; import com.freebase.json.JSON; import com.knowledgebooks.info_spiders.DBpediaLookupClient; import com.knowledgebooks.rdf.RdfServiceProxy; import com.knowledgebooks.rdf.SesameEmbeddedProxy; import java.io.PrintWriter; import java.util.*; /** * Copyright Mark Watson 2008-2010. All Rights Reserved. * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt) */ public class EntityToRdfHelpersDbpedia { public static void processEntity(PrintWriter out, String url, String name_type, List<String> name_list, List<String> possible_search_terms, Set<String> processed_DBpedia_queries) throws Exception { System.out.println("\n\n******* EntityToRdfHelpersDbpedia.processEntity "+name_list); for (String name : name_list) { System.out.println(" * name: " + name); boolean keep_processing = true; int num_to_take = Math.min(2, possible_search_terms.size()); loop: while (keep_processing && num_to_take >= 0) { for (int cycle = 0; cycle < 2; cycle++) { String query = name; List<String> search_terms = take(possible_search_terms, num_to_take); for (String key_word : search_terms) { if (query.indexOf(key_word) == -1) query += " " + key_word; } if (processed_DBpedia_queries.contains(query)) { System.out.println("** already processed DBpedia query: " + query); } else { System.out.println(" * query: " + query + ", num_to_take = " + num_to_take); DBpediaLookupClient lookup = new DBpediaLookupClient(query); List<Map<String, String>> results = lookup.variableBindings(); System.out.println("DBpedia search results: " + results); int num_results = results.size(); if (num_results > 0) { for (Map<String,String> bindings : results) { String uri = bindings.get("URI"); String label = bindings.get("Label"); String description = bindings.get("Description"); out.println("<" + uri + "> <http://knowledgebooks.com/rdf/datasource> <http://dbpedia.org> ."); out.println("<" + uri + "> <http://knowledgebooks.com/rdf/about/"+name_type+"> \"" + name + "\" ."); out.println("<" + url + "> <http://knowledgebooks.com/rdf/freebase_uri> <" + uri +"> ."); out.println("<" + uri + "> <http://knowledgebooks.com/rdf/description> \"" + description + "\" ."); out.println("<" + uri + "> <http://knowledgebooks.com/rdf/dbpedia/label> \"" + label + "\" ."); break loop; } } try { Thread.sleep(1200); } catch (Exception ignore) { } } } num_to_take--; } } } private static List<String> take(List<String> names, int num_to_take) { int size = names.size(), index = 0; List<String> ret = new ArrayList<String>(size); for (int i = 0; i < num_to_take; i++) { loop: for (int attempt = 0; attempt < 10; attempt++) { index = (int) (0.99 * Math.random() * size); if (!noise.contains(names.get(index).toLowerCase()) && !ret.contains(names.get(index))) { ret.add(names.get(index)); break loop; } } } return ret; } static Set<String> noise = new HashSet<String>(); static { noise.add("document"); noise.add("formats"); noise.add("company"); noise.add("text"); noise.add("system"); noise.add("product"); noise.add("documents"); noise.add("services"); noise.add("technology"); noise.add("technologies"); noise.add("implementing"); noise.add("implement"); noise.add("language"); noise.add("manage"); noise.add("management"); noise.add("research"); noise.add("library"); noise.add("libraries"); noise.add("language"); noise.add("languages"); noise.add("implement"); noise.add("implements"); noise.add("standard"); noise.add("standards"); noise.add("project"); noise.add("projects"); } static RdfServiceProxy rdfServiceProxy = null; // this is to get Lat/Lon RDF value static { try { rdfServiceProxy = new SesameEmbeddedProxy(); rdfServiceProxy.createRepository("test-repo1"); // must have a repository open } catch (Exception ex) { ex.printStackTrace(); } } }