import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import com.knowledgebooks.nlp.AutoTagger;
import com.knowledgebooks.nlp.util.NameValue;
import com.knowledgebooks.nlp.ExtractNames;
import com.knowledgebooks.nlp.util.ScoredList;
import com.knowledgebooks.info_spiders.WebSpider;
import org.apache.commons.io.FileUtils;
/**
* Copyright Mark Watson 2008-2010. All Rights Reserved.
* License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
*/
public class KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages {
public KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages(String config_file_path, PrintWriter out) throws IOException {
this.out = out;
extractNames = new ExtractNames();
autoTagger = new AutoTagger();
List<String> lines = (List<String>) FileUtils.readLines(new File(config_file_path));
for (String line : lines) {
Scanner scanner = new Scanner(line);
scanner.useDelimiter(" ");
try {
String starting_url = scanner.next();
int spider_depth = Integer.parseInt(scanner.next());
spider(starting_url, spider_depth);
} catch (Exception ex) {
ex.printStackTrace();
}
}
this.out.close();
}
private void spider(String starting_url, int spider_depth) throws Exception {
System.out.println("** spider(" + starting_url + ", " + spider_depth + ")");
WebSpider ws = new WebSpider(starting_url, spider_depth);
for (List<String> ls : ws.url_content_lists) {
String url = ls.get(0);
String text = ls.get(1);
HashSet<String> hs = new HashSet<String>();
System.out.println("\n\n\n----URL:\n" + url + "\n content:\n" + text);
ScoredList[] names = extractNames.getProperNames(text);
ScoredList people = names[0];
ScoredList places = names[1];
List<NameValue<String, Float>> tags = autoTagger.getTags(text);
out.println("<" + url + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://knowledgebooks.com/rdf/webpage> .");
out.println("<" + url + "> <http://knowledgebooks.com/rdf/contents> \"" + text.trim().replaceAll("\"", "'") + "\" .");
for (String person : people.getStrings()) {
out.println("<" + url + "> <http://knowledgebooks.com/rdf/containsPerson> \"" + person.replaceAll("\"", "'") + "\" .");
}
for (String place : places.getStrings()) {
out.println("<" + url + "> <http://knowledgebooks.com/rdf/containsPlace> \"" + place.replaceAll("\"", "'") + "\" .");
}
for (NameValue nv : tags) {
out.println("<" + url + "> <http://knowledgebooks.com/rdf/" + nv.getName() + "> \"" + ("" + nv.getValue()) + "\" .");
hs.add("" + nv.getName());
}
inter_webpage_shared_tags.put(url, hs);
}
process_interpage_shared_properties();
}
private void process_interpage_shared_properties() throws Exception {
Set<String> unique_urls = inter_webpage_shared_tags.keySet();
for (String url_1 : unique_urls) {
for (String url_2 : unique_urls) {
if (url_1.equals(url_2) == false) {
System.out.println("\n\n^^^^^^^^^ " + url_1 + " : " + url_2 + "\n");
float url_similarity = score_mapset(inter_webpage_shared_tags.get(url_1), inter_webpage_shared_tags.get(url_2));
if (url_similarity > 12f) {
out.println("<" + url_1 + "> <http://knowledgebooks.com/rdf/high_similarity> <" + url_2 + "> .");
} else if (url_similarity > 5f) {
out.println("<" + url_1 + "> <http://knowledgebooks.com/rdf/medium_similarity> <" + url_2 + "> .");
} else if (url_similarity > 5f) {
out.println("<" + url_1 + "> <http://knowledgebooks.com/rdf/low_similarity> <" + url_2 + "> .");
}
}
}
}
}
private float score_mapset(Set<String> set_1, Set<String> set_2) {
set_1.retainAll(set_2); // replace contents of set_1 with intersection of set_1 and set_2
return set_1.size();
}
private PrintWriter out = null;
private Map<String, Set<String>> inter_webpage_shared_tags = new HashMap<String, Set<String>>();
;
private ExtractNames extractNames = null;
private AutoTagger autoTagger = null;
public static void main(String[] args) throws Exception {
new KnowledgeBooksNlpGenerateRdfPropertiesFromWebPages("testdata/websites.txt", new PrintWriter("tempdata/gen_rdf.nt"));
}
}