import com.knowledgebooks.info_spiders.WebSpider;
import com.knowledgebooks.nlp.ExtractNames;
import com.knowledgebooks.nlp.ExtractSearchTerms;
import com.knowledgebooks.nlp.util.ScoredList;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.*;
/**
* Copyright Mark Watson 2008-2010. All Rights Reserved.
* License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
*/
public class WebScrapingAndFreebaseSearch {
static public void main(String[] args) throws Exception {
PrintWriter out = new PrintWriter(new FileWriter("out.nt"));
WebSpider ws = new WebSpider("http://www.knowledgebooks.com", 2);
//WebSpider ws = new WebSpider("http://markwatson.com", 2);
for (List<String> ls : ws.url_content_lists) {
String url = ls.get(0);
String text = ls.get(1) + " Flagstaff";
// Get search terms for this web page's content:
ExtractSearchTerms extractor = new ExtractSearchTerms(text);
System.out.println("Best search terms " + extractor.getBest());
// Get people and place names in this web page's content:
ScoredList[] ret = new ExtractNames().getProperNames(text);
List<String> people = ret[0].getStrings();
List<String> places = ret[1].getStrings();
System.out.println("Human names: " + ret[0].getValuesAsString());
System.out.println("Place names: " + ret[1].getValuesAsString());
// Use Freebase to get more information about these people and places:
//Freebase freebase = Freebase.getFreebase();
EntityToRdfHelpersFreebase.processPeople(out, url, text, "person", people, extractor.getBest());
EntityToRdfHelpersFreebase.processPlaces(out, url, "place", places);
}
out.close();
}
}