package edu.uncc.cs.watsonsim.researchers;

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import com.google.gson.reflect.TypeToken;

import crawlercommons.fetcher.BaseFetchException;
import crawlercommons.fetcher.http.SimpleHttpFetcher;
import crawlercommons.fetcher.http.UserAgent;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import edu.uncc.cs.watsonsim.Answer;
import edu.uncc.cs.watsonsim.Environment;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;

/**
 * Fill in the full text of an answer from its URL, if it has one.
 *
 * <p>Every passage whose reference looks like a link to an HTML page is
 * replaced by a "live-url" passage holding the article text extracted from
 * that page. Page text is obtained through
 * {@code Environment.computeIfAbsent}, keyed by {@code "url:" + reference}.
 */
public class URLExpander extends Researcher {
    /** Prefix that turns a passage reference into an environment lookup key. */
    private static final String CACHE_PREFIX = "url:";

    private final SimpleHttpFetcher fetcher;
    private final Environment env;

    /**
     * Create an expander that looks pages up through {@code env}.
     *
     * @param env environment whose {@code computeIfAbsent} is used to obtain
     *            page text for a given key
     */
    public URLExpander(Environment env) {
        this.env = env;
        // NOTE(review): the leading 3 is presumably the fetcher's maximum
        // thread count -- confirm against the crawler-commons API.
        fetcher = new SimpleHttpFetcher(3, new UserAgent(
                "Watsonsim QA engine (bot)",
                "stgallag@gmail.com",
                "http://github.com/SeanTater/uncc2014watsonsim",
                "Mozilla/5.0",
                "10 May 2015"));
        // NOTE(review): explicit timeouts are disabled; the reason is not
        // recorded in this file.
        //fetcher.setConnectionTimeout(2000);
        //fetcher.setSocketTimeout(2000);
        fetcher.setMaxRetryCount(1);
    }

    /**
     * Get a page from the Internet and clean it.
     *
     * @param key lookup key of the form {@code "url:<address>"}; a key
     *            without the prefix is treated as the address itself
     * @return the article text extracted from the page, or the empty string
     *         if the page could not be fetched or processed
     */
    private String fetch(String key) {
        String url = key.startsWith(CACHE_PREFIX)
                ? key.substring(CACHE_PREFIX.length())
                : key;
        try {
            byte[] payload = fetcher.fetch(url).getContent();
            // Decode explicitly: the one-argument InputStreamReader uses the
            // platform default charset, which makes the extracted text
            // depend on the machine and JDK version it runs on.
            // The reader wraps an in-memory array, so there is nothing to
            // release and it is deliberately left unclosed.
            InputStreamReader isr = new InputStreamReader(
                    new ByteArrayInputStream(payload),
                    StandardCharsets.UTF_8);
            return ArticleExtractor.INSTANCE.getText(isr);
        } catch (BaseFetchException e) {
            // Best effort: a page that can't be downloaded leaves the
            // passage untouched.
            System.err.println("Can't connect to " + url + ": " + e);
            return "";
        } catch (BoilerpipeProcessingException e) {
            // The download worked but the article text couldn't be extracted.
            System.err.println("Can't extract text from " + url + ": " + e);
            return "";
        }
    }

    /**
     * Replace each web-page passage of {@code a} with the page's full text.
     *
     * @param q the question being answered (not used here)
     * @param a the answer whose passages are expanded in place
     * @return the same {@code a}, after its passages have been updated
     */
    public Answer answer(Phrase q, Answer a) {
        a.passages.replaceAll(p -> expand(a, p));
        return a;
    }

    /**
     * Expand one passage if its reference points at an HTML page.
     *
     * @param a the answer that owns the passage; receives a log entry when
     *          the passage is replaced
     * @param p the passage to examine
     * @return a new "live-url" passage carrying the page text, or {@code p}
     *         unchanged when it has no suitable reference or no text could
     *         be obtained
     */
    private Passage expand(Answer a, Passage p) {
        boolean looksLikeWebPage = p.reference != null
                && p.reference.startsWith("http")
                && p.reference.contains(".htm");
        if (!looksLikeWebPage) {
            return p;
        }

        // Download (or reuse whatever the environment already holds for
        // this key).
        String payload = env.computeIfAbsent(
                CACHE_PREFIX + p.reference,
                this::fetch,
                new TypeToken<String>(){}.getType());
        if (payload.isEmpty()) {
            // Nothing usable came back; keep the original passage.
            return p;
        }

        // Parse
        Passage filled = new Passage(
                "live-url",
                p.title,
                payload,
                p.reference);
        a.log(this, "Filled in passage from %s", filled.reference);
        return filled;
    }
}