package edu.stanford.nlp.semparse.open.util;
import java.io.IOException;
import java.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import fig.basic.Utils;
/**
 * Handy utilities for interacting with the web.
 *
 * <p>All remote access is delegated to helper scripts under {@code ./scripts/}, which are invoked
 * through {@code Utils.systemGetStringOutput}. The text those scripts print is then parsed as
 * HTML (with Jsoup) or as JSON (with Jackson).
 */
public class WebUtils {
  /** Shared JSON parser used to decode the output of the search scripts. */
  private static final ObjectMapper jsonMapper = new ObjectMapper();

  /**
   * Wraps a string in single quotes so it can be appended to a command line as one argument.
   * Each embedded single quote is rewritten as {@code '"'"'} (close quote, a double-quoted
   * single quote, reopen quote).
   *
   * NOTE(review): this presumes the command string is interpreted by a POSIX-style shell, which
   * the quoting used throughout this class implies -- confirm against Utils.systemGetStringOutput.
   */
  private static String shellQuote(String arg) {
    // String.replace is a literal replacement; no regex is needed here.
    return "'" + arg.replace("'", "'\"'\"'") + "'";
  }

  /**
   * Runs a command and returns what {@code Utils.systemGetStringOutput} returns for it.
   *
   * @throws RuntimeException wrapping the {@link IOException} or {@link InterruptedException}
   *     raised while running the command
   */
  private static String runScript(String command) {
    try {
      return Utils.systemGetStringOutput(command);
    } catch (IOException e) {
      throw new RuntimeException(e);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }
  }

  /**
   * Parses script output as a JSON tree.
   *
   * @throws RuntimeException wrapping the {@link IOException} raised if parsing fails
   */
  private static JsonNode parseJson(String contents) {
    try {
      return jsonMapper.readTree(contents.getBytes("UTF-8"));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Runs {@code ./scripts/get-webpage.py} with the given flags and parses its output as HTML.
   *
   * @param flags text appended verbatim to the command line; callers are responsible for quoting
   * @return the parsed document
   * @throws RuntimeException if running the script fails or is interrupted
   */
  private static Document executeGetWebpageScript(String flags) {
    return Jsoup.parse(runScript("./scripts/get-webpage.py " + flags));
  }

  /**
   * Return the contents of a webpage.
   *
   * @param url address of the page; it is single-quoted before being passed to the script
   * @return the page parsed as a Jsoup {@link Document}
   * @throws RuntimeException if running the script fails or is interrupted
   */
  public static Document getWebpage(String url) {
    return executeGetWebpageScript(" " + shellQuote(url) + " ");
  }

  /**
   * Return the contents of a webpage identified by a hashcode, passed to the script as
   * {@code -H hashcode}.
   *
   * @param cacheDirectory passed to the script as {@code -d cacheDirectory}; the flag is omitted
   *     when this is {@code null} or empty
   * @param hashcode identifier passed to the script with {@code -H}
   * @return the page parsed as a Jsoup {@link Document}
   * @throws RuntimeException if running the script fails or is interrupted
   */
  public static Document getWebpageFromHashcode(String cacheDirectory, String hashcode) {
    // NOTE(review): hashcode and cacheDirectory are interpolated unquoted, so values containing
    // spaces or shell metacharacters will not survive as a single argument. Left as-is because
    // callers may rely on shell expansion (e.g. '~') in cacheDirectory -- confirm before quoting.
    String flags = " -H " + hashcode;
    if (cacheDirectory != null && !cacheDirectory.isEmpty()) {
      flags += " -d " + cacheDirectory;
    }
    return executeGetWebpageScript(flags);
  }

  /**
   * Return the search results for a given query.
   *
   * <p>If the query is itself an {@code http://} or {@code https://} URL, no search is run and a
   * single result pointing at that URL (with a {@code null} third constructor argument) is
   * returned. Otherwise {@code ./scripts/google-search.py} is run and its output is read as a
   * JSON array whose items are indexed positionally: element 0 and element 1 become the second
   * and third {@code SearchResult} constructor arguments (presumably link and title, by analogy
   * with {@link #fakeGoogleSearch(String)}).
   *
   * @param query the search query, or a URL
   * @return one {@code SearchResult} per item printed by the script; each carries the original,
   *     unescaped query
   * @throws RuntimeException if running the script or parsing its output fails
   */
  public static List<SearchResult> googleSearch(String query) {
    // Query is just a single webpage
    if (query.startsWith("http://") || query.startsWith("https://")) {
      return Collections.singletonList(new SearchResult(query, query, null));
    }
    JsonNode root = parseJson(runScript("./scripts/google-search.py " + shellQuote(query)));
    List<SearchResult> pages = new ArrayList<>();
    for (JsonNode item : root) {
      // Use the caller's query here, not the shell-escaped form used on the command line.
      pages.add(new SearchResult(query, item.get(0).asText(), item.get(1).asText()));
    }
    return pages;
  }

  /**
   * Equivalent to doing Google Search but actually reading from file.
   *
   * <p>Runs {@code ./scripts/fake-google-search.py} and reads its output as a JSON array of
   * objects, taking the {@code "link"} and {@code "title"} fields of each.
   *
   * @param query the search query
   * @return one {@code SearchResult} per item printed by the script; each carries the original,
   *     unescaped query
   * @throws RuntimeException if running the script or parsing its output fails
   */
  public static List<SearchResult> fakeGoogleSearch(String query) {
    JsonNode root = parseJson(runScript("./scripts/fake-google-search.py " + shellQuote(query)));
    List<SearchResult> pages = new ArrayList<>();
    for (JsonNode item : root) {
      // Use the caller's query here, not the shell-escaped form used on the command line.
      pages.add(new SearchResult(query, item.get("link").asText(), item.get("title").asText()));
    }
    return pages;
  }
}