package focusedCrawler.seedfinder;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.binary.Base64;
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import focusedCrawler.target.model.Page;
import focusedCrawler.util.parser.BackLinkNeighborhood;

/**
 * Search engine client that queries the Bing Search API exposed through the
 * Azure Data Market and returns the result URLs as back-link neighborhoods.
 */
public class BingSearchAzureAPI implements SearchEngineApi {

    private static final String BING_ADDRESS = "https://api.datamarket.azure.com/Data.ashx/Bing/SearchWeb/v1/Web";

    private String accountKey = "d9zIG4ICwyPiUzBz0pDB9fvGr/UKDqk82fYBlJlXmhc";
    private String accountKeyEnc = buildKey(accountKey);
    private int docsPerPage = 10;

    public BingSearchAzureAPI() {
    }

    public BingSearchAzureAPI(String accountKey) {
        this.accountKey = accountKey;
        this.accountKeyEnc = buildKey(accountKey);
    }

    /**
     * Builds the HTTP Basic credentials expected by the Azure Data Market:
     * the account key is used as both user name and password, then Base64-encoded.
     */
    private String buildKey(String accountKey) {
        byte[] accountKeyBytes = Base64.encodeBase64((accountKey + ":" + accountKey).getBytes());
        return new String(accountKeyBytes);
    }

    @Override
    public List<BackLinkNeighborhood> submitQuery(String query, int page) throws IOException {
        List<String> urls = downloadResults(query, page);
        List<BackLinkNeighborhood> links = new ArrayList<>();
        for (String link : urls) {
            links.add(new BackLinkNeighborhood(link, null));
        }
        return links;
    }

    /**
     * Downloads one page of search results for the given keyword and extracts
     * the result URLs from the XML response.
     */
    public List<String> downloadResults(String keyword, int page) throws IOException {
        keyword = URLEncoder.encode(keyword, "UTF-8");
        URL url;
        try {
            int skip = page * docsPerPage;
            url = new URL(BING_ADDRESS + "?Query=%27" + keyword + "%27"
                    + "&$skip=" + skip + "&$top=" + docsPerPage);
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Invalid URL", e);
        }
        System.out.println("URL: " + url);

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        conn.setRequestProperty("Authorization", "Basic " + accountKeyEnc);
        conn.setConnectTimeout(60000);
        conn.setReadTimeout(60000);

        StringBuilder output = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                output.append(line);
            }
        } finally {
            conn.disconnect();
        }

        List<String> links = parseXMLPage(new Page(url, output.toString()));
        System.out.println(getClass().getSimpleName() + " hits: " + links.size());
        return links;
    }

    /**
     * Parses the XML response and collects the text content of every
     * {@code <d:Url>} element, which holds the URL of a search result.
     */
    private List<String> parseXMLPage(Page page) {
        DOMParser parser = new DOMParser();
        try {
            parser.parse(new InputSource(new ByteArrayInputStream(page.getContent())));
        } catch (SAXException | IOException e) {
            throw new RuntimeException("Failed to parse search results.", e);
        }
        Document doc = parser.getDocument();
        NodeList list = doc.getElementsByTagName("d:Url");
        List<String> urls = new ArrayList<>();
        for (int j = 0; j < list.getLength(); j++) {
            Node node = list.item(j);
            NodeList children = node.getChildNodes();
            Node child = children.item(0);
            urls.add(child.getTextContent());
        }
        return urls;
    }

    public static void main(String[] args) throws IOException {
        new BingSearchAzureAPI().submitQuery("onion", 0);
    }

}