/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.elasticsearch.util;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.Config;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.elasticsearch.ElasticSearchConnection;
import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
 * Extracts the URLs (and possibly metadata) from a status or doc index into a
 * file.
 * <p>
 * Every hit of a match-all scroll query produces one UTF-8 line in the output
 * file: the URL, followed - when the bolt type is {@code status} - by one
 * tab-separated {@code key=value} pair per metadata value.
 **/
public class URLExtractor {

    private static final Logger LOG = LoggerFactory
            .getLogger(URLExtractor.class);

    // indexer or status
    private final String boltType;

    private final Client client;

    // number of lines written so far
    private int cumulated = 0;

    private final BufferedOutputStream output;

    private final String indexName;

    private final String docType;

    /**
     * Opens the output file and the Elasticsearch client, and resolves the
     * index name and document type from the configuration keys
     * {@code es.<boltType>.index.name} and {@code es.<boltType>.doc.type}
     * (both default to {@code "status"}).
     *
     * @param stormConf
     *            configuration used to build the client and look up the
     *            index settings
     * @param outfile
     *            path of the file the lines are written to
     * @param boltType
     *            {@code indexer} or {@code status}
     * @throws FileNotFoundException
     *             if the output file cannot be opened for writing
     * @throws UnknownHostException
     *             declared for the client creation
     */
    URLExtractor(Map stormConf, String outfile, String boltType)
            throws FileNotFoundException, UnknownHostException {
        this.output = new BufferedOutputStream(new FileOutputStream(new File(
                outfile)));
        this.boltType = boltType;
        this.client = ElasticSearchConnection.getClient(stormConf, boltType);
        this.indexName = ConfUtils.getString(stormConf, "es." + boltType
                + ".index.name", "status");
        this.docType = ConfUtils.getString(stormConf, "es." + boltType
                + ".doc.type", "status");
    }

    /**
     * Command line entry point.
     *
     * @param args
     *            {@code <CONF_FILE> <OUTFILE> [indexer|status]}
     * @throws IOException
     *             if the configuration cannot be loaded or the output cannot
     *             be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            LOG.error("Usage: URLExtractor <CONF_FILE> <OUTFILE> [indexer|status]");
            System.exit(-1);
        }

        String confFile = args[0];
        String outfile = args[1];
        String boltType = args[2];

        // load the conf
        Config conf = new Config();
        ConfUtils.loadConf(confFile, conf);

        URLExtractor gen = new URLExtractor(conf, outfile, boltType);
        try {
            gen.queryES();
        } finally {
            // always flush what has been buffered and release the client,
            // even when the extraction fails half-way through
            try {
                gen.output.close();
            } finally {
                gen.client.close();
            }
        }

        LOG.info("Total : {}", gen.cumulated);
    }

    /**
     * Scrolls over every document of the index / type with a match-all query
     * and writes one line per hit to the output, incrementing
     * {@code cumulated} for each line written. Hits for which no URL can be
     * found are logged and skipped.
     *
     * @throws IOException
     *             if writing to the output fails
     */
    private void queryES() throws IOException {
        int maxBufferSize = 100;

        // NOTE(review): the initial request keeps the scroll alive for 60000
        // whereas the follow-up requests use 600000 - possibly unintentional,
        // left as is.
        SearchResponse scrollResp = client.prepareSearch(this.indexName)
                .setTypes(this.docType).setScroll(new TimeValue(60000))
                .setQuery(QueryBuilders.matchAllQuery()).setSize(maxBufferSize)
                .execute().actionGet();

        long total = scrollResp.getHits().getTotalHits();

        LOG.info("Total hits found {}", total);

        // metadata is only written for the status index
        boolean withMetadata = "status".equalsIgnoreCase(boltType);

        // Scroll until no hits are returned
        do {
            SearchHits hits = scrollResp.getHits();
            LOG.info("Processing {} documents - {} out of {}",
                    hits.getHits().length, cumulated, total);

            for (SearchHit hit : hits) {
                Map<String, Object> sourceMap = hit.getSource();
                String url = extractUrl(hit, sourceMap);

                if (StringUtils.isBlank(url)) {
                    LOG.error("Can't retrieve URL for hit {}", hit);
                    continue;
                }

                StringBuilder line = new StringBuilder(url);

                // the metadata lives in the source: nothing to add without it
                if (withMetadata && sourceMap != null) {
                    appendMetadata(line, sourceMap.get("metadata"));
                }

                line.append("\n");
                IOUtils.write(line.toString(), output, "UTF-8");
                cumulated++;
            }

            scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
                    .setScroll(new TimeValue(600000)).execute().actionGet();
            // Break condition: No hits are returned
        } while (scrollResp.getHits().getHits().length != 0);
    }

    /**
     * Returns the URL of a hit, read from the {@code url} entry of its source
     * or, when the hit has no source, from its {@code url} field.
     *
     * @param hit
     *            the search hit
     * @param sourceMap
     *            the source of the hit, possibly null
     * @return the URL as a string or null if neither location holds a value
     */
    private static String extractUrl(SearchHit hit,
            Map<String, Object> sourceMap) {
        Object value = null;
        if (sourceMap != null) {
            value = sourceMap.get("url");
        } else if (hit.getFields() != null
                && hit.getFields().get("url") != null) {
            value = hit.getFields().get("url").getValue();
        }
        return value == null ? null : value.toString();
    }

    /**
     * Appends the metadata to a line as tab-separated {@code key=value}
     * pairs. String values yield one pair, list values one pair per non-null
     * element; values of any other type (including null) are ignored, as is a
     * metadata object which is not a map.
     *
     * @param line
     *            the line being built
     * @param metadata
     *            content of the {@code metadata} entry of the source,
     *            possibly null
     */
    @SuppressWarnings("unchecked")
    private static void appendMetadata(StringBuilder line, Object metadata) {
        if (!(metadata instanceof Map)) {
            return;
        }
        // keys are expected to be strings since the map comes from the
        // document source
        for (Entry<String, Object> e : ((Map<String, Object>) metadata)
                .entrySet()) {
            Object o = e.getValue();
            if (o instanceof String) {
                line.append("\t").append(e.getKey()).append("=").append(o);
            } else if (o instanceof List) {
                for (Object val : (List<?>) o) {
                    if (val == null) {
                        continue;
                    }
                    line.append("\t").append(e.getKey()).append("=")
                            .append(val.toString());
                }
            }
        }
    }
}