package de.juwimm.cms.beans; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.springframework.beans.factory.annotation.Autowired; import de.juwimm.cms.model.SiteHbm; import de.juwimm.cms.search.lucene.LuceneService; import de.juwimm.cms.search.res.HtmlDocumentLocator; import de.juwimm.cms.search.res.PDFDocumentLocator; import de.juwimm.cms.search.res.RTFDocumentLocator; import de.juwimm.cms.search.res.WordDocumentLocator; import de.juwimm.cms.util.AbstractCrawlUrlStrategy.FilterCrawlUrlStrategy; import de.juwimm.cms.util.AbstractCrawlUrlStrategy.ProtocolCrawlUrlStrategy; import de.juwimm.cms.util.SmallSiteConfigReader; /** * @author <a href="florin.zalum@juwimm.com">Florin Zalum</a> * @version $Id$ */ public class WebCrawlerService { private static Logger log = Logger.getLogger(WebCrawlerService.class); private final HttpClient httpClient = new HttpClient(); private static String HTML_CONTENT_TYPE = "text/html"; @Autowired private HtmlDocumentLocator htmlDocumentLocator; @Autowired private PDFDocumentLocator pdfResourceLocator; @Autowired private RTFDocumentLocator rtfResourceLocator; @Autowired private WordDocumentLocator wordResourceLocator; private Map<Integer, Set<String>> alreadyIndexedPages; /** * Used for broken links and non html pages */ private Set<String> alreadyIndexedNonHtmlPages; private FilterCrawlUrlStrategy filtersStrategy; private ProtocolCrawlUrlStrategy protocolsStrategy; private int baseDepth; @Autowired private LuceneService luceneService; public void indexSite(SiteHbm site) { log.info("Crawl and index on site" + site.getSiteId() + "started"); SmallSiteConfigReader configReader = new SmallSiteConfigReader(site); List<String> urls = configReader.readValues(SmallSiteConfigReader.EXTERNAL_SEARCH_URLS_PATH); if (urls == null || urls.size() == 0) { return; } List<String> positiveProtocols = configReader.readValues(SmallSiteConfigReader.getPositiveListTag("protocols")); List<String> negativeProtocols = configReader.readValues(SmallSiteConfigReader.getNegativeListTag("protocols")); List<String> positiveFilters = configReader.readValues(SmallSiteConfigReader.getPositiveListTag("filters")); List<String> negativeFilters = configReader.readValues(SmallSiteConfigReader.getNegativeListTag("filters")); alreadyIndexedPages = new HashMap<Integer, Set<String>>(); alreadyIndexedNonHtmlPages = new HashSet<String>(); filtersStrategy = new FilterCrawlUrlStrategy(positiveFilters, negativeFilters); protocolsStrategy = new ProtocolCrawlUrlStrategy(positiveProtocols, negativeProtocols); baseDepth = Integer.valueOf(configReader.readValue(SmallSiteConfigReader.EXTERNAL_SEARCH_DEPTH_PATH)); for (String url : urls) { try { indexUrl(url, baseDepth, url); } catch (Exception e) { if (log.isDebugEnabled()) { log.debug("failed indexing url:" + url + " - " + e.getMessage()); } continue; } } log.info("Crawl and index on site" + site.getSiteId() + " ended"); } @SuppressWarnings("deprecation") private void indexUrl(String url, int depth, String baseUrl) { //pages per depth must be indexed only once if (!alreadyIndexedPages.containsKey(depth)) { alreadyIndexedPages.put(depth, new HashSet<String>()); } else if (alreadyIndexedPages.get(depth).contains(url) || alreadyIndexedNonHtmlPages.contains(url)) { return; } if (baseDepth != depth) {//no check for first level url //check url validity if (!filtersStrategy.isUrlValid(url)) { //link does not respect the search constraints return; } } //configure http request httpClient.getParams().setConnectionManagerTimeout(5000); HttpMethod httpMethod = null; try { httpMethod = new GetMethod(escapeUrl(url)); } catch (IllegalStateException ex) { if (log.isDebugEnabled()) { log.debug("failed to create http method on url " + url + " with error:" + ex.getMessage()); } if (!url.startsWith(baseUrl)) { indexUrl(baseUrl + (url.startsWith("/") ? "" : "/") + url, depth, baseUrl); } return; } catch (IllegalArgumentException ex) { if (log.isDebugEnabled()) { log.debug("failed to create http method on url " + url + " with error:" + ex.getMessage()); } if (!url.startsWith(baseUrl)) { indexUrl(baseUrl + (url.startsWith("/") ? "" : "/") + url, depth, baseUrl); } return; } if (httpMethod.getHostConfiguration().getHost() == null) { if (!url.startsWith(baseUrl)) { indexUrl(baseUrl + (url.startsWith("/") ? "" : "/") + url, depth, baseUrl); } return; } if (baseDepth != depth) {//no check for first level url //protocol constraints check if (!protocolsStrategy.isUrlValid(url)) { return; } } httpMethod.setFollowRedirects(true); httpMethod.getParams().setBooleanParameter(HttpMethodParams.USE_EXPECT_CONTINUE, true); log.info("indexUrl : url " + url + " depth: " + depth); try { httpClient.executeMethod(httpMethod); //index content indexContent(httpMethod, url, httpMethod.getHostConfiguration().getHostURL(), depth); } catch (IOException e) { if (log.isDebugEnabled()) { log.debug("request failed to url:" + url + " - " + e.getMessage()); } //not to come second time at this link, if the link is broken alreadyIndexedNonHtmlPages.add(url); return; } catch (Exception e) { if (log.isDebugEnabled()) { log.debug("request failed to url:" + url + " - " + e.getMessage()); } //not to come second time at this link, if the link is broken alreadyIndexedNonHtmlPages.add(url); return; } } private String escapeUrl(String url) { return url.replaceAll(" ", "%20"); } private void indexContent(HttpMethod httpMethod, String url, String baseUrl, int depth) { String contentType = httpMethod.getResponseHeader("Content-Type").getValue(); //index content if (contentType.contains(HTML_CONTENT_TYPE)) { indexHtmlResource(httpMethod, url, baseUrl, depth); } else { indexNonHtmlResource(httpMethod, url, contentType); } } private void indexNonHtmlResource(HttpMethod httpMethod, String url, String contentType) { InputStream in; try { in = httpMethod.getResponseBodyAsStream(); Document resource = null; if (contentType.contains(PDFDocumentLocator.MIME_TYPE)) { resource = pdfResourceLocator.getExternalResource( url, in); } else if (contentType.contains(RTFDocumentLocator.MIME_TYPE)) { resource = rtfResourceLocator.getExternalResource( url, in); } else if (contentType.contains(WordDocumentLocator.MIME_TYPE)) { resource = wordResourceLocator.getExternalResource( url, in); } if (resource != null) { luceneService.addToIndex(resource); } } catch (IOException e) { if (log.isDebugEnabled()) { log.debug("request failed to url:" + url + " - " + e.getMessage()); } return; } catch (Exception e) { if (log.isDebugEnabled()) { log.debug("Error index url " + url + " : " + e.getMessage()); } } finally { httpMethod.releaseConnection(); alreadyIndexedNonHtmlPages.add(url); } } /** * Also this method goes to the next links * @param httpMethod * @throws InterruptedException * @throws IOException */ private void indexHtmlResource(HttpMethod httpMethod, String url, String baseUrl, int depth) { StringWriter out = new StringWriter(); InputStream in; try { in = httpMethod.getResponseBodyAsStream(); } catch (IOException e) { if (log.isDebugEnabled()) { log.debug("request failed to url:" + url + " - " + e.getMessage()); } return; } catch (Exception e) { if (log.isDebugEnabled()) { log.debug("request failed to url:" + url + " - " + e.getMessage()); } return; } try { IOUtils.copy(in, out); in.close(); } catch (IOException e) { log.debug("failed to copy content of url :" + url + " - " + e.getMessage()); return; } StringReader parseHtmlReader = new StringReader(out.toString()); StringReader findLinksReader = new StringReader(out.toString()); //index try { Document resource = htmlDocumentLocator.getExternalResource(url, parseHtmlReader); if (resource != null) { luceneService.addToIndex(resource); } } catch (Exception e) { if (log.isDebugEnabled()) { log.debug("Error index url " + url + " : " + e.getMessage()); } } finally { } httpMethod.releaseConnection(); alreadyIndexedPages.get(depth).add(url); parseHtmlReader.close(); if (depth == 0) { findLinksReader.close(); return; } //find new links final List<String> childUrls = extractLinks(findLinksReader, url); if (childUrls == null || childUrls.size() == 0) { return; } findLinksReader.close(); //index deeper for (String childUrl : childUrls) { //childUrl.replace(" ", "%20"); indexUrl(childUrl, depth - 1, baseUrl); } } /** * Input reader should be a html content * @param findLinksReader * @param url * @return */ private final List<String> extractLinks(Reader findLinksReader, String url) { final List<String> childUrls = new ArrayList<String>(); try { new ParserDelegator().parse(findLinksReader, new HTMLEditorKit.ParserCallback() { @Override public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { if (t == HTML.Tag.A && a.getAttribute(HTML.Attribute.HREF) != null) { childUrls.add(a.getAttribute(HTML.Attribute.HREF).toString()); } } @Override public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) { if (t == HTML.Tag.A && a.getAttribute(HTML.Attribute.HREF) != null) { childUrls.add(a.getAttribute(HTML.Attribute.HREF).toString()); } } }, true); } catch (IOException e) { if (log.isDebugEnabled()) { log.debug("error on parsing url: " + url + " trying to get links"); } } return childUrls; } }