/**
 * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
 * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 * http://www.ewcms.com
 */
package com.ewcms.plugin.crawler.generate;

import java.io.File;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ewcms.content.resource.ResourceFacable;
import com.ewcms.content.resource.model.Resource;
import com.ewcms.core.site.model.Site;
import com.ewcms.plugin.crawler.generate.crawler.Page;
import com.ewcms.plugin.crawler.generate.crawler.WebCrawler;
import com.ewcms.plugin.crawler.generate.parser.BinaryParseData;
import com.ewcms.plugin.crawler.generate.url.WebURL;
import com.ewcms.plugin.crawler.generate.util.Cryptography;
import com.ewcms.plugin.crawler.generate.util.IO;

/**
 * Crawler that downloads binary resources (images, flash, video and
 * configurable "annex" attachments), stores them in a local folder under an
 * MD5-derived file name and hands them to a {@link ResourceFacable} for upload.
 *
 * <p>Configuration is read in {@link #onStart()} from the controller's custom
 * data (crawl domains) and from the passing parameters {@code resourceFac},
 * {@code site}, {@code isImage}, {@code isFlash}, {@code isVideo},
 * {@code isAnnex}, {@code annexType} and {@code storageFolderName}.
 *
 * @author wu_zhijun
 */
public class EwcmsResourceCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(EwcmsResourceCrawler.class);

    // Patterns are case-insensitive so that shouldVisit() and visit() classify
    // a URL the same way regardless of the letter case of its extension.
    private static final Pattern IMG_PATTERNS =
            Pattern.compile(".*(\\.(bmp|gif|jpe?g|png|tiff?))$", Pattern.CASE_INSENSITIVE);
    private static final Pattern FLASH_PATTERNS =
            Pattern.compile(".*(\\.(swf|flv))$", Pattern.CASE_INSENSITIVE);
    private static final Pattern VIDEO_PATTERNS =
            Pattern.compile(".*(\\.(mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|rm|smil|wmv|wma))$",
                    Pattern.CASE_INSENSITIVE);

    /** Extension alternative used when annex crawling is enabled without an explicit type list. */
    private static final String ANY_EXTENSION = "[^./]+";

    /** Non-annex resources smaller than this many bytes are not stored. */
    private static final int MIN_RESOURCE_SIZE = 10 * 1024;

    private String[] crawlDomains;
    private String storageFolderName;
    private Boolean isImage = false;
    private Boolean isFlash = false;
    private Boolean isVideo = false;
    private Boolean isAnnex = false;
    /** Built in {@link #onStart()} only when annex crawling is enabled; otherwise {@code null}. */
    private Pattern annexPattern;
    private Site site;
    private ResourceFacable resourceFac;
    private File storageFolder;

    /**
     * Reads the crawl configuration, compiles the annex pattern when annex
     * crawling is enabled and makes sure the storage folder exists.
     */
    @Override
    public void onStart() {
        super.onStart();

        crawlDomains = (String[]) myController.getCustomData();
        resourceFac = (ResourceFacable) getPassingParameters().get("resourceFac");
        site = (Site) getPassingParameters().get("site");
        isImage = (Boolean) getPassingParameters().get("isImage");
        isFlash = (Boolean) getPassingParameters().get("isFlash");
        isVideo = (Boolean) getPassingParameters().get("isVideo");
        isAnnex = (Boolean) getPassingParameters().get("isAnnex");
        String annexType = (String) getPassingParameters().get("annexType");

        if (Boolean.TRUE.equals(isAnnex)) {
            // annexType is inserted as a regex alternation (e.g. "doc|pdf|zip").
            // A blank value means "any extension"; the previous "(*)" expression
            // was not a valid regex and failed at compile time.
            boolean blank = annexType == null || annexType.trim().length() == 0;
            String extensions = blank ? ANY_EXTENSION : annexType;
            annexPattern = Pattern.compile(".*(\\.(" + extensions + "))$", Pattern.CASE_INSENSITIVE);
        }

        storageFolderName = (String) getPassingParameters().get("storageFolderName");
        storageFolder = new File(storageFolderName);
        if (!storageFolder.exists() && !storageFolder.mkdirs()) {
            logger.warn("Could not create storage folder: {}", storageFolder.getAbsolutePath());
        }
    }

    /**
     * Decides whether a URL should be fetched.
     *
     * @param url the candidate URL
     * @return {@code true} when the URL matches an enabled resource category
     *         or starts with one of the configured crawl domains
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String href = url.getURL().toLowerCase(Locale.ROOT);

        // Boolean.TRUE.equals tolerates flags that were never supplied (null).
        if (Boolean.TRUE.equals(isImage) && IMG_PATTERNS.matcher(href).matches()) {
            return true;
        }
        if (Boolean.TRUE.equals(isFlash) && FLASH_PATTERNS.matcher(href).matches()) {
            return true;
        }
        if (Boolean.TRUE.equals(isVideo) && VIDEO_PATTERNS.matcher(href).matches()) {
            return true;
        }
        if (Boolean.TRUE.equals(isAnnex) && annexPattern != null
                && annexPattern.matcher(href).matches()) {
            return true;
        }

        if (crawlDomains != null) {
            for (String domain : crawlDomains) {
                if (href.startsWith(domain)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Stores a fetched binary page on disk and uploads it as a resource.
     * Pages whose parse data is not {@link BinaryParseData} are ignored, as are
     * non-annex resources smaller than {@link #MIN_RESOURCE_SIZE} bytes.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();

        if (!(page.getParseData() instanceof BinaryParseData)) {
            return;
        }

        // Apart from annex resources, skip downloads smaller than 10 KB.
        // annexPattern is null when annex crawling is disabled.
        boolean annexResource = annexPattern != null && annexPattern.matcher(url).matches();
        if (!annexResource && page.getContentData().length < MIN_RESOURCE_SIZE) {
            return;
        }

        // Build a unique file name from the URL hash plus the original extension.
        String hashedName = Cryptography.MD5(url) + extensionOf(url);
        String destination = storageFolder.getAbsolutePath() + "/" + hashedName;

        // Save the file.
        IO.writeBytesToFile(page.getContentData(), destination);
        logger.info("Stored: {} ", url);

        File file = new File(destination);
        Resource.Type type = Resource.Type.ANNEX;
        if (IMG_PATTERNS.matcher(url).matches()) {
            type = Resource.Type.IMAGE;
        } else if (FLASH_PATTERNS.matcher(url).matches()) {
            type = Resource.Type.FLASH;
        } else if (VIDEO_PATTERNS.matcher(url).matches()) {
            type = Resource.Type.VIDEO;
        }

        try {
            resourceFac.uploadResource(site, file, url, type);
        } catch (IOException e) {
            logger.error("Failed to upload resource: " + url, e);
        }
    }

    /** Releases the references obtained in {@link #onStart()}. */
    @Override
    public void onBeforeExit() {
        crawlDomains = null;
        storageFolderName = null;
        isImage = null;
        isFlash = null;
        isVideo = null;
        isAnnex = null;
        annexPattern = null;
        site = null;
        resourceFac = null;
    }

    /**
     * Logs every response whose status is not 200 OK; a 404 is reported as a
     * broken link together with the parent document id.
     *
     * @param webUrl the URL that was fetched
     * @param statusCode the HTTP status code received
     * @param statusDescription the textual status description
     */
    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        if (statusCode == HttpStatus.SC_OK) {
            return;
        }
        if (statusCode == HttpStatus.SC_NOT_FOUND) {
            logger.info("Broken link: {} , this link was found in page with docid: {}",
                    webUrl.getURL(), webUrl.getParentDocid());
        } else {
            // The URL is passed as an argument; concatenating it onto the format
            // string shifted every placeholder by one.
            logger.info("Non success status for link: {} , status code: {} , description: {}",
                    webUrl.getURL(), statusCode, statusDescription);
        }
    }

    /**
     * Returns the extension (including the leading dot) of the last path
     * segment of {@code url}, ignoring any query string or fragment, or an
     * empty string when that segment has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        int slash = url.lastIndexOf('/', end - 1);
        int dot = url.lastIndexOf('.', end - 1);
        // A dot before the last slash belongs to the host or a directory name.
        return dot > slash ? url.substring(dot, end) : "";
    }
}