/******************************************************
* Web crawler
*
*
* Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
*
******************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*
*******************************************************
*/
package com.soulgalore.crawler.run;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.commons.cli.ParseException;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.soulgalore.crawler.core.Crawler;
import com.soulgalore.crawler.core.CrawlerResult;
import com.soulgalore.crawler.core.assets.AssetResponse;
import com.soulgalore.crawler.core.assets.AssetsVerificationResult;
import com.soulgalore.crawler.core.assets.AssetsVerifier;
import com.soulgalore.crawler.guice.CrawlModule;
import com.soulgalore.crawler.util.StatusCode;
/**
 * Command line runner that crawls a site, verifies the assets of the crawled pages and writes
 * every non working asset to a CSV file with the columns {@code URL,parent,error}.
 *
 * <p>The name of the result file is taken from the {@code filename} option and falls back to
 * {@link #DEFAULT_FILENAME} when the option is not given.
 */
public class CrawlAndVerifyAssetsToCsv extends AbstractCrawl {

  /**
   * The default file name of the result.
   */
  public static final String DEFAULT_FILENAME = "errorassets.csv";

  /** Header row of the generated CSV file. */
  private static final String CSV_HEADER = "URL,parent,error\n";

  /** The file the CSV result is written to. */
  private final String fileName;

  /**
   * Create the runner from the command line arguments.
   *
   * @param args the command line arguments
   * @throws ParseException if the arguments cannot be parsed
   */
  CrawlAndVerifyAssetsToCsv(String[] args) throws ParseException {
    super(args);
    fileName = getLine().getOptionValue("filename", DEFAULT_FILENAME);
  }

  /**
   * Run the crawl and the asset verification. Argument errors are reported on standard error.
   *
   * @param args the command line arguments
   */
  public static void main(String[] args) {
    try {
      final CrawlAndVerifyAssetsToCsv crawl = new CrawlAndVerifyAssetsToCsv(args);
      crawl.crawl();
    } catch (ParseException e) {
      System.err.println(e.getMessage());
    } catch (IllegalArgumentException e) {
      System.err.println(e.getMessage());
    }
  }

  /**
   * Crawl the configured site, verify the assets and write the non working ones to the result
   * file. The crawler and the verifier are shut down even if a step fails.
   */
  private void crawl() {
    final Injector injector = Guice.createInjector(new CrawlModule());
    final Crawler crawler = injector.getInstance(Crawler.class);
    // Obtained only after the crawl has finished, so it may still be null in the finally block.
    AssetsVerifier verifier = null;

    try {
      final CrawlerResult result = crawler.getUrls(getConfiguration());
      System.out.println("Crawled " + result.getVerifiedURLResponses().size() + " pages");

      System.out.println("Start verify assets ...");
      verifier = injector.getInstance(AssetsVerifier.class);
      final AssetsVerificationResult assetsResult =
          verifier.verify(result.getVerifiedURLResponses(), getConfiguration());
      System.out.println(assetsResult.getWorkingAssets().size() + " assets is ok, "
          + assetsResult.getNonWorkingAssets().size() + " is not");

      writeToFile(toCsv(assetsResult));
    } finally {
      // Nested so that a failing crawler shutdown cannot prevent the verifier shutdown.
      try {
        crawler.shutdown();
      } finally {
        if (verifier != null) {
          verifier.shutdown();
        }
      }
    }
  }

  /**
   * Build the CSV document: the header row followed by one row per non working asset.
   *
   * @param assetsResult the result of the asset verification
   * @return the complete CSV content
   */
  private static String toCsv(AssetsVerificationResult assetsResult) {
    final StringBuilder csv = new StringBuilder(CSV_HEADER);
    for (AssetResponse resp : assetsResult.getNonWorkingAssets()) {
      csv.append(escapeCsv(resp.getUrl())).append(',')
          .append(escapeCsv(resp.getReferer())).append(',')
          .append(escapeCsv(StatusCode.toFriendlyName(resp.getResponseCode()))).append('\n');
    }
    return csv.toString();
  }

  /**
   * Make a value safe to use as a single CSV field. A value containing a comma, a double quote
   * or a line break is wrapped in double quotes with embedded double quotes doubled; any other
   * value is returned as its plain string form.
   *
   * @param value the field value, converted with {@link String#valueOf(Object)}
   * @return the escaped field
   */
  static String escapeCsv(Object value) {
    final String text = String.valueOf(value);
    final boolean needsQuoting = text.indexOf(',') >= 0 || text.indexOf('"') >= 0
        || text.indexOf('\n') >= 0 || text.indexOf('\r') >= 0;
    if (!needsQuoting) {
      return text;
    }
    final StringBuilder quoted = new StringBuilder(text.length() + 2);
    quoted.append('"').append(text.replace("\"", "\"\"")).append('"');
    return quoted.toString();
  }

  /**
   * Write the content to the result file as UTF-8. I/O failures are reported on standard error
   * and do not abort the run.
   *
   * @param content the text to write
   */
  private void writeToFile(String content) {
    Writer out = null;
    try {
      out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"));
      out.write(content);
    } catch (IOException e) {
      System.err.println(e);
    } finally {
      if (out != null) {
        try {
          // Closing flushes the buffer, so a failure here means the file may be incomplete.
          out.close();
        } catch (IOException e) {
          System.err.println(e);
        }
      }
    }
  }
}