/******************************************************
* Web crawler
*
*
* Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
*
******************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*
*******************************************************
*/
package com.soulgalore.crawler.run;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.http.HttpStatus;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.soulgalore.crawler.core.Crawler;
import com.soulgalore.crawler.core.CrawlerResult;
import com.soulgalore.crawler.core.HTMLPageResponse;
import com.soulgalore.crawler.core.CrawlerURL;
import com.soulgalore.crawler.guice.CrawlModule;
import com.soulgalore.crawler.util.StatusCode;
/**
* Crawl to file. Two files will be created, one with the working urls & one with the non-working
* urls. Each url is written on its own line.
*
* @author peter
*
*/
public class CrawlToFile extends AbstractCrawl {

  public static final String DEFAULT_FILENAME = "urls.txt";
  public static final String DEFAULT_ERROR_FILENAME = "errorurls.txt";

  /** Name of the file that receives the working urls. */
  private final String fileName;

  /** Name of the file that receives the non working urls. */
  private final String errorFileName;

  /** When true, a message is printed to standard out before each file is stored. */
  private final boolean verbose;

  /**
   * Create the crawl and read the file specific options from the parsed command line.
   *
   * @param args the command line arguments
   * @throws ParseException propagated from the superclass constructor
   */
  CrawlToFile(String[] args) throws ParseException {
    super(args);
    fileName = getLine().getOptionValue("filename", DEFAULT_FILENAME);
    errorFileName = getLine().getOptionValue("errorfilename", DEFAULT_ERROR_FILENAME);
    verbose = Boolean.parseBoolean(getLine().getOptionValue("verbose", "false"));
  }

  /**
   * Run.
   *
   * @param args the args
   */
  public static void main(String[] args) {
    try {
      final CrawlToFile crawl = new CrawlToFile(args);
      crawl.crawl();
    } catch (ParseException e) {
      // println (not print) so the message is newline terminated like the branch below
      System.err.println(e.getMessage());
    } catch (IllegalArgumentException e) {
      System.err.println(e.getMessage());
    }
  }

  /**
   * Crawl using the configuration and store the result. The working urls are always written to
   * the output file, one url per line. The error file is only written when at least one non
   * working url was found; each of its lines has the friendly status name, a comma and the url,
   * followed by " from " and the referer for response codes at or above
   * {@link HttpStatus#SC_NOT_FOUND}.
   */
  private void crawl() {
    final Injector injector = Guice.createInjector(new CrawlModule());
    final Crawler crawler = injector.getInstance(Crawler.class);

    try {
      final CrawlerResult result = crawler.getUrls(getConfiguration());
      final String separator = System.getProperty("line.separator");

      final StringBuilder workingUrls = new StringBuilder();
      for (CrawlerURL workingUrl : result.getUrls()) {
        workingUrls.append(workingUrl.getUrl()).append(separator);
      }

      if (verbose) {
        System.out.println("Start storing file working urls " + fileName);
      }
      writeFile(fileName, workingUrls.toString());

      if (result.getNonWorkingUrls().size() > 0) {
        final StringBuilder nonWorkingUrls = new StringBuilder();
        for (HTMLPageResponse nonWorkingUrl : result.getNonWorkingUrls()) {
          nonWorkingUrls.append(StatusCode.toFriendlyName(nonWorkingUrl.getResponseCode()))
              .append(",").append(nonWorkingUrl.getUrl());
          if (nonWorkingUrl.getResponseCode() >= HttpStatus.SC_NOT_FOUND) {
            nonWorkingUrls.append(" from ").append(nonWorkingUrl.getPageUrl().getReferer());
          }
          nonWorkingUrls.append(separator);
        }

        if (verbose) {
          System.out.println("Start storing file non working urls " + errorFileName);
        }
        writeFile(errorFileName, nonWorkingUrls.toString());
      }
    } finally {
      // Always shut down the crawler, also when the crawl or the result handling fails.
      crawler.shutdown();
    }
  }

  /**
   * Get the options.
   *
   * @return the options of the superclass extended with the CrawlToFile specific options
   *         (filename, errorfilename and verbose)
   */
  @Override
  protected Options getOptions() {
    final Options options = super.getOptions();

    final Option filenameOption =
        new Option("f", "the name of the output file, default name is " + DEFAULT_FILENAME
            + " [optional]");
    filenameOption.setArgName("FILENAME");
    filenameOption.setLongOpt("filename");
    filenameOption.setRequired(false);
    filenameOption.setArgs(1);
    options.addOption(filenameOption);

    final Option errorFilenameOption =
        new Option("ef", "the name of the error output file, default name is "
            + DEFAULT_ERROR_FILENAME + " [optional]");
    errorFilenameOption.setArgName("ERRORFILENAME");
    errorFilenameOption.setLongOpt("errorfilename");
    errorFilenameOption.setRequired(false);
    errorFilenameOption.setArgs(1);
    options.addOption(errorFilenameOption);

    // verbose takes one argument (for example "true"); it is not a bare flag
    final Option verboseOption = new Option("ve", "verbose logging, default is false [optional]");
    verboseOption.setArgName("VERBOSE");
    verboseOption.setLongOpt("verbose");
    verboseOption.setRequired(false);
    verboseOption.setArgs(1);
    verboseOption.setType(Boolean.class);
    options.addOption(verboseOption);

    return options;
  }

  /**
   * Write the output to the named file, encoded as UTF-8. I/O failures are reported on standard
   * error and are not propagated to the caller.
   *
   * @param fileName the name of the file to write
   * @param output the content of the file
   */
  private void writeFile(String fileName, String output) {
    Writer out = null;
    try {
      out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"));
      out.write(output);
    } catch (IOException e) {
      // Also covers UnsupportedEncodingException and FileNotFoundException (both IOExceptions).
      System.err.println(e);
    } finally {
      if (out != null) {
        try {
          // close() flushes the buffered content, so a failure here is reported as well
          out.close();
        } catch (IOException e) {
          System.err.println(e);
        }
      }
    }
  }
}