/******************************************************
* Web crawler
*
*
* Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
*
******************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*
*******************************************************
*/
package com.soulgalore.crawler.run;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.soulgalore.crawler.core.Crawler;
import com.soulgalore.crawler.core.CrawlerResult;
import com.soulgalore.crawler.core.HTMLPageResponse;
import com.soulgalore.crawler.guice.CrawlModule;
/**
* Crawl and print the URLs of pages that contain a specific keyword in the HTML body.
*
*/
public class CrawlToPlainTxtOnlyMatching extends AbstractCrawl {

  /** Value of the {@code keyword} command line option; matched against each page body. */
  private final String keyword;

  /**
   * Create the crawl runner and read the keyword from the parsed command line.
   *
   * @param args the command line arguments, handed to the superclass constructor
   * @throws ParseException declared by this constructor; presumably raised when the arguments
   *         cannot be parsed (the parsing itself is not visible in this class)
   */
  CrawlToPlainTxtOnlyMatching(String[] args) throws ParseException {
    super(args);
    keyword = getLine().getOptionValue("keyword");
  }

  /**
   * Run the crawl and print every matching URL to standard out.
   *
   * <p>A {@link ParseException} or {@link IllegalArgumentException} is reported as a single line
   * on standard error instead of being propagated.
   *
   * @param args the command line arguments
   */
  public static void main(String[] args) {
    try {
      final CrawlToPlainTxtOnlyMatching crawl = new CrawlToPlainTxtOnlyMatching(args);
      crawl.crawl();
    } catch (ParseException e) {
      // println (not print) so the message ends with a newline, like the branch below.
      System.err.println(e.getMessage());
    } catch (IllegalArgumentException e) {
      System.err.println(e.getMessage());
    }
  }

  /**
   * Fetch the URLs for the current configuration and print the URL of every verified response
   * whose body, rendered as a string, contains the keyword.
   */
  private void crawl() {
    final Injector injector = Guice.createInjector(new CrawlModule());
    final Crawler crawler = injector.getInstance(Crawler.class);
    try {
      final CrawlerResult result = crawler.getUrls(getConfiguration());
      for (HTMLPageResponse response : result.getVerifiedURLResponses()) {
        if (response.getBody().toString().contains(keyword)) {
          System.out.println(response.getUrl());
        }
      }
    } finally {
      // Always shut the crawler down, even when fetching or matching throws.
      crawler.shutdown();
    }
  }

  /**
   * Get the options: the ones from the superclass plus the required {@code -k/--keyword} option
   * that takes exactly one argument.
   *
   * @return the options from the superclass with the keyword option added
   */
  @Override
  protected Options getOptions() {
    final Options options = super.getOptions();

    final Option keywordOption =
        new Option("k", "the keyword to search for in the page [required]");
    keywordOption.setArgName("KEYWORD");
    keywordOption.setLongOpt("keyword");
    keywordOption.setRequired(true);
    keywordOption.setArgs(1);

    options.addOption(keywordOption);
    return options;
  }
}