/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.examples.localdata;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Example crawler that keeps its own {@link CrawlStat} counters (pages processed, outgoing
 * links found, and size of the extracted text in UTF-8 bytes) and prints them to standard
 * output every {@value #DUMP_INTERVAL} processed pages and once more before exiting.
 */
public class LocalDataCollectorCrawler extends WebCrawler {

    /** Only URLs starting with this prefix (after lower-casing) are visited. */
    private static final String ALLOWED_URL_PREFIX = "http://www.ics.uci.edu/";

    /** Statistics are dumped whenever the processed-page count is a multiple of this value. */
    private static final int DUMP_INTERVAL = 50;

    /**
     * Matches lower-cased URLs that end in one of the listed style, script, image, audio,
     * video, document or archive extensions; such URLs are never visited.
     * Compiled once for the class rather than once per crawler instance.
     */
    static final Pattern filters = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** Counters collected by this crawler instance; returned by {@link #getMyLocalData()}. */
    final CrawlStat myCrawlStat;

    /** Creates a crawler with a fresh, empty set of statistics. */
    public LocalDataCollectorCrawler() {
        myCrawlStat = new CrawlStat();
    }

    /**
     * Decides whether the given URL should be crawled.
     *
     * @param url the candidate URL
     * @return {@code true} if the lower-cased URL does not end in a filtered extension and
     *         starts with {@code http://www.ics.uci.edu/}
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale; with a
        // Turkish default locale 'I' would become a dotless 'ı' and upper-case URLs would no
        // longer match the prefix or the extension filter.
        String href = url.getURL().toLowerCase(Locale.ROOT);
        return !filters.matcher(href).matches() && href.startsWith(ALLOWED_URL_PREFIX);
    }

    /**
     * Records statistics for a fetched page: always counts the page, and for HTML pages also
     * adds the number of outgoing links and the UTF-8 byte length of the extracted text.
     *
     * @param page the page that was fetched
     */
    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
        myCrawlStat.incProcessedPages();

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData parseData = (HtmlParseData) page.getParseData();
            List<WebURL> links = parseData.getOutgoingUrls();
            myCrawlStat.incTotalLinks(links.size());
            // Using the Charset constant avoids the checked UnsupportedEncodingException that
            // the charset-name overload declares, so no failure can be silently dropped here.
            myCrawlStat.incTotalTextSize(parseData.getText().getBytes(StandardCharsets.UTF_8).length);
        }

        // We dump this crawler's statistics after processing every DUMP_INTERVAL pages.
        if (myCrawlStat.getTotalProcessedPages() % DUMP_INTERVAL == 0) {
            dumpMyData();
        }
    }

    /**
     * This function is called by controller to get the local data of this
     * crawler when job is finished.
     *
     * @return this crawler's {@link CrawlStat} instance
     */
    @Override
    public Object getMyLocalData() {
        return myCrawlStat;
    }

    /**
     * This function is called by controller before finishing the job.
     * Here it prints the final statistics; you can put whatever stuff you need here.
     */
    @Override
    public void onBeforeExit() {
        dumpMyData();
    }

    /**
     * Prints the current counters, prefixed with this crawler's id, to standard output.
     */
    public void dumpMyData() {
        int id = getMyId();
        // This is just an example, so the numbers are printed on screen. You would
        // probably want to write them to a text file instead.
        System.out.println("Crawler " + id + "> Processed Pages: " + myCrawlStat.getTotalProcessedPages());
        System.out.println("Crawler " + id + "> Total Links Found: " + myCrawlStat.getTotalLinks());
        System.out.println("Crawler " + id + "> Total Text Size: " + myCrawlStat.getTotalTextSize());
    }
}