/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.uci.ics.crawler4j.examples.localdata; import org.apache.http.HttpStatus; import edu.uci.ics.crawler4j.crawler.CrawlConfig; import edu.uci.ics.crawler4j.crawler.Page; import edu.uci.ics.crawler4j.fetcher.PageFetchResult; import edu.uci.ics.crawler4j.fetcher.PageFetcher; import edu.uci.ics.crawler4j.parser.HtmlParseData; import edu.uci.ics.crawler4j.parser.ParseData; import edu.uci.ics.crawler4j.parser.Parser; import edu.uci.ics.crawler4j.url.WebURL; /** * This class is a demonstration of how crawler4j can be used to download a * single page and extract its title and text. */ public class Downloader { private Parser parser; private PageFetcher pageFetcher; public Downloader() { CrawlConfig config = new CrawlConfig(); parser = new Parser(config); pageFetcher = new PageFetcher(config); } private Page download(String url) { WebURL curURL = new WebURL(); curURL.setURL(url); PageFetchResult fetchResult = null; try { fetchResult = pageFetcher.fetchHeader(curURL); if (fetchResult.getStatusCode() == HttpStatus.SC_OK) { try { Page page = new Page(curURL); fetchResult.fetchContent(page); if (parser.parse(page, curURL.getURL())) { return page; } } catch (Exception e) { e.printStackTrace(); } } } finally { if (fetchResult != null) { fetchResult.discardContentIfNotConsumed(); } } return null; } public void processUrl(String url) { System.out.println("Processing: " + url); Page page = download(url); if (page != null) { ParseData parseData = page.getParseData(); if (parseData != null) { if (parseData instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) parseData; System.out.println("Title: " + htmlParseData.getTitle()); System.out.println("Text length: " + htmlParseData.getText().length()); System.out.println("Html length: " + htmlParseData.getHtml().length()); } } else { System.out.println("Couldn't parse the content of the page."); } } else { System.out.println("Couldn't fetch the content of the page."); } System.out.println("=============="); } public static void main(String[] args) { Downloader downloader = new Downloader(); downloader.processUrl("http://en.wikipedia.org/wiki/Main_Page/"); downloader.processUrl("http://www.yahoo.com/"); } }