WebPageFetcher.java example

Explorer

jdcbot-master
- src
  - org
    - ag
      - sheriffbot
        FilelistDumpSearcher.java
        LegacyFilelistDumpSearcher.java
        SheriffBot.java
    - elite
      - jdcbot
        examples
        BotLogic.java
        BotMain.java
        DemoBot.java
        DownloadBot.java
        DownloadBot2.java
        ExampleBot.java
        QuickAndDirtyBot.java
        framework
        BotConfig.java
        BotEventDispatchThread.java
        BotException.java
        BotInterface.java
        BufferedServerSocket.java
        BufferedSocket.java
        DCIO.java
        DUEntity.java
        DownloadCentral.java
        DownloadHandler.java
        DownloadManager.java
        DownloadQEntry.java
        EventDispatchThread.java
        EventjDCBot.java
        EventjDCBotAdapter.java
        EventjDCBotListener.java
        EventjDCBotListenerAdapter.java
        GlobalObjects.java
        Hub.java
        InputThread.java
        InputThreadTarget.java
        JMethod.java
        JobThread.java
        MultiHubsAdapter.java
        PrimitiveWrapper.java
        TimeoutInputThread.java
        UDPInputThread.java
        UDPInputThreadTarget.java
        UploadHandler.java
        UploadManager.java
        User.java
        UserManager.java
        jDCBot.java
        shareframework
        FLDir.java
        FLFile.java
        FLInterface.java
        FileListManager.java
        FilelistConverter.java
        HashException.java
        HashManager.java
        HashUser.java
        SearchResultSet.java
        SearchSet.java
        ShareException.java
        ShareManager.java
        ShareManagerListener.java
        UploadStreamManager.java
        util
        DownloadEntityStream.java
        FloodMessageThread.java
        GlobalFunctions.java
        GoogleCalculation.java
        InputEntityStream.java
        MySQLWork.java
        NullPrinter.java
        OutputEntityStream.java
        ProgressMeter.java
        RateConstrainer.java
        StaticCommands.java
        TimerThread.java
        WebPageFetcher.java

/*
 * WebPageFetcher.java
 *
 * Copyright (C) 2005 Kokanovic Branko
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

package org.elite.jdcbot.util;

/*
 * From http://www.warnertechnology.com/Computers/Articles/JavaMOSXS/javacode1.shtml and
 * http://www.javapractices.com/Topic147.cjp
 * Slightly changed for our needs
 */
import java.io.*;
import java.net.*;

/**
 * Fetches the HTML content (and response headers) of a web page as a String.
 * <p>
 * The class is meant to be subclassed: this base class only retrieves the raw
 * text, and the derived class parses whatever it needs out of it (see the
 * GoogleCalculation class for an example).
 * <p>
 * Only URLs whose protocol is exactly {@code http} are accepted. Fetching is
 * best-effort: I/O problems are reported on {@code System.err} and whatever was
 * retrieved up to that point is returned.
 *
 * @since 0.6
 * @author  Kokanovic Branko
 * @version    0.6
 */
public abstract class WebPageFetcher {

	private static final String fHTTP = "http";
	private static final String fNEWLINE = System.getProperty("line.separator");
	/** User-Agent sent with every request; google won't let us in without this property. */
	private static final String fUSER_AGENT = "Mozilla/4.x";
	/** Upper bound on the number of characters returned by {@link #getPageContent()}. */
	private static final int fMAX_PAGE_CHARS = 65000;

	/** Page to fetch; stays {@code null} until set when the no-arg constructor is used. */
	private URL fURL;

	/**
	 * Creates a fetcher for the given page.
	 *
	 * @param aURL address of the page; must use the {@code http} protocol
	 * @throws IllegalArgumentException if the protocol of {@code aURL} is not {@code http}
	 * @throws NullPointerException if {@code aURL} is {@code null}
	 */
	public WebPageFetcher( URL aURL ){
		// Validate through a private helper rather than the overridable SetURL,
		// so a subclass override is never invoked on a half-constructed object.
		fURL = requireHttp(aURL);
	}

	/**
	 * Creates a fetcher for the given page.
	 *
	 * @param aUrlName textual address of the page; must use the {@code http} protocol
	 * @throws MalformedURLException if {@code aUrlName} cannot be parsed as a URL
	 * @throws IllegalArgumentException if the protocol is not {@code http}
	 */
	public WebPageFetcher( String aUrlName ) throws MalformedURLException {
		this ( new URL(aUrlName) );
	}

	/**
	 * Creates a fetcher without a target page. One of the {@code SetURL} methods
	 * has to be called before anything can be fetched.
	 */
	public WebPageFetcher(){}

	/**
	 * Sets (or replaces) the page this fetcher reads.
	 *
	 * @param aURL address of the page; must use the {@code http} protocol
	 * @throws IllegalArgumentException if the protocol of {@code aURL} is not {@code http}
	 * @throws NullPointerException if {@code aURL} is {@code null}
	 */
	protected void SetURL(URL aURL){
		fURL = requireHttp(aURL);
	}

	/**
	 * Sets (or replaces) the page this fetcher reads.
	 *
	 * @param aUrlName textual address of the page; must use the {@code http} protocol
	 * @throws MalformedURLException if {@code aUrlName} cannot be parsed as a URL
	 * @throws IllegalArgumentException if the protocol is not {@code http}
	 */
	protected void SetURL(String aUrlName ) throws MalformedURLException {
		SetURL(new URL(aUrlName));
	}

	/**
	 * Fetch the HTML content of the page as simple text.
	 * <p>
	 * At most 65000 characters are returned; anything beyond that is silently
	 * dropped. The bytes are decoded with the platform default charset. Failures
	 * are printed to {@code System.err} and do not propagate: the text read before
	 * the failure (possibly the empty string) is returned instead.
	 *
	 * @return the page text read so far, never {@code null}; empty if no URL has
	 *         been set or nothing could be read
	 * @throws UnknownHostException declared for compatibility with existing callers;
	 *         this implementation reports the problem on {@code System.err} instead
	 * @throws IOException declared for compatibility with existing callers;
	 *         this implementation reports the problem on {@code System.err} instead
	 */
	protected String getPageContent() throws UnknownHostException, IOException {
		if ( fURL == null ) {
			System.err.println("No URL set, nothing to fetch");
			return "";
		}

		char[] cbuf = new char[fMAX_PAGE_CHARS];
		int total = 0;
		BufferedReader in = null;
		try {
			URLConnection conn = openConnection();
			in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
			// due to the nature of sockets, you probably won't get all the text back
			// with one call, so keep reading until end of stream or until the
			// buffer is full
			while ( total < fMAX_PAGE_CHARS ) {
				int fetched = in.read(cbuf, total, fMAX_PAGE_CHARS - total);
				if ( fetched == -1 ) {
					// End of stream: -1 is a sentinel, not a count, so it must
					// not be added to the running total.
					break;
				}
				total += fetched;
			}
		} catch (UnknownHostException e) {
			System.err.println("Don't know about host: "+e);
		} catch (IOException e) {
			System.err.println("Couldn't get I/O for the connection: "+e);
		} catch (Exception e) {
			// Kept deliberately broad: fetching is best-effort and callers rely on
			// getting a (possibly partial) String back rather than an exception.
			System.err.println(e);
		} finally {
			// Release the stream even when reading failed half-way.
			if ( in != null ) {
				try {
					in.close();
				} catch (IOException e) {
					System.err.println("Couldn't get I/O for the connection: "+e);
				}
			}
		}
		// convert only the characters that hold data, not the entire buffer
		return new String(cbuf, 0, total);
	}

	/**
	 * Fetch the HTML headers as simple text.
	 * <p>
	 * Each header is written on its own line as {@code key : value}; headers
	 * without a key (such as the status line) are written as the value alone.
	 *
	 * @return the headers, one per line, or the empty string if no URL has been
	 *         set, the connection cannot be opened, or no headers are available
	 */
	public String getPageHeader(){
		if ( fURL == null ) {
			System.err.println("No URL set, nothing to fetch");
			return "";
		}

		URLConnection connection;
		try {
			connection = openConnection();
		} catch (IOException ex) {
			System.err.println("Cannot open connection to URL: " + fURL);
			// Without a connection there is nothing to enumerate.
			return "";
		}

		StringBuilder result = new StringBuilder();
		//not all headers come in key-value pairs - sometimes the key is
		//null or an empty String
		int headerIdx = 0;
		String headerValue;
		while ( (headerValue = connection.getHeaderField(headerIdx)) != null ) {
			String headerKey = connection.getHeaderFieldKey(headerIdx);
			if ( headerKey != null && headerKey.length()>0 ) {
				result.append( headerKey );
				result.append(" : ");
			}
			result.append( headerValue );
			result.append(fNEWLINE);
			headerIdx++;
		}
		return result.toString();
	}

	/**
	 * Checks that the given URL uses the {@code http} protocol.
	 *
	 * @param aURL URL to check
	 * @return {@code aURL} itself, for convenient assignment
	 * @throws IllegalArgumentException if the protocol is not {@code http}
	 */
	private static URL requireHttp(URL aURL){
		if ( !aURL.getProtocol().equals(fHTTP) ) {
			throw new IllegalArgumentException("URL is not for HTTP Protocol: " + aURL);
		}
		return aURL;
	}

	/**
	 * Opens a connection to the current URL with the User-Agent set, so that
	 * content and header requests present themselves to the server identically.
	 *
	 * @return a connection that has not been connected yet
	 * @throws IOException if the connection cannot be opened
	 */
	private URLConnection openConnection() throws IOException {
		URLConnection conn = fURL.openConnection();
		conn.setRequestProperty("User-Agent", fUSER_AGENT);
		return conn;
	}
}