/* * WebPageFetcher.java * * Copyright (C) 2005 Kokanovic Branko * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package org.elite.jdcbot.util; /* * From http://www.warnertechnology.com/Computers/Articles/JavaMOSXS/javacode1.shtml and * http://www.javapractices.com/Topic147.cjp * Slighty changed for our needs */ import java.io.*; import java.net.*; /** * Fetches the HTML content of a web page as a String. * Idea is to use derived class, example can be found in GoogleCalculation class * This class just fetches the page, to parse things you want from web page, use derived class * * @since 0.6 * @author Kokanovic Branko * @version 0.6 */ public abstract class WebPageFetcher { private URL fURL; private static final String fHTTP = "http"; private static final String fNEWLINE = System.getProperty("line.separator"); public WebPageFetcher( URL aURL ){ if ( !aURL.getProtocol().equals(fHTTP) ) { throw new IllegalArgumentException("URL is not for HTTP Protocol: " + aURL); } fURL = aURL; } public WebPageFetcher( String aUrlName ) throws MalformedURLException { this ( new URL(aUrlName) ); } public WebPageFetcher(){} protected void SetURL(URL aURL){ if ( !aURL.getProtocol().equals(fHTTP) ) { throw new IllegalArgumentException("URL is not for HTTP Protocol: " + aURL); } fURL = aURL; } protected void SetURL(String aUrlName ) throws MalformedURLException { SetURL(new URL(aUrlName)); } /** * Fetch the HTML content of the page as simple text. */ protected String getPageContent() throws UnknownHostException, IOException { int result=0, fetched=0; char[] cbuf=new char[65000]; String theInfo; try{ URLConnection conn = fURL.openConnection(); //google won't let us in without this property conn.setRequestProperty("User-Agent", "Mozilla/4.x"); BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream())); // due to the nature of sockets, you probably wont get all the text back with one call // so we continue to call read until we get -1, which means were done, // or until we run out of space in our array of characters while ((fetched!=-1) && (result<65000)) { fetched=in.read(cbuf,result,65000-result); result+=fetched; } in.close(); }catch (UnknownHostException e) { System.err.println("Don't know about host: "+e); }catch (IOException e) { System.err.println("Couldn't get I/O for the connection: "+e); }catch (Exception e) { System.err.println(e); }finally { // convert the array of characters to a String // being sure to convert only the characters that have // data, not the entire 65,000 character array theInfo=new String(cbuf,0,result); } return theInfo; } /** * Fetch the HTML headers as simple text. */ public String getPageHeader(){ StringBuffer result = new StringBuffer(); URLConnection connection = null; try { connection = fURL.openConnection(); }catch (IOException ex) { System.err.println("Cannot open connection to URL: " + fURL); } //not all headers come in key-value pairs - sometimes the key is //null or an empty String int headerIdx = 0; String headerKey = null; String headerValue = null; while ( (headerValue = connection.getHeaderField(headerIdx)) != null ) { headerKey = connection.getHeaderFieldKey(headerIdx); if ( headerKey != null && headerKey.length()>0 ) { result.append( headerKey ); result.append(" : "); } result.append( headerValue ); result.append(fNEWLINE); headerIdx++; } return result.toString(); } }