/* * HTMLLinkParser.java * * Copyright (C) 2005-2006 Tommi Laukkanen * http://www.substanceofcode.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ // Expand to define memory size define //#define DREGULARMEM // Expand to define logging define //#define DNOLOGGING /* This functionality adds to jar size, so don't do it for small memory */ /* devices. */ //#ifndef DSMALLMEM package com.substanceofcode.rssreader.businesslogic; import com.substanceofcode.rssreader.businessentities.RssItunesFeed; import com.substanceofcode.utils.EncodingStreamReader; import com.substanceofcode.utils.StringUtil; import com.substanceofcode.utils.HTMLParser; import com.substanceofcode.utils.XmlParser; import java.io.InputStream; import java.io.InputStreamReader; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Vector; import com.substanceofcode.utils.EncodingUtil; import com.substanceofcode.utils.CauseException; import com.substanceofcode.utils.CauseMemoryException; //#ifdef DLOGGING import net.sf.jlogmicro.util.logging.Logger; import net.sf.jlogmicro.util.logging.LogManager; import net.sf.jlogmicro.util.logging.Level; //#endif /** * HTMLLinkParser class is used when we are parsing RSS feed list * using HTML hyperlinks <a href="link">Name</a>. * For example, the BBC page has such links with URL rss.xml, so one * would use URL http://news.bbc.co.uk/2/hi/help/3223484.stm with * URL search string as rss.xml to weed out the unrelated links. * * @author Irving Bunton */ public class HTMLLinkParser extends FeedListParser { protected boolean m_acceptErrors = true; // Allow some errors //#ifdef DLOGGING private Logger logger = Logger.getLogger("HTMLLinkParser"); private boolean fineLoggable = logger.isLoggable(Level.FINE); private boolean finerLoggable = logger.isLoggable(Level.FINER); private boolean finestLoggable = logger.isLoggable(Level.FINEST); //#endif /** Creates a new instance of HTMLLinkParser */ public HTMLLinkParser(String url, String username, String password) { super(url, username, password); } /** Parse HTML hyper links '<a' and create feeds from them based on link name and url if specified. */ public RssItunesFeed[] parseFeeds(InputStream is) throws IOException, CauseMemoryException, CauseException, Exception { // Init in case we get a severe error. try { return HTMLLinkParser.parseFeeds(new EncodingUtil(is), m_url, m_feedNameFilter, m_feedURLFilter, m_acceptErrors //#ifdef DLOGGING ,logger ,fineLoggable ,finerLoggable ,finestLoggable //#endif ); } catch (CauseException e) { throw e; } catch (Throwable t) { CauseException cex = new CauseException( "Error while parsing HTML Link feed " + m_url, t); //#ifdef DLOGGING logger.severe(cex.getMessage(), cex); //#endif System.err.println(cex.getMessage() + " " + t + " " + t.getMessage()); throw cex; } } /** Parse HTML hyper links '<a' and create feeds from them based on link name and url if specified. */ static public RssItunesFeed[] parseFeeds(EncodingUtil encodingUtil, String url, String feedNameFilter, String feedURLFilter, boolean acceptErrors //#ifdef DLOGGING ,Logger logger, boolean fineLoggable, boolean finerLoggable, boolean finestLoggable //#endif ) throws IOException, CauseMemoryException, CauseException, Exception { /** Initialize item collection */ Vector rssFeeds = new Vector(); /** Initialize XML parser and parse OPML XML */ HTMLParser parser = new HTMLParser(encodingUtil); try { // The first element is the main tag. int elementType = parser.parse(); // If we found the prologue, get the next entry. if( elementType == XmlParser.PROLOGUE ) { elementType = parser.parse(); } if (elementType == XmlParser.END_DOCUMENT ) { return null; } boolean bodyFound = false; do { if (elementType == HTMLParser.REDIRECT_URL) { RssItunesFeed [] feeds = new RssItunesFeed[1]; feeds[0] = new RssItunesFeed("", parser.getRedirectUrl(), "", ""); return feeds; } /** RSS item properties */ String title = ""; String link = ""; String tagName = parser.getName(); //#ifdef DLOGGING if (finerLoggable) {logger.finer("tagname: " + tagName);} //#endif if (tagName.length() == 0) { continue; } switch (tagName.charAt(0)) { case 'm': case 'M': if (bodyFound) { break; } break; case 'b': case 'B': if (!bodyFound) { bodyFound = parser.isBodyFound(); } break; case 'a': case 'A': //#ifdef DLOGGING if (finerLoggable) {logger.finer("Parsing <a> tag");} //#endif title = parser.getText(); // Title can be 0 as this is used also for // getting title = title.trim(); title = StringUtil.removeHtml( title ); if (((link = parser.getAttributeValue( "href" )) == null) || ( link.length() == 0 )) { continue; } link = link.trim(); if ( link.length() == 0 ) { continue; } if (link.indexOf("://") >= 0) { if (!link.startsWith("http:") && !link.startsWith("https:") && !link.startsWith("file:") && !link.startsWith("jar:")) { //#ifdef DLOGGING if (finerLoggable) {logger.finer("Not support for protocol or no protocol=" + link);} //#endif continue; } } else { if (link.charAt(0) == '/') { int purl = url.indexOf("://"); if ((purl + 4) >= url.length()) { //#ifdef DLOGGING if (finerLoggable) {logger.finer("Url too short=" + url + "," + purl);} //#endif continue; } int pslash = url.indexOf("/", purl + 3); String burl = url; if (pslash >= 0) { burl = url.substring(0, pslash); } link = burl + link; } else { link = url + "/" + link; } } /** Debugging information */ //#ifdef DLOGGING if (finerLoggable) {logger.finer("Title: " + title);} if (finerLoggable) {logger.finer("Link: " + link);} //#endif if (( feedURLFilter != null) && ( link.toLowerCase().indexOf(feedURLFilter) < 0)) { continue; } if (( feedNameFilter != null) && ((title != null) && (title.toLowerCase().indexOf(feedNameFilter) < 0))) { continue; } RssItunesFeed feed = new RssItunesFeed(title, link, "", ""); rssFeeds.addElement( feed ); break; default: } } while( (elementType = parser.parse()) != XmlParser.END_DOCUMENT ); } catch (CauseMemoryException ex) { CauseMemoryException cex = new CauseMemoryException( "Out of memory error while parsing HTML Link feed " + url, ex); throw cex; } catch (Exception ex) { CauseException cex = new CauseException( "Error while parsing HTML Link feed " + url, ex); System.err.println(cex.getMessage() + " " + ex + " " + ex.toString()); ex.printStackTrace(); //#ifdef DLOGGING logger.severe(cex.getMessage(), cex); //#endif throw cex; } catch (Throwable t) { CauseException cex = new CauseException( "Error while parsing HTML Link feed " + url, t); System.err.println(cex.getMessage() + " " + t + " " + t.toString()); t.printStackTrace(); //#ifdef DLOGGING logger.severe(cex.getMessage(), cex); //#endif throw cex; } /** Create array */ RssItunesFeed[] feeds = new RssItunesFeed[ rssFeeds.size() ]; rssFeeds.copyInto(feeds); return feeds; } } //#endif