/* * HTMLParser.java * * Copyright (C) 2007-2008 Tommi Laukkanen * Copyright (C) 2007-2008 Irving Bunton * http://www.substanceofcode.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ // Expand to define memory size define //#define DREGULARMEM // Expand to define logging define //#define DNOLOGGING /* This functionality adds to jar size, so don't do it for small memory */ /* devices. */ //#ifndef DSMALLMEM package com.substanceofcode.utils; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.Hashtable; import com.substanceofcode.utils.CauseException; import com.substanceofcode.utils.CauseMemoryException; //#ifdef DLOGGING import net.sf.jlogmicro.util.logging.Logger; import net.sf.jlogmicro.util.logging.Level; //#endif /** * Simple and lightweight HTML parser without complete error handling. * * @author Irving Bunton */ public class HTMLParser extends XmlParser { private boolean m_headerFound = false; private boolean m_metaFound = false; private boolean m_bodyFound = false; //#ifdef DLOGGING private Logger logger = Logger.getLogger("HTMLParser"); private boolean fineLoggable = logger.isLoggable(Level.FINE); private boolean finerLoggable = logger.isLoggable(Level.FINER); private boolean finestLoggable = logger.isLoggable(Level.FINEST); //#endif private String m_redirectUrl = ""; /** Enumerations for parse function */ public static final int REDIRECT_URL = 3; /** Creates a new instance of XmlParser */ public HTMLParser(InputStream inputStream) { super(inputStream); m_defEncoding = "ISO-8859-1"; } /** Creates a new instance of XmlParser */ public HTMLParser(EncodingUtil encodingUtil) { super(encodingUtil); m_defEncoding = "ISO-8859-1"; } /** Parse next element */ protected int parseStream(InputStreamReader is) throws IOException, CauseException { int elementType = super.parseStream(is); if (elementType != XmlParser.ELEMENT) { return elementType; } if (m_bodyFound) { return elementType; } else if (m_headerFound) { String elementName = super.getName(); switch (elementName.charAt(0)) { case 'b': case 'B': m_bodyFound = elementName.toLowerCase().equals("body"); // Default HTML to iso-8859-1 if (m_bodyFound) { if (!m_encoding_set) { //#ifdef DLOGGING if (finerLoggable) {logger.finer("Body found without encoding set.");} //#endif m_encodingUtil.getEncoding(m_fileEncoding, EncodingUtil.getIsoEncoding()); m_docEncoding = m_encodingUtil.getDocEncoding(); m_encoding_set = true; } //#ifdef DLOGGING if (finerLoggable) {logger.finer("Body found m_docEncoding,m_fileEncoding=" + m_docEncoding + "," + m_fileEncoding);} //#endif } break; case 'm': case 'M': m_metaFound = elementName.toLowerCase().equals("meta"); if (m_metaFound) { //#ifdef DLOGGING if (finerLoggable) {logger.finer("Parsing <meta> tag");} //#endif String httpEquiv; if (((httpEquiv = getAttributeValue( "http-equiv" )) == null) || ( httpEquiv.length() == 0 )) { break; } String content; if (((content = getAttributeValue( "content" )) == null) || ( content.length() == 0 )) { break; } int pcharset = content.toLowerCase().indexOf( "charset="); if (pcharset >= 0) { String encoding = content.substring(pcharset + 8); //#ifdef DLOGGING if (finerLoggable) {logger.finer("encoding=" + encoding);} //#endif m_encodingUtil.getEncoding(m_fileEncoding, encoding); m_docEncoding = m_encodingUtil.getDocEncoding(); m_encoding_set = true; } else { int purl = content.toLowerCase().indexOf("url="); if (purl < 0) { break; } String url = content.substring(purl + 4); if (url.length() > 0) { m_redirectUrl = url; //#ifdef DLOGGING if (finerLoggable) {logger.finer("m_redirectUrl=" + m_redirectUrl);} //#endif return REDIRECT_URL; } } } break; default: } } else if (!m_headerFound) { String elementName = super.getName(); switch (elementName.charAt(0)) { case 'h': case 'H': m_headerFound = elementName.toLowerCase().equals("head"); //#ifdef DLOGGING if (finerLoggable && m_headerFound) {logger.finer("m_headerFound=" + m_headerFound);} //#endif break; default: } } return elementType; } /** Parse next element */ public int parse() throws IOException, CauseException { if (m_encodingStreamReader.isModEncoding()) { return parseStream(m_encodingStreamReader); } else { return parseStream(m_inputStream); } } /** Get element text including inner xml */ private String getTextStream(InputStreamReader is) throws IOException, CauseMemoryException, CauseException { if(!m_currentElementContainsText) { return ""; } boolean endParsing = false; String text = ""; try { StringBuffer textBuffer = new StringBuffer(); int inputCharacter; char c; char lastChars[] = {' ', ' ', ' '}; char elementNameChars[] = new char[3]; // Handle length < 3 using min. int elen = m_currentElementName.length(); switch (elen) { case 1: elementNameChars[0] = m_currentElementName.charAt( 0 ); elementNameChars[1] = '>'; break; case 2: elementNameChars[0] = m_currentElementName.charAt( 0 ); elementNameChars[1] = m_currentElementName.charAt( 1 ); elementNameChars[2] = '>'; break; default: m_currentElementName.toString().getChars(elen - 3, 3, elementNameChars, 0); } String endCurrentElement = m_currentElementName.insert(0, "</").toString(); while (((inputCharacter = is.read()) != -1) && !endParsing) { c = (char)inputCharacter; lastChars[0] = lastChars[1]; lastChars[1] = lastChars[2]; lastChars[2] = c; //System.out.print(c); textBuffer.append(c); if( lastChars[0] == elementNameChars[0] && lastChars[1] == elementNameChars[1]) { switch (elen) { case 1: int tlen1 = textBuffer.length(); textBuffer.delete(tlen1 - 2, tlen1); endParsing = true; break; case 2: if (lastChars[2] == '>') { endParsing = true; int tlen2 = textBuffer.length(); textBuffer.delete(tlen2 - 1, tlen2); break; } default: if (lastChars[2] == elementNameChars[2]) { if( textBuffer.toString().endsWith(endCurrentElement)) { endParsing = true; } } } } } if (m_docEncoding.length() == 0) { text = textBuffer.toString(); } else { try { // We read the bytes in as ISO8859_1, so we must get them // out as that and then encode as they should be. if (m_fileEncoding.length() == 0) { text = new String(textBuffer.toString().getBytes(), m_docEncoding); } else { text = new String(textBuffer.toString().getBytes( m_fileEncoding), m_docEncoding); } } catch (IOException e) { //#ifdef DLOGGING logger.severe("getTextStream Could not convert string from,to" + m_fileEncoding + "," + m_docEncoding, e); //#endif System.out.println("getTextStream Could not convert string " + "from,to=" + m_fileEncoding + "," + m_docEncoding + " " + e + " " + e.getMessage()); e.printStackTrace(); text = textBuffer.toString(); } } textBuffer = null; text = StringUtil.replace(text, endCurrentElement, ""); /** Handle some entities and encoded characters */ text = StringUtil.replace(text, "<![CDATA[", ""); text = StringUtil.replace(text, "]]>", ""); text = EncodingUtil.replaceAlphaEntities(true, text); // No need to convert from UTF-8 to Unicode using replace // umlauts now because it is done with new String...,encoding. // Replace numeric entities including ’, ‘ // “, and ” text = m_encodingUtil.replaceNumEntity(text); // Replace special chars like left quote, etc. text = m_encodingUtil.replaceSpChars(text); } catch (OutOfMemoryError t) { CauseMemoryException ce = new CauseMemoryException( "Unable to read text. Out of memory.", t); //#ifdef DLOGGING logger.severe(ce.getMessage(), ce); //#endif System.out.println("getTextStream Could not read a char run time." + t + " " + t.getMessage()); t.printStackTrace(); throw ce; } catch (Throwable t) { CauseException ce = new CauseException("Unable to read text. " + "Internal error.", t); //#ifdef DLOGGING logger.severe(ce.getMessage(), t); //#endif System.out.println(ce.getMessage() + " " + t + " " + t.getMessage()); t.printStackTrace(); if (m_acceptErrors) { return null; } else { throw ce; } } //#ifdef DLOGGING if (finerLoggable) {logger.finer("text=" + text);} //#endif return text; } /** Get element text including inner xml */ public String getText() throws IOException, CauseException { if (m_encodingStreamReader.isModEncoding()) { return getTextStream(m_encodingStreamReader); } else { return getTextStream(m_inputStream); } } /** * Get attribute value from current element */ public String getAttributeValue(String attributeName) throws IOException, CauseMemoryException, CauseException { try { /** Check whatever the element contains given attribute */ String ccurrentElementData = EncodingUtil.replaceSpChars( EncodingUtil.replaceSpChars( m_currentElementData.toString(), true, false), false, false); int attributeStartIndex = ccurrentElementData.toLowerCase().indexOf( " " + attributeName.toLowerCase()); if( attributeStartIndex<0 ) { return null; } /** Calculate actual value start index */ int valueStartIndex = attributeStartIndex + attributeName.length() + 1; String attribData = ccurrentElementData.substring( valueStartIndex).trim(); if (attribData.length() == 0) { return null; } String quote = null; if (attribData.charAt(0) == '=') { attribData = attribData.substring(1).trim(); if (attribData.length() == 0) { return null; } } switch (attribData.charAt(0)) { case '\"': attribData = attribData.substring(1); if (attribData.length() == 0) { return null; } quote = "\""; break; case EncodingUtil.CLEFT_SGL_QUOTE: attribData = attribData.substring(1); quote = EncodingUtil.RIGHT_SGL_QUOTE; if (attribData.length() == 0) { return null; } break; case EncodingUtil.CWLEFT_SGL_QUOTE: attribData = attribData.substring(1); if (attribData.length() == 0) { return null; } quote = EncodingUtil.WRIGHT_SGL_QUOTE; break; default: } /** Check the attribute value end index */ int valueEndIndex; if (quote != null) { valueEndIndex = attribData.indexOf(quote); } else { attribData = attribData.trim(); valueEndIndex = attribData.indexOf(' '); int lpos = attribData.indexOf('>'); if (lpos > 0) { if (valueEndIndex > 0) { valueEndIndex = Math.min(lpos, valueEndIndex); } else { valueEndIndex = lpos; } } if( valueEndIndex<0 ) { valueEndIndex = attribData.length(); } } if( valueEndIndex<=0 ) { return null; } /** Parse value */ String value = attribData.substring(0, valueEndIndex); if (m_docEncoding.length() != 0) { // We read the bytes in as ISO8859_1, so we must get them // out as that and then encode as they should be. if (m_fileEncoding.length() == 0) { value = new String(value.getBytes(), m_docEncoding); } else { value = new String(value.getBytes( m_fileEncoding), m_docEncoding); } } return value; } catch (Throwable t) { CauseException ce = new CauseException( "Parse attribute read error. Internal error.", t); //#ifdef DLOGGING logger.severe(ce.getMessage(), ce); //#endif System.out.println(ce.getMessage() + " " + t + " " + t.getMessage()); if (m_acceptErrors) { return null; } else { throw ce; } } } public void setMetaFound(boolean metaFound) { this.m_metaFound = metaFound; } public boolean isMetaFound() { return (m_metaFound); } public void setBodyFound(boolean bodyFound) { this.m_bodyFound = bodyFound; } public boolean isBodyFound() { return (m_bodyFound); } public void setRedirectUrl(String redirectUrl) { this.m_redirectUrl = redirectUrl; } public String getRedirectUrl() { return (m_redirectUrl); } } //#endif