package com.idega.block.websearch.business; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import javax.swing.text.html.parser.ParserDelegator; import javax.swing.text.html.HTMLEditorKit.ParserCallback; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML.Tag; import java.text.ParseException; import java.text.SimpleDateFormat; import javax.swing.text.html.HTML; /** * <p><code>HTMLHandler</code> * Content handler for HTML documents.</p> * This class is a part of the websearch webcrawler and search engine block. <br> * It is based on the <a href="http://lucene.apache.org">Lucene</a> java search engine from the Apache group and loosly <br> * from the work of David Duddleston of i2a.com.<br> * * @copyright Idega Software 2002 * @author <a href="mailto:eiki@idega.is">Eirikur Hrafnsson</a> */ public final class HTMLHandler extends ParserCallback implements ContentHandler{ // Content private String title; private String description; private String keywords; private String categories; private long published; private String href; private String author; private StringBuffer contents; private ArrayList links; // Robot Instructions private boolean robotIndex; private boolean robotFollow; private static final char space = ' '; private char state; private static final char NONE = 0; private static final char TITLE = 1; private static final char HREF = 2; private static final char SCRIPT = 3; private SimpleDateFormat dateFormatter; private static ParserDelegator pd = new ParserDelegator(); /** * Constructor - initializes variables */ public HTMLHandler() { this.contents = new StringBuffer(); this.links = new ArrayList(); this.published = -1; // 1996.07.10 15:08:56 PST this.dateFormatter = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss z"); } /** * Parse Content. [24] 320:1 */ public String getAuthor() { return this.author; } /** * Return categories (from META tags) */ public String getCategories() { return this.categories; } /** * Return contents */ public String getContents() { return this.contents.toString(); } /** * Return description (from META tags) */ public String getDescription() { return this.description; } /** * Return META HREF */ public String getHREF() { return this.href; } /** * Return keywords (from META tags) */ public String getKeywords() { return this.keywords; } /** * Return links */ public List getLinks() { return this.links; } /** * Return published date (from META tag) */ public long getPublished() { return this.published; } /** * Return boolean true if links are to be followed */ public boolean getRobotFollow() { return this.robotFollow; } /** * Return boolean true if this is to be indexed */ public boolean getRobotIndex() { return this.robotIndex; } /** * Return page title */ public String getTitle() { return this.title; } /** * Handle Anchor <A HREF="~"> tags */ public void handleAnchor(MutableAttributeSet attribs) { String href = new String(); href = (String) attribs.getAttribute(HTML.Attribute.HREF); if (href == null) { return; } this.links.add(href); this.state = HREF; } /** * Closing tag */ public void handleEndTag(Tag tag, int pos) { if (this.state == NONE) { return; } // In order of precedence == > && > || if (this.state == TITLE && tag.equals(HTML.Tag.TITLE)) { this.state = NONE; return; } if (this.state == HREF && tag.equals(HTML.Tag.A)) { //links.add(linktext); this.state = NONE; return; } if (this.state == SCRIPT && tag.equals(HTML.Tag.SCRIPT)) { this.state = NONE; return; } } /** * Handle META tags */ public void handleMeta(MutableAttributeSet attribs) { String name = new String(); String content = new String(); name = (String) attribs.getAttribute(HTML.Attribute.NAME); content = (String) attribs.getAttribute(HTML.Attribute.CONTENT); if (name == null || content == null) { return; } name = name.toUpperCase(); if (name.equals("DESCRIPTION")) { this.description = content; return; } if (name.equals("KEYWORDS")) { this.keywords = content; return; } if (name.equals("CATEGORIES")) { this.categories = content; return; } if (name.equals("PUBLISHED")) { try { this.published = this.dateFormatter.parse(content).getTime(); } catch(ParseException e) {e.printStackTrace();} return; } if (name.equals("HREF")) { this.href = content; return; } if (name.equals("AUTHOR")) { this.author = content; return; } if (name.equals("ROBOTS")) { if (content.indexOf("noindex") != -1) { this.robotIndex = false; } if (content.indexOf("nofollow") != -1) { this.robotFollow = false; } this.author = content; return; } } /** * Handle standalone tags */ public void handleSimpleTag(Tag tag, MutableAttributeSet attribs, int pos) { if (tag.equals(HTML.Tag.META)) { handleMeta(attribs); } } /** * Opening tag */ public void handleStartTag(Tag tag, MutableAttributeSet attribs, int pos) { if (tag.equals(HTML.Tag.TITLE)) { this.state = TITLE; } else if (tag.equals(HTML.Tag.A)) { handleAnchor(attribs); } else if (tag.equals(HTML.Tag.SCRIPT)) { this.state = SCRIPT; } } /** * Handle page text */ public void handleText(char[] text, int pos) { switch (this.state) { case NONE : this.contents.append(text); this.contents.append(space); break; case TITLE : this.title = new String(text); break; case HREF : this.contents.append(text); this.contents.append(space); //linktext = new String(text); break; } } /** * Parse Content. */ public void parse(InputStream in) { try { reset(); pd.parse(new BufferedReader(new InputStreamReader(in)), this, true); //System.out.println("Title: " + getTitle()); //System.out.println("Author: " + getAuthor()); //System.out.println("Published " + getPublished()); //System.out.println("Keywords: " + getKeywords()); //System.out.println("Description: " + getDescription()); //System.out.println("Content: " + getContents()); } catch (Exception e) {e.printStackTrace();} } /** * Return contents */ private void reset() { this.title = null; this.description = null; this.keywords = null; this.categories = null; this.href = null; this.author = null; this.contents.setLength(0); this.links = new ArrayList(); this.published = -1; // Robot Instructions this.robotIndex = true; this.robotFollow = true; this.state = NONE; } }