/* * #! * Ontopia Classify * #- * Copyright (C) 2001 - 2013 The Ontopia Project * #- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * !# */ package net.ontopia.topicmaps.classify; import java.util.Arrays; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; /** * INTERNAL: */ public class HTMLFormatModule extends XMLFormatModule { protected byte[][] magicBytes = FormatModule.getBytes(new String[] {"<HTML", "<html", "<!DOCTYPE html ", "<!DOCTYPE HTML ", "<!DOCTYPE HTML ", "<!doctype html "}); public HTMLFormatModule() { this.extensions = new String[] {".htm", ".html", ".xhtml", ".shtml"}; setSkipElements(Arrays.asList(new String[] {"style", "STYLE", "pre", "PRE", "script", "SCRIPT"})); } public boolean matchesContent(ClassifiableContentIF cc) { return FormatModule.startsWithSkipWhitespace(cc.getContent(), magicBytes); } protected XMLReader createXMLReader() throws SAXException { return new org.ccil.cowan.tagsoup.Parser(); } protected ContentHandler getContentHandler(TextHandlerIF handler) { return new HTMLHandler(handler); } private class HTMLHandler extends DefaultHandler { private TextHandlerIF thandler; private int skipLevel; private HTMLHandler(TextHandlerIF thandler) { this.thandler = thandler; } public void startElement(String nsuri, String lname, String qname, Attributes attrs) throws SAXException { if (skipElements != null && skipElements.contains(lname)) { skipLevel++; } else if (skipLevel == 0) { thandler.startRegion(lname); } } public void characters (char[] ch, int start, int length) { if (skipLevel == 0) thandler.text(ch, start, length); } public void endElement(String nsuri, String lname, String qname) throws SAXException { if (skipElements != null && skipElements.contains(lname)) { skipLevel--; } else if (skipLevel == 0) { thandler.endRegion(); } } } }