package org.apache.lucene.benchmark.byTask.feeds; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Collections; import java.util.Date; import java.util.HashSet; import java.util.Locale; import java.util.Properties; import java.util.Set; import org.cyberneko.html.parsers.SAXParser; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** * Simple HTML Parser extracting title, meta tags, and body text * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>. */ public class DemoHTMLParser implements HTMLParser { /** The actual parser to read HTML documents */ public static final class Parser { public final Properties metaTags = new Properties(); public final String title, body; public Parser(Reader reader) throws IOException, SAXException { this(new InputSource(reader)); } public Parser(InputSource source) throws IOException, SAXException { final SAXParser parser = new SAXParser(); parser.setFeature("http://xml.org/sax/features/namespaces", true); parser.setFeature("http://cyberneko.org/html/features/balance-tags", true); parser.setFeature("http://cyberneko.org/html/features/report-errors", false); parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); final StringBuilder title = new StringBuilder(), body = new StringBuilder(); final DefaultHandler handler = new DefaultHandler() { private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0; @Override public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if (inHEAD > 0) { if (equalsIgnoreTurkish("title", localName)) { inTITLE++; } else { if (equalsIgnoreTurkish("meta", localName)) { String name = atts.getValue("name"); if (name == null) { name = atts.getValue("http-equiv"); } final String val = atts.getValue("content"); if (name != null && val != null) { metaTags.setProperty(name.toLowerCase(Locale.ROOT), val); } } } } else if (inBODY > 0) { if (SUPPRESS_ELEMENTS.contains(localName)) { suppressed++; } else if (equalsIgnoreTurkish("img", localName)) { // the original javacc-based parser preserved <IMG alt="..."/> // attribute as body text in [] parenthesis: final String alt = atts.getValue("alt"); if (alt != null) { body.append('[').append(alt).append(']'); } } } else if (equalsIgnoreTurkish("body", localName)) { inBODY++; } else if (equalsIgnoreTurkish("head", localName)) { inHEAD++; } else if (equalsIgnoreTurkish("frameset", localName)) { throw new SAXException("This parser does not support HTML framesets."); } } @Override public void endElement(String namespaceURI, String localName, String qName) throws SAXException { if (inBODY > 0) { if (equalsIgnoreTurkish("body", localName)) { inBODY--; } else if (ENDLINE_ELEMENTS.contains(localName)) { body.append('\n'); } else if (SUPPRESS_ELEMENTS.contains(localName)) { suppressed--; } } else if (inHEAD > 0) { if (equalsIgnoreTurkish("head", localName)) { inHEAD--; } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) { inTITLE--; } } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (inBODY > 0 && suppressed == 0) { body.append(ch, start, length); } else if (inTITLE > 0) { title.append(ch, start, length); } } @Override public InputSource resolveEntity(String publicId, String systemId) { // disable network access caused by DTDs return new InputSource(new StringReader("")); } }; parser.setContentHandler(handler); parser.setErrorHandler(handler); parser.parse(source); // the javacc-based parser trimmed title (which should be done for HTML in all cases): this.title = title.toString().trim(); // assign body text this.body = body.toString(); } // TODO: remove the Turkish workaround once this is fixed in NekoHTML: // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178 // BEGIN: workaround static final String convertTurkish(String s) { return s.replace('i', 'ı'); } static final boolean equalsIgnoreTurkish(String s1, String s2) { final int len1 = s1.length(), len2 = s2.length(); if (len1 != len2) return false; for (int i = 0; i < len1; i++) { char ch1 = s1.charAt(i), ch2 = s2.charAt(i); if (ch1 == 'ı') ch1 = 'i'; if (ch2 == 'ı') ch2 = 'i'; if (ch1 != ch2) return false; } return true; } // END: workaround static final Set<String> createElementNameSet(String... names) { final HashSet<String> set = new HashSet<String>(); for (final String name : names) { set.add(name); set.add(convertTurkish(name)); } return Collections.unmodifiableSet(set); } /** HTML elements that cause a line break (they are block-elements) */ static final Set<String> ENDLINE_ELEMENTS = createElementNameSet( "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre", "hr", "blockquote", "address", "fieldset", "table", "form", "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option" ); /** HTML elements with contents that are ignored */ static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet( "style", "script" ); } @Override public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException { try { return parse(docData, name, date, new InputSource(reader), trecSrc); } catch (SAXException saxe) { throw new IOException("SAX exception occurred while parsing HTML document.", saxe); } } public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException { final Parser p = new Parser(source); // properties final Properties props = p.metaTags; String dateStr = props.getProperty("date"); if (dateStr != null) { final Date newDate = trecSrc.parseDate(dateStr); if (newDate != null) { date = newDate; } } docData.clear(); docData.setName(name); docData.setBody(p.body); docData.setTitle(p.title); docData.setProps(props); docData.setDate(date); return docData; } }