// DemoHTMLParser.java example
//
// Explorer
// solr-analytics-master
// - lucene
// - solr
package org.apache.lucene.benchmark.byTask.feeds;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import org.cyberneko.html.parsers.SAXParser;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Simple HTML Parser extracting title, meta tags, and body text
 * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
 */
public class DemoHTMLParser implements HTMLParser {
  
  /** The actual parser to read HTML documents */
  public static final class Parser {
    
    /**
     * Meta tags found inside the document head: the key is the lower-cased
     * {@code name} attribute (or {@code http-equiv} when {@code name} is absent),
     * the value is the {@code content} attribute.
     */
    public final Properties metaTags = new Properties();
    /**
     * {@code title} is the trimmed text of the title element; {@code body} is the
     * text collected from the body element. Both are assigned once by the constructor.
     */
    public final String title, body;
    
    /**
     * Parses the HTML document supplied by {@code reader}.
     * The reader is wrapped in an {@link InputSource} and handed to
     * {@link #Parser(InputSource)}, which does the actual work.
     *
     * @param reader character stream containing the HTML document
     * @throws IOException declared by the delegated constructor
     * @throws SAXException declared by the delegated constructor
     */
    public Parser(Reader reader) throws IOException, SAXException {
      this(new InputSource(reader));
    }
    
    /**
     * Parses the HTML document behind {@code source} with NekoHTML and fills
     * {@link #metaTags}, {@link #title} and {@link #body}.
     *
     * <p>A SAX handler tracks whether the parser is currently inside the head,
     * the body, the title, or an element whose content is ignored, and routes
     * character data accordingly.
     *
     * @param source the HTML document to parse
     * @throws IOException propagated from the underlying parser
     * @throws SAXException propagated from the underlying parser, or raised by the
     *         handler when a {@code frameset} element is found outside head and body
     */
    public Parser(InputSource source) throws IOException, SAXException {
      // NekoHTML configuration: namespace processing and tag balancing on,
      // error reporting off, element/attribute names set to "lower".
      final SAXParser neko = new SAXParser();
      neko.setFeature("http://xml.org/sax/features/namespaces", true);
      neko.setFeature("http://cyberneko.org/html/features/balance-tags", true);
      neko.setFeature("http://cyberneko.org/html/features/report-errors", false);
      neko.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      neko.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

      final StringBuilder titleText = new StringBuilder();
      final StringBuilder bodyText = new StringBuilder();

      final DefaultHandler collector = new DefaultHandler() {
        // Nesting counters; a value greater than zero means "currently inside".
        private int headDepth = 0;
        private int bodyDepth = 0;
        private int titleDepth = 0;
        private int suppressDepth = 0;

        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
          // Head context takes precedence over body context.
          if (headDepth > 0) {
            startInsideHead(localName, atts);
            return;
          }
          if (bodyDepth > 0) {
            startInsideBody(localName, atts);
            return;
          }
          // Neither in head nor in body: only structural elements matter.
          if (equalsIgnoreTurkish("body", localName)) {
            bodyDepth++;
          } else if (equalsIgnoreTurkish("head", localName)) {
            headDepth++;
          } else if (equalsIgnoreTurkish("frameset", localName)) {
            throw new SAXException("This parser does not support HTML framesets.");
          }
        }

        /** Handles an element opened inside the head: title tracking and meta tag capture. */
        private void startInsideHead(String localName, Attributes atts) {
          if (equalsIgnoreTurkish("title", localName)) {
            titleDepth++;
            return;
          }
          if (!equalsIgnoreTurkish("meta", localName)) {
            return;
          }
          // Prefer the "name" attribute and fall back to "http-equiv".
          String key = atts.getValue("name");
          if (key == null) {
            key = atts.getValue("http-equiv");
          }
          final String content = atts.getValue("content");
          if (key == null || content == null) {
            return;
          }
          metaTags.setProperty(key.toLowerCase(Locale.ROOT), content);
        }

        /** Handles an element opened inside the body: suppression tracking and image alt text. */
        private void startInsideBody(String localName, Attributes atts) {
          if (SUPPRESS_ELEMENTS.contains(localName)) {
            suppressDepth++;
            return;
          }
          if (equalsIgnoreTurkish("img", localName)) {
            // the original javacc-based parser preserved <IMG alt="..."/>
            // attribute as body text in [] parenthesis:
            final String altText = atts.getValue("alt");
            if (altText != null) {
              bodyText.append('[').append(altText).append(']');
            }
          }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
          if (bodyDepth > 0) {
            if (equalsIgnoreTurkish("body", localName)) {
              bodyDepth--;
            } else if (ENDLINE_ELEMENTS.contains(localName)) {
              // closing a block-level element ends the current line of body text
              bodyText.append('\n');
            } else if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressDepth--;
            }
            return;
          }
          if (headDepth > 0) {
            if (equalsIgnoreTurkish("head", localName)) {
              headDepth--;
            } else if (titleDepth > 0 && equalsIgnoreTurkish("title", localName)) {
              titleDepth--;
            }
          }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
          final boolean visibleBodyText = bodyDepth > 0 && suppressDepth == 0;
          if (visibleBodyText) {
            bodyText.append(ch, start, length);
            return;
          }
          if (titleDepth > 0) {
            titleText.append(ch, start, length);
          }
        }

        @Override
        public InputSource resolveEntity(String publicId, String systemId) {
          // disable network access caused by DTDs
          return new InputSource(new StringReader(""));
        }
      };

      neko.setContentHandler(collector);
      neko.setErrorHandler(collector);
      neko.parse(source);

      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
      this.title = titleText.toString().trim();

      // body text is stored exactly as collected
      this.body = bodyText.toString();
    }
    
    // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
    // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
    
    // BEGIN: workaround
    /**
     * Returns a copy of {@code s} in which every ASCII {@code 'i'} is replaced
     * by the Turkish dotless i (U+0131). All other characters are left untouched.
     *
     * @param s the name to convert; must not be {@code null}
     * @return the converted string
     */
    static final String convertTurkish(String s) {
      // Unicode escape instead of a raw non-ASCII literal, so the compiled
      // constant does not depend on the source encoding passed to javac.
      final char dotlessI = '\u0131';
      return s.replace('i', dotlessI);
    }
    
    /**
     * Compares two strings character by character, treating the Turkish
     * dotless i (U+0131) as equal to ASCII {@code 'i'}. No other folding is
     * applied; in particular the comparison is case-sensitive.
     *
     * @param s1 first string; must not be {@code null}
     * @param s2 second string; must not be {@code null}
     * @return {@code true} if both strings have the same length and match at
     *         every position after mapping dotless i to {@code 'i'}
     */
    static final boolean equalsIgnoreTurkish(String s1, String s2) {
      // Unicode escape instead of a raw non-ASCII literal, so the compiled
      // constant does not depend on the source encoding passed to javac.
      final char dotlessI = '\u0131';
      final int len = s1.length();
      if (len != s2.length()) {
        return false;
      }
      for (int i = 0; i < len; i++) {
        char ch1 = s1.charAt(i);
        char ch2 = s2.charAt(i);
        if (ch1 == dotlessI) {
          ch1 = 'i';
        }
        if (ch2 == dotlessI) {
          ch2 = 'i';
        }
        if (ch1 != ch2) {
          return false;
        }
      }
      return true;
    }
    // END: workaround
    
    /**
     * Builds an unmodifiable set of element names. For every given name the set
     * contains the name itself and the variant produced by
     * {@link #convertTurkish(String)}.
     *
     * @param names element names to include
     * @return an unmodifiable set holding each name and its converted variant
     */
    static final Set<String> createElementNameSet(String... names) {
      final Set<String> elementNames = new HashSet<String>();
      for (int k = 0; k < names.length; k++) {
        final String plain = names[k];
        elementNames.add(plain);
        elementNames.add(convertTurkish(plain));
      }
      return Collections.unmodifiableSet(elementNames);
    }
    
    /**
     * Block-level HTML elements. When the end of one of these elements is
     * reported inside the body, a newline is appended to the body text.
     */
    static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
      "p",
      "h1", "h2", "h3", "h4", "h5", "h6",
      "div", "ul", "ol", "dl", "pre", "hr",
      "blockquote", "address", "fieldset", "table", "form", "noscript",
      "li", "dt", "dd", "noframes", "br", "tr",
      "select", "option"
    );

    /**
     * HTML elements whose character content is left out of the body text
     * while the parser is inside them.
     */
    static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
      "style",
      "script"
    );
  }

  /**
   * Parses the HTML document supplied by {@code reader} into {@code docData}.
   * Delegates to the {@link InputSource} based overload and converts any
   * {@link SAXException} into an {@link IOException} that keeps the original as its cause.
   */
  @Override
  public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
    final InputSource htmlSource = new InputSource(reader);
    try {
      return parse(docData, name, date, htmlSource, trecSrc);
    } catch (SAXException parseFailure) {
      throw new IOException("SAX exception occurred while parsing HTML document.", parseFailure);
    }
  }
  
  /**
   * Parses the HTML document behind {@code source} and stores the result in
   * {@code docData}, which is cleared first and then returned.
   *
   * <p>If the document has a meta tag named {@code date} and
   * {@code trecSrc.parseDate} returns a non-null value for it, that value
   * replaces the {@code date} argument.
   *
   * @param docData the instance to clear and fill
   * @param name the document name to set
   * @param date the date to use unless the document supplies a parsable one
   * @param source the HTML document
   * @param trecSrc used to parse the value of the {@code date} meta tag
   * @return {@code docData}
   * @throws IOException propagated from the parser
   * @throws SAXException propagated from the parser
   */
  public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
    final Parser parsed = new Parser(source);
    final Properties metaTags = parsed.metaTags;

    // A date declared by the document takes precedence over the caller's date.
    Date effectiveDate = date;
    final String metaDate = metaTags.getProperty("date");
    if (metaDate != null) {
      final Date fromMeta = trecSrc.parseDate(metaDate);
      if (fromMeta != null) {
        effectiveDate = fromMeta;
      }
    }

    docData.clear();
    docData.setName(name);
    docData.setBody(parsed.body);
    docData.setTitle(parsed.title);
    docData.setProps(metaTags);
    docData.setDate(effectiveDate);
    return docData;
  }

}