/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package de.l3s.boilerpipe.sax; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import net.htmlparser.jericho.Attributes; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.OutputDocument; import net.htmlparser.jericho.Segment; import net.htmlparser.jericho.Source; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; /** * an Extractor for extracting an article from an document with its basic HTML structure. * * @author manuel.codiga@gmail.com */ public class HtmlArticleExtractor { public static final HtmlArticleExtractor INSTANCE = new HtmlArticleExtractor(); private static final Set<String> NOT_ALLOWED_HTML_TAGS = new HashSet<String>(Arrays.asList( HTMLElementName.HEAD, HTMLElementName.HTML, HTMLElementName.SCRIPT, HTMLElementName.STYLE, HTMLElementName.FORM, HTMLElementName.BODY, HTMLElementName.DIV, HTMLElementName.SPAN) ); private HtmlArticleExtractor() {} /** * Returns the singleton instance * * @return */ public static HtmlArticleExtractor getInstance() { return INSTANCE; } /** * returns the article from an url with its basic html structure. * */ public String process(final BoilerpipeExtractor extractor, final URL url) throws IOException, BoilerpipeProcessingException, SAXException, URISyntaxException { final HTMLDocument htmlDoc = HTMLFetcher.fetch(url); return process(htmlDoc, url.toURI(), extractor); } /** * returns the article from an document with its basic html structure. * * @param HTMLDocument * @param URI the uri from the document for resolving the relative anchors in the document to absolute anchors * @return String */ public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) { final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance(); hh.setOutputHighlightOnly(true); TextDocument doc; String text = ""; try { doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument(); extractor.process(doc); final InputSource is = htmlDoc.toInputSource(); text = hh.process(doc, is); } catch (Exception ex) { return null; } return removeNotAllowedTags(text, docUri); } /** * Serduszko dla Bartka od Kasi <3 * @param htmlFragment * @param docUri * @return */ private String removeNotAllowedTags(String htmlFragment, URI docUri) { Source source = new Source(htmlFragment); OutputDocument outputDocument = new OutputDocument(source); List<Element> elements = source.getAllElements(); for (Element element : elements) { Attributes attrs = element.getAttributes(); Map<String, String> attrsUpdate = outputDocument.replace(attrs, true); if (!element.getName().contains("a")) { attrsUpdate.clear(); } else { if (attrsUpdate.get("href")!=null) { String link = attrsUpdate.get("href"); if (!link.contains("http")) { URI documentUri = docUri; URI anchorUri; try { anchorUri = new URI(link); URI result = documentUri.resolve(anchorUri); attrsUpdate.put("href", result.toString()); } catch (URISyntaxException e) { outputDocument.remove(element); } } } } if (NOT_ALLOWED_HTML_TAGS.contains(element.getName())) { Segment content = element.getContent(); if (element.getName() == "script" || element.getName() == "style" || element.getName() == "form") { outputDocument.remove(content); } outputDocument.remove(element.getStartTag()); if (!element.getStartTag().isSyntacticalEmptyElementTag()) { outputDocument.remove(element.getEndTag()); } } } String out = outputDocument.toString(); out = out.replaceAll("\\n", ""); out = out.replaceAll("\\t", ""); return out; } }