/** * Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.sax; import java.io.IOException; import java.io.StringReader; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; import java.util.List; import java.util.Map; import mf.org.apache.xerces.parsers.AbstractSAXParser; import org.cyberneko.html.HTMLConfiguration; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.Image; import de.l3s.boilerpipe.document.Media; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.document.VimeoVideo; import de.l3s.boilerpipe.document.YoutubeVideo; import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; import de.l3s.boilerpipe.sax.HTMLDocument; import de.l3s.boilerpipe.sax.HTMLFetcher; /** * Extracts youtube and vimeo videos that are enclosed by extracted content. * * @author Christian Kohlschütter, manuel.codiga@gmail.com */ public final class MediaExtractor { /** */ public static final MediaExtractor INSTANCE = new MediaExtractor(); /** * @return the singleton instance of {@link de.l3s.boilerpipe.sax.MediaExtractor}. */ public static MediaExtractor getInstance() { return INSTANCE; } /** * Processes the given {@link TextDocument} and the original HTML text (as a String). * * @param doc The processed {@link TextDocument}. * @param origHTML The original HTML document. * @return A List of enclosed {@link Image}s * @throws BoilerpipeProcessingException if an error during extraction occure */ public List<Media> process(final TextDocument doc, final String origHTML) throws BoilerpipeProcessingException { return process(doc, new InputSource(new StringReader(origHTML))); } /** * Processes the given {@link TextDocument} and the original HTML text (as an {@link org.xml.sax.InputSource}). * * @param doc The processed {@link TextDocument}. The original HTML document. * @return A List of enclosed {@link Image}s * @throws BoilerpipeProcessingException */ public List<Media> process(final TextDocument doc, final InputSource is) throws BoilerpipeProcessingException { final Implementation implementation = new Implementation(); implementation.process(doc, is); return implementation.linksHighlight; } /** * Fetches the given {@link java.net.URL} using {@link de.l3s.boilerpipe.sax.HTMLFetcher} and processes the retrieved HTML using the specified * {@link BoilerpipeExtractor}. * * @param url the url of the document to fetch * @param extractor extractor to use * * @return A List of enclosed {@link Image}s * @throws java.io.IOException * @throws BoilerpipeProcessingException * @throws org.xml.sax.SAXException */ @SuppressWarnings("javadoc") public List<Media> process(final URL url, final BoilerpipeExtractor extractor) throws IOException, BoilerpipeProcessingException, SAXException { final HTMLDocument htmlDoc = HTMLFetcher.fetch(url); final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument(); extractor.process(doc); final InputSource is = htmlDoc.toInputSource(); return process(doc, is); } /** * parses the media (picture, video) out of doc * * @param doc document to parse the media out * @param extractor extractor to use * @return list of extracted media, with size = 0 if no media found */ public List<Media> process(String doc, final BoilerpipeExtractor extractor) { final HTMLDocument htmlDoc = new HTMLDocument(doc); List<Media> media = new ArrayList<Media>(); TextDocument tdoc; try { tdoc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument(); extractor.process(tdoc); final InputSource is = htmlDoc.toInputSource(); media = process(tdoc, is); } catch (Exception e) { return null; } return media; } private final class Implementation extends AbstractSAXParser implements ContentHandler { List<Media> linksHighlight = new ArrayList<Media>(); private List<Media> linksBuffer = new ArrayList<Media>(); private int inIgnorableElement = 0; private int characterElementIdx = 0; private final BitSet contentBitSet = new BitSet(); private boolean inHighlight = false; Implementation() { super(new HTMLConfiguration()); setContentHandler(this); } void process(final TextDocument doc, final InputSource is) throws BoilerpipeProcessingException { for (TextBlock block : doc.getTextBlocks()) { if (block.isContent()) { final BitSet bs = block.getContainedTextElements(); if (bs != null) { contentBitSet.or(bs); } } } try { parse(is); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } catch (IOException e) { throw new BoilerpipeProcessingException(e); } } public void endDocument() throws SAXException { } public void endPrefixMapping(String prefix) throws SAXException { } public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(Locator locator) { } public void skippedEntity(String name) throws SAXException { } public void startDocument() throws SAXException { } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { TagAction ta = TAG_ACTIONS.get(localName); if (ta != null) { ta.beforeStart(this, localName); } try { if (inIgnorableElement == 0) { if (inHighlight && "IFRAME".equalsIgnoreCase(localName)) { String src = atts.getValue("src"); if (src != null) { src = src.replaceAll("\\\\\"", ""); } if (src != null && src.length() > 0 && src.contains("youtube.com/embed/")) { String originUrl = null; if (!src.startsWith("http:")) { src = "http:" + src; } try { URL url = new URL(src); String path = url.getPath(); String[] pathParts = path.split("/"); originUrl = "http://www.youtube.com/watch?v=" + pathParts[pathParts.length - 1]; linksBuffer.add(new YoutubeVideo(originUrl, src)); } catch (MalformedURLException e) { } } if (src != null && src.length() > 0 && src.contains("player.vimeo.com")) { String originUrl = null; if (!src.startsWith("http:")) { src = "http:" + src; } try { URL url = new URL(src); String path = url.getPath(); String[] pathParts = path.split("/"); originUrl = "http://vimeo.com/" + pathParts[pathParts.length - 1]; linksBuffer.add(new VimeoVideo(originUrl, src)); } catch (MalformedURLException e) { } } } if (inHighlight && "IMG".equalsIgnoreCase(localName)) { String src = atts.getValue("src"); try { URI image = new URI(src); if (src != null && src.length() > 0) { linksBuffer.add(new Image(src, atts.getValue("width"), atts.getValue("height"), atts .getValue("alt"))); } } catch (URISyntaxException e) { } } } } finally { if (ta != null) { ta.afterStart(this, localName); } } } public void endElement(String uri, String localName, String qName) throws SAXException { TagAction ta = TAG_ACTIONS.get(localName); if (ta != null) { ta.beforeEnd(this, localName); } try { if (inIgnorableElement == 0) { // } } finally { if (ta != null) { ta.afterEnd(this, localName); } } } public void characters(char[] ch, int start, int length) throws SAXException { characterElementIdx++; if (inIgnorableElement == 0) { boolean highlight = contentBitSet.get(characterElementIdx); if (!highlight) { if (length == 0) { return; } boolean justWhitespace = true; for (int i = start; i < start + length; i++) { if (!Character.isWhitespace(ch[i])) { justWhitespace = false; break; } } if (justWhitespace) { return; } } inHighlight = highlight; if (inHighlight) { linksHighlight.addAll(linksBuffer); linksBuffer.clear(); } } } public void startPrefixMapping(String prefix, String uri) throws SAXException { } } @SuppressWarnings("synthetic-access") private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() { @Override void beforeStart(final Implementation instance, final String localName) { instance.inIgnorableElement++; } @Override void afterEnd(final Implementation instance, final String localName) { instance.inIgnorableElement--; } }; private static Map<String, TagAction> TAG_ACTIONS = new HashMap<String, TagAction>(); static { TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT); } private abstract static class TagAction { void beforeStart(final Implementation instance, final String localName) { } void afterStart(final Implementation instance, final String localName) { } void beforeEnd(final Implementation instance, final String localName) { } void afterEnd(final Implementation instance, final String localName) { } } }