/** * Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.net.URL; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; import de.l3s.boilerpipe.sax.HTMLFetcher; /** * The base class of Extractors. Also provides some helper methods to quickly * retrieve the text that remained after processing. * * @author Christian Kohlschütter */ public abstract class ExtractorBase implements BoilerpipeExtractor { /** * Extracts text from the HTML code given as a String. * * @param html The HTML code as a String. * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final String html) throws BoilerpipeProcessingException { try { return getText(new BoilerpipeSAXInput(new InputSource( new StringReader(html))).getTextDocument()); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } } /** * Extracts text from the HTML code available from the given {@link org.xml.sax.InputSource}. * * @param is The InputSource containing the HTML * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final InputSource is) throws BoilerpipeProcessingException { try { return getText(new BoilerpipeSAXInput(is).getTextDocument()); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } } /** * Extracts text from the HTML code available from the given {@link java.net.URL}. * NOTE: This method is mainly to be used for show case purposes. If you are * going to crawl the Web, consider using {@link #getText(org.xml.sax.InputSource)} * instead. * * @param url The URL pointing to the HTML code. * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final URL url) throws BoilerpipeProcessingException { try { return getText(HTMLFetcher.fetch(url).toInputSource()); } catch (IOException e) { throw new BoilerpipeProcessingException(e); } } /** * Extracts text from the HTML code available from the given {@link java.io.Reader}. * * @param r The Reader containing the HTML * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final Reader r) throws BoilerpipeProcessingException { return getText(new InputSource(r)); } /** * Extracts text from the given {@link de.l3s.boilerpipe.document.TextDocument} object. * * @param doc The {@link de.l3s.boilerpipe.document.TextDocument}. * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(TextDocument doc) throws BoilerpipeProcessingException { process(doc); return doc.getContent(); } }