/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.metaxa.core.html; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import javax.xml.transform.TransformerFactory; import org.ontoware.aifbcommons.collection.ClosableIterator; import org.ontoware.rdf2go.model.Model; import org.ontoware.rdf2go.model.Statement; import org.ontoware.rdf2go.model.Syntax; import org.ontoware.rdf2go.model.node.URI; import org.ontoware.rdf2go.model.node.Variable; import org.ontoware.rdf2go.util.RDFTool; import org.semanticdesktop.aperture.extractor.ExtractorException; import org.semanticdesktop.aperture.rdf.RDFContainer; import org.semanticdesktop.aperture.vocabulary.NCO; import org.semanticdesktop.aperture.vocabulary.NIE; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; /** * Utility class that provides core HTML text and metadata extraction independent of the configuration of Metaxa's main HTML extractor * * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a> * */ public class HtmlTextExtractUtil { private static final Logger LOG = LoggerFactory.getLogger(HtmlTextExtractUtil.class); private static HtmlParser htmlParser = new HtmlParser(); private static XsltExtractor htmlExtractor; public HtmlTextExtractUtil() throws InitializationException { if (HtmlTextExtractUtil.htmlExtractor == null) { TransformerFactory transFac = TransformerFactory.newInstance(); transFac.setURIResolver(new BundleURIResolver()); HtmlTextExtractUtil.htmlExtractor = new XsltExtractor("any", "xslt/htmlmetadata.xsl", transFac); HtmlTextExtractUtil.htmlExtractor.setSyntax(Syntax.RdfXml); } } public String getTitle(Model meta) { Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NIE.title, Variable.ANY); if (stmt != null) { return stmt.getObject().toString(); } return null; } public String getAuthor(Model meta) { Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NCO.creator, Variable.ANY); if (stmt != null) { stmt = RDFTool.findStatement(meta, stmt.getSubject(), NCO.fullname, Variable.ANY); if (stmt != null) { return stmt.getObject().toString(); } } return null; } public String getDescription(Model meta) { Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NIE.description, Variable.ANY); if (stmt != null) { return stmt.getObject().toString(); } return null; } public List<String> getKeywords(Model meta) { List<String> kws = new ArrayList<String>(); ClosableIterator<Statement> it = meta.findStatements(Variable.ANY, NIE.keyword, Variable.ANY); while (it.hasNext()) { kws.add(it.next().getObject().toString()); } it.close(); return kws; } public String getText(Model meta) { Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NIE.plainTextContent, Variable.ANY); if (stmt != null) { return stmt.getObject().toString(); } return null; } public void extract(URI id, String charset, InputStream input, RDFContainer result) throws ExtractorException { String encoding = charset; if (charset == null) { try { encoding = CharsetRecognizer.detect(input, "html", null); } catch (IOException e) { LOG.error("Charset detection problem: " + e.getMessage()); throw new ExtractorException("Charset detection problem: " + e.getMessage()); } } Document doc = htmlParser.getDOM(input, encoding); htmlExtractor.extract(id.toString(), doc, null, result); } }