/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.htmlextractor; import java.io.IOException; import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.Dictionary; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.clerezza.commons.rdf.Graph; import org.apache.clerezza.commons.rdf.IRI; import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph; import org.apache.felix.scr.annotations.Component; import org.apache.felix.scr.annotations.Property; import org.apache.felix.scr.annotations.Reference; import org.apache.felix.scr.annotations.Service; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.BundleURIResolver; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ClerezzaRDFUtils; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser; import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException; import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; import org.osgi.framework.BundleContext; import org.osgi.service.cm.ConfigurationException; import org.osgi.service.component.ComponentContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a> * */ @Component(immediate = true, metatype = true, inherit = true) @Service @org.apache.felix.scr.annotations.Properties(value={ @Property(name=EnhancementEngine.PROPERTY_NAME, value="htmlextractor") }) public class HtmlExtractorEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements EnhancementEngine, ServiceProperties { private static final Logger LOG = LoggerFactory.getLogger(HtmlExtractorEngine.class); /** * The default charset */ private static final Charset UTF8 = Charset.forName("UTF-8"); /** * The default value for the Execution of this Engine. Currently set to * {@link ServiceProperties#ORDERING_PRE_PROCESSING} */ public static final Integer defaultOrder = ORDERING_PRE_PROCESSING; private static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml"; /** * name of a file that defines the set of extractors for HTML documents. By default, the builtin file 'htmlextractors.xml' is used." */ @Property(value=HtmlExtractorEngine.DEFAULT_HTML_EXTRACTOR_REGISTRY) public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.htmlextractor.htmlextractors"; /** * Internally used to create additional {@link Blob} for transformed * versions af the original content */ @Reference private ContentItemFactory ciFactory; BundleContext bundleContext; private Set<String> supportedMimeTypes = new HashSet<String>(Arrays.asList(new String[]{"text/html","application/xhtml+xml"})); private HtmlExtractionRegistry htmlExtractorRegistry; private HtmlParser htmlParser; private boolean singleRootRdf = true; // define the Nepomuk NIE namespace locally here private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#"; protected void activate(ComponentContext ce) throws ConfigurationException, IOException { super.activate(ce); this.bundleContext = ce.getBundleContext(); BundleURIResolver.BUNDLE = this.bundleContext.getBundle(); String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY; Dictionary<String, Object> properties = ce.getProperties(); String confFile = (String)properties.get(HTML_EXTRACTOR_REGISTRY); if (confFile != null && confFile.trim().length() > 0) { htmlExtractors = confFile; } try { this.htmlExtractorRegistry = new HtmlExtractionRegistry(htmlExtractors); } catch (InitializationException e) { LOG.error("Registry Initialization Error: " + e.getMessage()); throw new IOException(e.getMessage()); } this.htmlParser = new HtmlParser(); } /** * The deactivate method. * * @param ce the {@link ComponentContext} */ protected void deactivate(ComponentContext ce) { super.deactivate(ce); this.htmlParser = null; this.htmlExtractorRegistry = null; } @Override public Map<String,Object> getServiceProperties() { return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder)); } @Override public int canEnhance(ContentItem ci) throws EngineException { LOG.info("MimeType: {}", ci.getMimeType()); if (isSupported(ci.getMimeType())) { return ENHANCE_ASYNC; } return CANNOT_ENHANCE; } @Override public void computeEnhancements(ContentItem ci) throws EngineException { HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser); Graph model = new SimpleGraph(); ci.getLock().readLock().lock(); try { extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(),null, ci.getMimeType(), model); } catch (ExtractorException e) { throw new EngineException("Error while processing ContentItem " + ci.getUri()+" with HtmlExtractor",e); } finally { ci.getLock().readLock().unlock(); } ClerezzaRDFUtils.urifyBlankNodes(model); // make the model single rooted if (singleRootRdf) { ClerezzaRDFUtils.makeConnected(model,ci.getUri(),new IRI(NIE_NS+"contains")); } //add the extracted triples to the metadata of the ContentItem ci.getLock().writeLock().lock(); try { LOG.info("Model: {}",model); ci.getMetadata().addAll(model); model = null; } finally { ci.getLock().writeLock().unlock(); } } private boolean isSupported(String mimeType) { return this.supportedMimeTypes.contains(mimeType); } }