/** * Licensed to The Apereo Foundation under one or more contributor license * agreements. See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * * The Apereo Foundation licenses this file to you under the Educational * Community License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of the License * at: * * http://opensource.org/licenses/ecl2.txt * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * */ package org.opencastproject.oaipmh.harvester; import static org.opencastproject.oaipmh.harvester.LastHarvested.cleanup; import static org.opencastproject.oaipmh.harvester.LastHarvested.getLastHarvestDate; import static org.opencastproject.oaipmh.harvester.LastHarvested.update; import static org.opencastproject.oaipmh.util.ConcurrencyUtil.shutdownAndAwaitTermination; import static org.opencastproject.oaipmh.util.OsgiUtil.checkDictionary; import static org.opencastproject.oaipmh.util.OsgiUtil.getCfg; import static org.opencastproject.oaipmh.util.OsgiUtil.getCfgAsInt; import static org.opencastproject.oaipmh.util.PersistenceUtil.newPersistenceEnvironment; import org.opencastproject.oaipmh.util.PersistenceEnv; import org.opencastproject.security.api.Organization; import org.opencastproject.security.api.OrganizationDirectoryService; import org.opencastproject.security.api.SecurityService; import org.opencastproject.security.api.User; import org.opencastproject.security.api.UserDirectoryService; import org.opencastproject.util.NotFoundException; import org.opencastproject.util.data.Function0; import org.opencastproject.util.data.Option; import org.joda.time.DateTime; import org.osgi.service.cm.ConfigurationException; import org.osgi.service.cm.ManagedService; import org.osgi.service.component.ComponentContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Node; import java.util.Date; import java.util.Dictionary; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.persistence.EntityManagerFactory; /** * The harvester queries OAI-PMH repositories for a certain metadata prefix and passes * the retrieved records to the configured {@link RecordHandler} for the actual processing. * <p> * todo * <h3>Currently not supported</h3> * <ul> * <li>Recovery from network errors while processing a resumable request. Currently * the request sequence terminates and processing goes on with the next configured repository. * <li>Selective harvesting by time is not yet implemented. The harvester always requests the whole repository.</li> * </ul> */ public class OaiPmhHarvester implements ManagedService { private static final Logger logger = LoggerFactory.getLogger(OaiPmhHarvester.class); // config keys private static final String CFG_USER_ORGANIZATION = "user.organization"; private static final String CFG_USER_NAME = "user.name"; private static final String CFG_PERIOD = "period"; private static final String CFG_INITIAL_DELEY = "initial-delay"; private static final String CFG_URLS = "urls"; // service reference names - make sure they match the names used in the component.xml private static final String REF_ORG_SERVICE = "orgDirectory"; private static final String REF_SECURITY_SERVICE = "securityService"; private static final String REF_USER_SERVICE = "userDirectory"; private static final String REF_RECORD_HANDLER = "recordHandler"; private ComponentContext componentContext; private EntityManagerFactory emf; private ScheduledExecutorService scheduler; private PersistenceEnv penv; /** OSGi DI */ void setEntityManagerFactory(EntityManagerFactory emf) { this.emf = emf; } /** * @see #activate(ComponentContext) */ @Override public synchronized void updated(Dictionary<String, ?> properties) throws ConfigurationException { logger.info("Updated"); try { checkDictionary(properties, componentContext); // locate all services final RecordHandler recordhandler = (RecordHandler) componentContext.locateService(REF_RECORD_HANDLER); // collect all config params final int period = getCfgAsInt(properties, CFG_PERIOD); final int initialDelay = getCfgAsInt(properties, CFG_INITIAL_DELEY); final String urlsRaw = getCfg(properties, CFG_URLS); final String[] urls = urlsRaw.split("\\s*,\\s*"); // shutdown currently running tasks if (scheduler != null) scheduler.shutdown(); scheduler = Executors.newSingleThreadScheduledExecutor(); logger.info("Schedule harvesting " + urlsRaw + " at " + initialDelay + ", " + period + " (minutes)"); final Function0<Void> secConf = createSecurityConfigurator(properties, componentContext); // get persistence provider penv = newPersistenceEnvironment(emf); // create a new worker Worker worker = new Worker(urls, recordhandler, secConf, penv); scheduler.scheduleAtFixedRate(worker, initialDelay, period, TimeUnit.MINUTES); } catch (ConfigurationException e) { logger.info("Configuration not complete since at least property " + e.getProperty() + " is missing or malformed. " + "Please provide a clean configuration to enable harvesting."); } } /** * Return a function that configures the security service with a {@link User} and {@link Organization}. */ private static Function0<Void> createSecurityConfigurator(Dictionary properties, ComponentContext cc) throws ConfigurationException { // get services final OrganizationDirectoryService organizationDirectoryService = (OrganizationDirectoryService) cc.locateService(REF_ORG_SERVICE); final SecurityService securityService = (SecurityService) cc.locateService(REF_SECURITY_SERVICE); final UserDirectoryService userDirectoryService = (UserDirectoryService) cc.locateService(REF_USER_SERVICE); // get the organization String organizationName = getCfg(properties, CFG_USER_ORGANIZATION); final Organization organization; try { organization = organizationDirectoryService.getOrganization(organizationName); } catch (NotFoundException e) { throw new ConfigurationException(CFG_USER_ORGANIZATION, "Organization '" + organizationName + "' does not exist"); } // get the user final User user; final Organization originalOrg = securityService.getOrganization(); try { String userName = getCfg(properties, CFG_USER_NAME); securityService.setOrganization(organization); user = userDirectoryService.loadUser(userName); } finally { securityService.setOrganization(originalOrg); } return new Function0<Void>() { @Override public Void apply() { securityService.setOrganization(organization); securityService.setUser(user); return null; } }; } /** * OSGi component activation. Called by the container. Declare in the component xml. * Called before {@link #updated(java.util.Dictionary)} but needs to be synchronized with it. */ public synchronized void activate(ComponentContext cc) { logger.info("Activate"); this.componentContext = cc; } /** * OSGi component deactivation. Called by the container. Declare in the component xml. */ public synchronized void deactivate() { logger.info("Deactivate"); if (scheduler != null) shutdownAndAwaitTermination(scheduler, 60, new Function0<Void>() { @Override public Void apply() { logger.error("Scheduler does not terminate"); return null; } }); if (penv != null) penv.close(); } static class Worker implements Runnable { private final String[] urls; private final RecordHandler handler; private final Function0<Void> securityConfigurator; private final PersistenceEnv penv; /** * @param urls the urls, i.e. the repositories, to harvest * @param securityConfigurator a function to configure the security service in order to access the episode service */ Worker(String[] urls, RecordHandler handler, Function0<Void> securityConfigurator, PersistenceEnv penv) { this.urls = urls; this.handler = handler; this.securityConfigurator = securityConfigurator; this.penv = penv; } @Override public void run() { // configure security settings for this thread securityConfigurator.apply(); for (String url : urls) { try { DateTime now = new DateTime(); harvest(url, getLastHarvestDate(penv, url)); // save the time of the last harvest but with a security delta of 1 minutes update(penv, new LastHarvested(url, now.minusMinutes(1).toDate())); } catch (Exception e) { logger.error("An error occured while harvesting " + url + ". Skipping this repository for now...", e); } } cleanup(penv, urls); } private void harvest(String url, Option<Date> from) throws Exception { logger.info("Harvesting " + url + " from " + from + " on thread " + Thread.currentThread()); OaiPmhRepositoryClient repositoryClient = OaiPmhRepositoryClient.newHarvester(url); ListRecordsResponse response = repositoryClient.listRecords(handler.getMetadataPrefix(), from, Option.<Date>none(), Option.<String>none()); if (!response.isError()) { for (Node recordNode : ListRecordsResponse.getAllRecords(response, repositoryClient)) { handler.handle(recordNode); } } else if (response.isErrorNoRecordsMatch()) { logger.info("Repository returned no records."); } else { logger.error("Repository returned error code: " + response.getErrorCode().getOrElse("?")); } } } }