package dk.statsbiblioteket.medieplatform.hadoop;
import dk.statsbiblioteket.doms.central.connectors.BackendInvalidCredsException;
import dk.statsbiblioteket.doms.central.connectors.BackendMethodFailedException;
import dk.statsbiblioteket.doms.central.connectors.EnhancedFedora;
import dk.statsbiblioteket.doms.central.connectors.EnhancedFedoraImpl;
import dk.statsbiblioteket.doms.central.connectors.fedora.pidGenerator.PIDGeneratorException;
import dk.statsbiblioteket.sbutil.webservices.authentication.Credentials;
import dk.statsbiblioteket.medieplatform.autonomous.ConfigConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;
import javax.xml.bind.JAXBException;
import java.io.IOException;
import java.util.List;
/**
 * Generic hadoop reduce job which has access to a DOMS instance.
 * <p>
 * During {@link #setup} a fedora client is created from the hadoop configuration, and the
 * batch id and target datastream name are read, so that subclasses can implement
 * {@link #reduce} against DOMS.
 */
@SuppressWarnings("deprecation")//Credentials
public abstract class AbstractDomsReducer extends Reducer<Text, Text, Text, Text> {
    /** Configuration key naming the DOMS datastream results should be saved to. */
    public static final String HADOOP_SAVER_DATASTREAM = "hadoop.saver.doms.datastream";
    private static final Logger log = Logger.getLogger(AbstractDomsReducer.class);
    /** Client for the DOMS repository, created in {@link #setup}. */
    protected EnhancedFedora fedora;
    /** Id of the batch being processed, read from the configuration in {@link #setup}. */
    protected String batchID = null;
    /** Name of the datastream to save to, read from the configuration in {@link #setup}. */
    protected String datastreamName;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        fedora = createFedoraClient(context);
        batchID = context.getConfiguration().get(ConfigConstants.BATCH_ID);
        datastreamName = context.getConfiguration().get(HADOOP_SAVER_DATASTREAM);
    }

    /**
     * Get the fedora client, built from credentials, url and retry settings found in
     * the hadoop configuration.
     *
     * @param context the hadoop context holding the configuration
     *
     * @return the fedora client
     * @throws java.io.IOException if the fedora client could not be constructed
     */
    @SuppressWarnings("deprecation")//Credentials
    protected EnhancedFedora createFedoraClient(Context context) throws IOException {
        try {
            Configuration conf = context.getConfiguration();
            String username = conf.get(ConfigConstants.DOMS_USERNAME);
            String password = conf.get(ConfigConstants.DOMS_PASSWORD);
            String domsUrl = conf.get(ConfigConstants.DOMS_URL);
            int retries = Integer.parseInt(conf.get(ConfigConstants.FEDORA_RETRIES, "1"));
            int retryDelay = Integer.parseInt(conf.get(ConfigConstants.FEDORA_DELAY_BETWEEN_RETRIES, "100"));
            return new EnhancedFedoraImpl(
                    new Credentials(username, password), domsUrl, null, null, retries, retryDelay);
        } catch (JAXBException e) {
            // Wrap with context while preserving the cause for the full stack trace.
            throw new IOException("Failed to create fedora client for '" + context.getConfiguration()
                    .get(ConfigConstants.DOMS_URL) + "'", e);
        } catch (PIDGeneratorException e) {
            throw new IOException("Failed to create fedora client for '" + context.getConfiguration()
                    .get(ConfigConstants.DOMS_URL) + "'", e);
        }
    }

    /**
     * Reduce method which can access DOMS via the EnhancedFedora interface.
     *
     * @param key the input filename
     * @param values the corresponding values generated by the final mapper
     * @param context the task context
     *
     * @throws java.io.IOException Any checked exception that is not an InterruptedException
     * @throws InterruptedException from Hadoop
     */
    @Override
    protected abstract void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException;

    /**
     * Get the doms pid from the filename, by looking up the object whose DC identifier
     * matches the translated file path.
     *
     * @param key the filename
     *
     * @return the doms pid (the first hit, if several objects match)
     * @throws BackendInvalidCredsException if the fedora credentials are rejected
     * @throws BackendMethodFailedException if the fedora lookup fails
     * @throws RuntimeException if no doms object matches the filename
     */
    protected String getDomsPid(Text key) throws BackendInvalidCredsException, BackendMethodFailedException {
        String filePath = translate(key.toString());
        String path = "path:" + filePath;
        List<String> hits = fedora.findObjectFromDCIdentifier(path);
        if (hits.isEmpty()) {
            throw new RuntimeException("Failed to look up doms object for DC identifier '" + path + "'");
        }
        if (hits.size() > 1) {
            log.warn("Found multiple pids for dc identifier '" + path + "', using the first one '" + hits.get(0) + "'");
        }
        return hits.get(0);
    }

    /**
     * Translate the filename back to the original path as stored in doms, by stripping
     * everything before the batch id and converting the '_' separators back to '/'.
     *
     * @param file the filename
     *
     * @return the original path
     * @throws IllegalArgumentException if the filename does not contain the batch id
     */
    protected String translate(String file) {
        int batchStart = file.indexOf(batchID);
        if (batchStart < 0) {
            // Fail with a clear message instead of the opaque StringIndexOutOfBoundsException
            // that substring(-1) would otherwise throw.
            throw new IllegalArgumentException(
                    "Filename '" + file + "' does not contain the batch id '" + batchID + "'");
        }
        // Literal replacement; no regex semantics needed.
        return file.substring(batchStart).replace("_", "/");
    }
}