package eu.dnetlib.iis.wf.importer.infospace;

import static eu.dnetlib.iis.common.WorkflowRuntimeParameters.DEFAULT_CSV_DELIMITER;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_APPROVED_COLUMNFAMILIES_CSV;

import java.io.IOException;
import java.util.Collections;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.google.common.base.Splitter;
import com.google.common.collect.Sets;

import eu.dnetlib.iis.common.WorkflowRuntimeParameters;

/**
 * HBase dump importer mapper. Reads records from a sequence file and groups them by row identifier.
 * 
 * @author mhorst
 *
 */
public class ImportInformationSpaceMapper extends Mapper<Text, Text, Text, InfoSpaceRecord> {

    protected static final char KEY_SEPARATOR = '@';

    /**
     * Reusable output key holding the row identifier.
     */
    private Text id;

    /**
     * Column families approved for import; records from any other family are skipped.
     */
    private Set<String> approvedColumnFamilies = Collections.emptySet();

    // ------------------------ LOGIC --------------------------

    @Override
    protected void setup(final Context context) throws IOException, InterruptedException {
        super.setup(context);
        id = new Text();
        String approvedColumnFamiliesCSV = WorkflowRuntimeParameters.getParamValue(
                IMPORT_APPROVED_COLUMNFAMILIES_CSV, context.getConfiguration());
        if (StringUtils.isNotBlank(approvedColumnFamiliesCSV)) {
            approvedColumnFamilies = Sets.newHashSet(
                    Splitter.on(DEFAULT_CSV_DELIMITER).trimResults().split(approvedColumnFamiliesCSV));
        }
    }

    @Override
    protected void map(Text key, Text value, Mapper<Text, Text, Text, InfoSpaceRecord>.Context context)
            throws IOException, InterruptedException {
        String oafJson = value.toString();
        if (StringUtils.isNotBlank(oafJson)) {
            String[] split = StringUtils.split(key.toString(), KEY_SEPARATOR);
            if (split.length != 3) {
                throw new IOException("invalid key, expected 'rowkey" + KEY_SEPARATOR + "columnFamily"
                        + KEY_SEPARATOR + "qualifier', got: " + key);
            }
            if (approvedColumnFamilies.contains(split[1])) {
                id.set(split[0]);
                context.write(id, new InfoSpaceRecord(new Text(split[1]), new Text(split[2]), new Text(oafJson)));
            }
        }
    }
}
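
// ---------------------------------------------------------------------------
// Hypothetical usage sketch, not part of the production workflow: a minimal
// map-reduce driver showing how ImportInformationSpaceMapper could be wired to
// read an HBase sequence file dump, assuming keys are
// 'rowkey@columnFamily@qualifier' Text and values are OAF JSON Text. The class
// name, the "result" column family value, and the command-line paths are
// illustrative assumptions only; in the real workflow the mapper is configured
// through workflow runtime parameters rather than a hand-written driver.
// ---------------------------------------------------------------------------
class ImportInformationSpaceJobSketch {

    public static void main(String[] args) throws Exception {
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();

        // Approve a single column family; the mapper drops records from all
        // other families. "result" is an assumed example value.
        conf.set(IMPORT_APPROVED_COLUMNFAMILIES_CSV, "result");

        org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance(conf,
                "import-information-space-sketch");
        job.setJarByClass(ImportInformationSpaceMapper.class);
        job.setMapperClass(ImportInformationSpaceMapper.class);

        // The input is a sequence file of (Text, Text) pairs produced by the
        // HBase dump; the mapper emits (row identifier, InfoSpaceRecord).
        job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(InfoSpaceRecord.class);
        job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);

        // No reducer class is set: the default identity reducer is enough to
        // demonstrate the shuffle grouping records by row identifier.
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job,
                new org.apache.hadoop.fs.Path(args[0]));
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
                new org.apache.hadoop.fs.Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}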