package org.gbif.checklistbank.index.backfill;

import org.gbif.api.model.checklistbank.Description;
import org.gbif.api.model.checklistbank.Distribution;
import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.SpeciesProfile;
import org.gbif.api.model.checklistbank.VernacularName;
import org.gbif.checklistbank.index.model.NameUsageAvro;
import org.gbif.checklistbank.model.UsageExtensions;
import org.gbif.checklistbank.service.UsageService;
import org.gbif.checklistbank.service.mybatis.DescriptionServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.DistributionServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.SpeciesProfileServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.VernacularNameServiceMyBatis;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;

import com.google.common.base.Throwables;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Executable job that converts a range of {@link org.gbif.api.model.checklistbank.NameUsage} objects into
 * {@link org.gbif.checklistbank.index.model.NameUsageAvro} records, writes them to a local Avro file and
 * finally moves that file to HDFS.
 */
public class AvroExportJob implements Callable<Integer> {

  private final Logger log = LoggerFactory.getLogger(getClass());

  /**
   * Minimum usage key, inclusive, to process.
   */
  private final int startKey;

  /**
   * Maximum usage key, inclusive, to process.
   */
  private final int endKey;

  private final String nameNode;
  private final String targetHdfsDir;

  /**
   * Service layer.
   */
  private final UsageService nameUsageService;
  private final VernacularNameServiceMyBatis vernacularNameService;
  private final DescriptionServiceMyBatis descriptionService;
  private final DistributionServiceMyBatis distributionService;
  private final SpeciesProfileServiceMyBatis speciesProfileService;

  private final StopWatch stopWatch = new StopWatch();

  /**
   * {@link org.gbif.api.model.checklistbank.NameUsage} to {@link org.gbif.checklistbank.index.model.NameUsageAvro} converter.
   */
  private final NameUsageAvroConverter nameUsageAvroConverter;

  /**
   * Default constructor.
   */
  public AvroExportJob(
    final UsageService nameUsageService,
    final int startKey,
    final int endKey,
    final VernacularNameServiceMyBatis vernacularNameService,
    final DescriptionServiceMyBatis descriptionService,
    final DistributionServiceMyBatis distributionService,
    final SpeciesProfileServiceMyBatis speciesProfileService,
    String nameNode,
    String targetHdfsDir
  ) {
    this.nameUsageService = nameUsageService;
    this.vernacularNameService = vernacularNameService;
    this.descriptionService = descriptionService;
    this.distributionService = distributionService;
    this.speciesProfileService = speciesProfileService;
    this.startKey = startKey;
    this.endKey = endKey;
    nameUsageAvroConverter = new NameUsageAvroConverter();
    this.targetHdfsDir = targetHdfsDir;
    this.nameNode = nameNode;
  }

  /**
   * Iterates over the assigned {@link org.gbif.api.model.checklistbank.NameUsage} objects and appends the
   * corresponding {@link org.gbif.checklistbank.index.model.NameUsageAvro} records to the Avro file.
   *
   * @return the total number of records written by this thread.
   */
  @Override
  public Integer call() throws Exception {
    // Timing information initialization
    stopWatch.start();
    log.info("Adding usages from id {} to {}", startKey, endKey);
    int docCount = 0;

    // Get all usages
    List<NameUsage> usages = nameUsageService.listRange(startKey, endKey);

    // get all component maps into memory first
    Map<Integer, List<VernacularName>> vernacularNameMap = vernacularNameService.listRange(startKey, endKey);
    Map<Integer, List<Description>> descriptionMap = descriptionService.listRange(startKey, endKey);
    Map<Integer, List<Distribution>> distributionMap = distributionService.listRange(startKey, endKey);
    Map<Integer, List<SpeciesProfile>> speciesProfileMap = speciesProfileService.listRange(startKey, endKey);

    File file = new File(startKey + "-" + endKey + ".avro");
    file.createNewFile();
    log.info("Creating file {}", file.getAbsolutePath());

    ClassLoader classLoader = AvroExporter.class.getClassLoader();
    Schema schema = new Schema.Parser().parse(classLoader.getResource("solr.avrsc").openStream());
    DatumWriter<NameUsageAvro> datumWriter = new SpecificDatumWriter<>(NameUsageAvro.class);

    try (DataFileWriter<NameUsageAvro> dataFileWriter = new DataFileWriter<>(datumWriter)) {
      dataFileWriter.create(schema, file);

      // now we're ready to build the avro records quickly!
      for (NameUsage usage : usages) {
        if (usage == null) {
          log.warn("Unexpected null usage found in range {}-{}, docCount={}", startKey, endKey, docCount);
          continue;
        }
        try {
          UsageExtensions ext = new UsageExtensions();
          ext.speciesProfiles = speciesProfileMap.get(usage.getKey());
          ext.vernacularNames = vernacularNameMap.get(usage.getKey());
          ext.descriptions = descriptionMap.get(usage.getKey());
          ext.distributions = distributionMap.get(usage.getKey());

          List<Integer> parents = nameUsageService.listParents(usage.getKey());
          dataFileWriter.append(nameUsageAvroConverter.toObject(usage, parents, ext));
        } catch (Exception e) {
          log.error("Error exporting usage {} and its extensions to avro", usage, e);
        }
        docCount++;
        NameUsageBatchProcessor.counter.incrementAndGet();
      }
    }

    moveToHdfs(file, nameNode);
    log.info("{} moved to hdfs", file.getName());

    // job finished notice
    stopWatch.stop();
    log.info("Finished indexing of usages in range {}-{}. Total time: {}", startKey, endKey, stopWatch);
    return docCount;
  }

  private boolean moveToHdfs(File file, String nameNode) throws IOException {
    try {
      Configuration configuration = new Configuration();
      configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, nameNode);
      Path targetPath = new Path(targetHdfsDir, file.getName());
      log.info("Moving file {} to HDFS path {}", file, targetPath);
      return FileUtil.copy(file, FileSystem.get(configuration), targetPath, true, configuration);
    } catch (IOException ioe) {
      log.error("Error moving file to HDFS", ioe);
      throw Throwables.propagate(ioe);
    }
  }
}
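
/*
 * Illustrative usage sketch only (not part of this class): because AvroExportJob is a Callable<Integer>,
 * a coordinating class such as AvroExporter would typically submit one job per key range to an executor.
 * The service instances, key range, name node URI and target directory below are hypothetical placeholders.
 *
 *   ExecutorService executor = Executors.newFixedThreadPool(4);
 *   Future<Integer> written = executor.submit(
 *     new AvroExportJob(usageService, 0, 9999,
 *                       vernacularNameService, descriptionService,
 *                       distributionService, speciesProfileService,
 *                       "hdfs://namenode:8020", "/checklistbank/avro"));
 *   LOG.info("Job wrote {} records", written.get());
 *   executor.shutdown();
 */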