package org.gbif.checklistbank.col;

import org.gbif.checklistbank.model.ColAnnotation;
import org.gbif.checklistbank.service.ColAnnotationService;
import org.gbif.checklistbank.service.mybatis.guice.ChecklistBankServiceMyBatisModule;
import org.gbif.dwca.record.Record;
import org.gbif.dwca.record.StarRecord;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwca.io.Archive;
import org.gbif.dwca.io.ArchiveFactory;
import org.gbif.utils.HttpUtil;
import org.gbif.utils.file.FileUtils;
import org.gbif.utils.file.properties.PropertiesUtil;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Locale;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.inject.Guice;
import com.google.inject.Injector;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Main class to run a full import of CoL GSD annotations for the GBIF Backbone, populating the col_annotation table
 * in Checklist Bank.
 *
 * <p>The import downloads a zipped Darwin Core archive from the URL configured in the properties file
 * ({@code col.annotation.url}, authenticated with {@code col.annotation.user} / {@code col.annotation.password}),
 * iterates over its core records and hands one {@link ColAnnotation} per parsable record to the
 * {@link ColAnnotationService}.
 *
 * @see <a href="http://dev.gbif.org/issues/browse/CLB-248">CLB-248</a>
 */
public class ColAnnotationImport {

  private static final Logger LOG = LoggerFactory.getLogger(ColAnnotationImport.class);

  /** HTTP timeout handed to the client factory: 10 minutes, in milliseconds. */
  private static final int HTTP_TIMEOUT_MS = 600000;

  private final Properties props;
  private final ColAnnotationService annotationService;

  /**
   * Splits a taxon remark of the form {@code part1 | part2 | part3} into three groups.
   * Because the groups are greedy, a captured group can still carry trailing whitespace; callers that need clean
   * values have to trim the groups themselves.
   */
  @VisibleForTesting
  protected static final Pattern ANNOTATION_SPLITTER =
    Pattern.compile("^\\s*(.*)\\s*\\|\\s*(.*)\\s*\\|\\s*(.*)\\s*$");

  /**
   * Reads the configuration and wires the annotation service through Guice.
   *
   * @param propsFile path to the properties file used both for the MyBatis module and the download settings
   * @throws IOException if the properties file cannot be read
   */
  public ColAnnotationImport(String propsFile) throws IOException {
    props = PropertiesUtil.readFromFile(propsFile);
    // init guice
    ChecklistBankServiceMyBatisModule myBatisModule = new ChecklistBankServiceMyBatisModule(props);
    Injector inj = Guice.createInjector(myBatisModule);
    annotationService = inj.getInstance(ColAnnotationService.class);
  }

  /**
   * Downloads the annotation archive and inserts one annotation per core record whose taxon remarks match
   * {@link #ANNOTATION_SPLITTER}. Records without matching remarks are counted and logged, not inserted.
   *
   * @throws IOException if the download fails or the archive cannot be opened
   * @throws URISyntaxException if the configured download URL is not a valid URI
   */
  public void importAnnotations() throws IOException, URISyntaxException {
    Archive arch = downloadArchive();
    int counter = 0;
    int failures = 0;
    int empty = 0;
    for (StarRecord star : arch) {
      Record rec = star.core();
      counter++;
      try {
        // TODO: use taxonIDs once available: Integer.parseInt(rec.value(DwcTerm.taxonID));
        int nubKey = counter;
        // Guard against records without remarks: Pattern.matcher(null) would throw a NullPointerException.
        String remarks = rec.value(DwcTerm.taxonRemarks);
        Matcher m = remarks == null ? null : ANNOTATION_SPLITTER.matcher(remarks);
        if (m != null && m.find()) {
          // The greedy groups keep trailing whitespace in front of each pipe, so trim before storing.
          ColAnnotation annotation = new ColAnnotation(nubKey, rec.value(DwcTerm.datasetName),
            rec.value(DwcTerm.scientificName), isRejected(m.group(1)), m.group(3).trim(), m.group(2).trim());
          annotationService.insertAnnotation(annotation);
        } else {
          empty++;
          LOG.warn("No annotation given for {}", rec.value(DwcTerm.scientificName));
        }
      } catch (NumberFormatException e) {
        // Only reachable once the taxonID parsing from the TODO above is enabled.
        failures++;
        LOG.warn("No valid taxonID given for {}", rec.value(DwcTerm.scientificName));
      }
    }
    LOG.info("{} annotations processed, {} lack notes, {} failed to import", counter, empty, failures);
  }

  /**
   * Decides whether an annotation marks a name as rejected: everything that does not start with "placed"
   * (compared case-insensitively) counts as rejected.
   *
   * @param annotation the first part of the split taxon remark, must not be null
   * @return false if the annotation starts with "placed", true otherwise
   */
  @VisibleForTesting
  protected static boolean isRejected(String annotation) {
    // Locale.ROOT keeps the comparison independent of the JVM default locale.
    return !annotation.toLowerCase(Locale.ROOT).startsWith("placed");
  }

  /**
   * Downloads the zipped annotation archive into a fresh temporary directory and opens it.
   *
   * @return the opened archive, decompressed next to the downloaded zip
   * @throws IOException if the server does not answer with HTTP 200, sends no body, or the file cannot be written
   * @throws URISyntaxException if the configured download URL is not a valid URI
   * @throws IllegalStateException if the {@code col.annotation.url} property is missing
   */
  private Archive downloadArchive() throws IOException, URISyntaxException {
    // download url
    final String url = props.getProperty("col.annotation.url");
    if (url == null) {
      throw new IllegalStateException("Missing required property col.annotation.url");
    }

    // local work dir
    // NOTE: File.deleteOnExit only removes a directory that is empty at JVM exit, so the downloaded
    // files are likely to be left behind in the temp dir.
    File workDir = FileUtils.createTempDir();
    workDir.deleteOnExit();
    // local zip file
    File zip = new File(workDir, "annotation.zip");
    // local decompressed dwca
    File dwca = new File(workDir, "annotation");
    // insert folders
    org.apache.commons.io.FileUtils.forceMkdir(dwca);

    HttpClient client = HttpUtil.newMultithreadedClient(HTTP_TIMEOUT_MS, 10, 10);
    try {
      // authentication, scoped to the download host on any port and realm
      HttpContext authContext = new BasicHttpContext();
      URI authUri = new URI(url);
      AuthScope scope = new AuthScope(authUri.getHost(), AuthScope.ANY_PORT, AuthScope.ANY_REALM);
      CredentialsProvider credsProvider = new BasicCredentialsProvider();
      credsProvider.setCredentials(scope,
        new UsernamePasswordCredentials(props.getProperty("col.annotation.user"),
          props.getProperty("col.annotation.password")));
      authContext.setAttribute(ClientContext.CREDS_PROVIDER, credsProvider);

      HttpGet get = new HttpGet(url);
      HttpResponse response = client.execute(get, authContext);
      if (response.getStatusLine().getStatusCode() != 200) {
        LOG.error("{} error downloading annotations from {}: {}", response.getStatusLine(), url,
          response.getStatusLine().getReasonPhrase());
        // Fail with an exception instead of killing the JVM from inside a helper method.
        throw new IOException("Failed to download annotations from " + url + ": " + response.getStatusLine());
      }

      HttpEntity entity = response.getEntity();
      if (entity == null) {
        // Without a body there is no zip file to open, so do not report success.
        throw new IOException("Empty response downloading annotations from " + url);
      }
      // copy stream to local file
      OutputStream fos = new FileOutputStream(zip, false);
      try {
        entity.writeTo(fos);
      } finally {
        fos.close();
      }
    } finally {
      // The client is used for this single request only; release its pooled connections.
      client.getConnectionManager().shutdown();
    }
    LOG.info("Successfully downloaded {} to {}", url, zip.getAbsolutePath());

    // open archive
    return ArchiveFactory.openArchive(zip, dwca);
  }

  /**
   * Command line entry point.
   *
   * @param args exactly one argument: the full path to the config properties file
   * @throws Exception if reading the configuration or the import itself fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      System.err.println("Pass the full path to the config properties file as first argument please");
      // Stop here: continuing would fail on args[0] or silently ignore surplus arguments.
      System.exit(1);
    }
    ColAnnotationImport imp = new ColAnnotationImport(args[0]);
    imp.importAnnotations();
  }
}