/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.datasets;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.ud.UDDataset;
/**
 * Downloads dataset packages into a local cache directory, verifies them against their
 * MD5/SHA1 checksums, and runs a one-time post-fetch action (typically unpacking).
 *
 * <p>Not thread-safe: concurrent calls against the same cache root may corrupt the cache.</p>
 *
 * @deprecated superseded by the newer dataset loading infrastructure.
 */
@Deprecated
public class DatasetLoader
{
    private final Log LOG = LogFactory.getLog(getClass());

    // Root directory under which all dataset packages are cached; must be set before use.
    private File cacheRoot;

    /**
     * Creates a loader without a cache root. {@link #setCacheRoot(File)} must be called before
     * any dataset is loaded.
     */
    public DatasetLoader()
    {
        // Nothing to do - cache root is provided later via setCacheRoot()
    }

    /**
     * Creates a loader caching downloaded packages below the given directory.
     *
     * @param aCacheRoot
     *            the directory under which downloaded packages are cached.
     */
    public DatasetLoader(File aCacheRoot)
    {
        setCacheRoot(aCacheRoot);
    }

    public void setCacheRoot(File aCacheRoot)
    {
        cacheRoot = aCacheRoot;
    }

    public File getCacheRoot()
    {
        return cacheRoot;
    }

    /**
     * Fetches and unpacks the Universal Dependencies treebanks v1.3 archive and returns one
     * {@link Dataset} per treebank directory contained in it.
     *
     * @return the treebank datasets in deterministic (sorted) order.
     * @throws IOException
     *             if downloading, verification, or unpacking fails, or if the unpacked
     *             treebank directory cannot be listed.
     */
    public List<Dataset> loadUniversalDependencyTreebankV1_3()
        throws IOException
    {
        File dataDir = new File(cacheRoot, "ud-treebanks-v1.3");
        DataPackage data = new DataPackage.Builder()
                .url("https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/"
                        + "1-1699/ud-treebanks-v1.3.tgz?sequence=1&isAllowed=y")
                .sha1("44367112880cf0af3f293cb3f0cc6ce50c0e65c0")
                .target("ud-treebanks-v1.3.tgz")
                .postAction((d) -> { untgz(new File(dataDir, d.getTarget()), dataDir); })
                .build();

        fetch(dataDir, data);

        // listFiles() returns null if the directory does not exist or cannot be read - do not
        // let that surface as an opaque NullPointerException
        File[] treebankDirs = new File(dataDir, "ud-treebanks-v1.3").listFiles();
        if (treebankDirs == null) {
            throw new IOException(
                    "Unable to list treebank directories in [" + dataDir + "]");
        }

        // Sort so the returned dataset order does not depend on file-system enumeration order
        Arrays.sort(treebankDirs);

        List<Dataset> sets = new ArrayList<>();
        for (File f : treebankDirs) {
            sets.add(new UDDataset(f));
        }
        return sets;
    }

    /**
     * Downloads the given packages into the target directory, verifying their checksums.
     * Cached packages whose checksums still match are not downloaded again; if any cached
     * package fails verification, the whole cache directory is cleared and everything is
     * re-fetched. Afterwards, each package's post-action (e.g. unpacking) is run once; a
     * {@code .postComplete} marker file records completion so the action is not repeated.
     *
     * @param aTarget
     *            the cache directory for the packages.
     * @param aPackages
     *            the packages to fetch.
     * @throws IOException
     *             if downloading fails or a downloaded package fails checksum verification.
     */
    private void fetch(File aTarget, DataPackage... aPackages)
        throws IOException
    {
        // First validate if local copies are still up-to-date
        boolean reload = false;
        packageValidationLoop: for (DataPackage pack : aPackages) {
            File cachedFile = new File(aTarget, pack.getTarget());

            if (!cachedFile.exists()) {
                continue;
            }

            if (pack.getSha1() != null) {
                String actual = getDigest(cachedFile, "SHA1");
                if (!pack.getSha1().equals(actual)) {
                    LOG.info("Local SHA1 hash mismatch on [" + cachedFile + "] - expected ["
                            + pack.getSha1() + "] - actual [" + actual + "]");
                    reload = true;
                    // One stale package invalidates the whole cache - no need to check more
                    break packageValidationLoop;
                }
                else {
                    LOG.info("Local SHA1 hash verified on [" + cachedFile + "] - [" + actual + "]");
                }
            }

            if (pack.getMd5() != null) {
                String actual = getDigest(cachedFile, "MD5");
                if (!pack.getMd5().equals(actual)) {
                    LOG.info("Local MD5 hash mismatch on [" + cachedFile + "] - expected ["
                            + pack.getMd5() + "] - actual [" + actual + "]");
                    reload = true;
                    break packageValidationLoop;
                }
                else {
                    LOG.info("Local MD5 hash verified on [" + cachedFile + "] - [" + actual + "]");
                }
            }
        }

        // If any of the packages are outdated, clear the cache and download again
        if (reload) {
            LOG.info("Clearing local cache for [" + aTarget + "]");
            FileUtils.deleteQuietly(aTarget);
        }

        for (DataPackage pack : aPackages) {
            File cachedFile = new File(aTarget, pack.getTarget());

            if (cachedFile.exists()) {
                continue;
            }

            MessageDigest md5 = createDigest("MD5");
            MessageDigest sha1 = createDigest("SHA1");

            // forceMkdir throws if the directory cannot be created, unlike the silently
            // ignored return value of File.mkdirs()
            FileUtils.forceMkdir(cachedFile.getParentFile());

            URL source = new URL(pack.getUrl());

            LOG.info("Fetching [" + cachedFile + "]");

            URLConnection connection = source.openConnection();
            connection.setRequestProperty("User-Agent", "Java");

            try (InputStream is = connection.getInputStream()) {
                // Chain both digest filters so MD5 and SHA1 are computed in a single pass
                // while the file is written to disk
                DigestInputStream md5Filter = new DigestInputStream(is, md5);
                DigestInputStream sha1Filter = new DigestInputStream(md5Filter, sha1);
                FileUtils.copyInputStreamToFile(sha1Filter, cachedFile);

                if (pack.getMd5() != null) {
                    String md5Hex = new String(
                            Hex.encodeHex(md5Filter.getMessageDigest().digest()));
                    if (!pack.getMd5().equals(md5Hex)) {
                        String message = "MD5 mismatch. Expected [" + pack.getMd5() + "] but got ["
                                + md5Hex + "].";
                        LOG.error(message);
                        // Do not leave a corrupt download in the cache
                        FileUtils.deleteQuietly(cachedFile);
                        throw new IOException(message);
                    }
                }

                if (pack.getSha1() != null) {
                    String sha1Hex = new String(
                            Hex.encodeHex(sha1Filter.getMessageDigest().digest()));
                    if (!pack.getSha1().equals(sha1Hex)) {
                        String message = "SHA1 mismatch. Expected [" + pack.getSha1()
                                + "] but got [" + sha1Hex + "].";
                        LOG.error(message);
                        // Do not leave a corrupt download in the cache
                        FileUtils.deleteQuietly(cachedFile);
                        throw new IOException(message);
                    }
                }
            }
        }

        // Perform a post-fetch action such as unpacking
        for (DataPackage pack : aPackages) {
            File cachedFile = new File(aTarget, pack.getTarget());
            File postActionCompleteMarker = new File(cachedFile.getPath() + ".postComplete");
            if (pack.getPostAction() != null && !postActionCompleteMarker.exists()) {
                try {
                    pack.getPostAction().run(pack);
                    FileUtils.touch(postActionCompleteMarker);
                }
                catch (IOException e) {
                    // Rethrow IOExceptions unchanged instead of letting the broad catch
                    // below wrap them in an IllegalStateException
                    throw e;
                }
                catch (Exception e) {
                    throw new IllegalStateException(e);
                }
            }
        }
    }

    /**
     * Creates a {@link MessageDigest} for the given algorithm, converting the checked
     * {@link NoSuchAlgorithmException} into an {@link IOException} as the rest of this class
     * expects.
     *
     * @param aAlgorithm
     *            the digest algorithm name, e.g. {@code "MD5"} or {@code "SHA1"}.
     * @return the digest instance.
     * @throws IOException
     *             if the algorithm is not available on this JVM.
     */
    private MessageDigest createDigest(String aAlgorithm)
        throws IOException
    {
        try {
            return MessageDigest.getInstance(aAlgorithm);
        }
        catch (NoSuchAlgorithmException e) {
            throw new IOException(e);
        }
    }

    /**
     * Computes the hex-encoded digest of the given file.
     *
     * @param aFile
     *            the file to hash.
     * @param aDigest
     *            the digest algorithm name, e.g. {@code "MD5"} or {@code "SHA1"}.
     * @return the lower-case hex representation of the digest.
     * @throws IOException
     *             if the file cannot be read or the algorithm is unavailable.
     */
    private String getDigest(File aFile, String aDigest)
        throws IOException
    {
        MessageDigest digest = createDigest(aDigest);
        try (InputStream is = new FileInputStream(aFile)) {
            DigestInputStream digestFilter = new DigestInputStream(is, digest);
            // Drain the stream - we only need the digest side effect, not the data
            IOUtils.copy(digestFilter, new NullOutputStream());
            return new String(Hex.encodeHex(digestFilter.getMessageDigest().digest()));
        }
    }

    /**
     * Unpacks a gzip-compressed tar archive into the target directory.
     *
     * @param aArchive
     *            the {@code .tgz} file to unpack.
     * @param aTarget
     *            the directory to unpack into.
     * @throws IOException
     *             if reading the archive or writing an entry fails.
     */
    private void untgz(File aArchive, File aTarget)
        throws IOException
    {
        try (ArchiveInputStream archive = new TarArchiveInputStream(new GzipCompressorInputStream(
                new BufferedInputStream(new FileInputStream(aArchive))))) {
            extract(aArchive, archive, aTarget);
        }
    }

    /**
     * Extracts all entries from the given archive stream into the target directory.
     *
     * @param aArchive
     *            the archive file (used only for error reporting).
     * @param aArchiveStream
     *            the open archive stream to read entries from.
     * @param aTarget
     *            the directory to extract into.
     * @throws IOException
     *             if an entry cannot be written or would escape the target directory.
     */
    private void extract(File aArchive, ArchiveInputStream aArchiveStream, File aTarget)
        throws IOException
    {
        // Canonical prefix used to detect entries trying to escape the target directory
        String targetPrefix = aTarget.getCanonicalPath() + File.separator;

        ArchiveEntry entry = null;
        while ((entry = aArchiveStream.getNextEntry()) != null) {
            String name = entry.getName();

            // Ensure that the filename will not break the manifest
            if (name.contains("\n")) {
                throw new IllegalStateException("Filename must not contain line break");
            }

            File out = new File(aTarget, name);

            // Guard against "zip slip": an entry such as "../../x" must not be able to write
            // outside the target directory
            if (!out.getCanonicalPath().startsWith(targetPrefix)) {
                throw new IOException("Archive entry [" + name + "] in [" + aArchive
                        + "] points outside the target directory");
            }

            if (entry.isDirectory()) {
                FileUtils.forceMkdir(out);
            }
            else {
                FileUtils.copyInputStreamToFile(new CloseShieldInputStream(aArchiveStream), out);
            }
        }
    }
}