/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.actions;

import static java.util.Arrays.asList;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.sevenz.SevenZArchiveEntry;
import org.apache.commons.compress.archivers.sevenz.SevenZFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;

import com.github.junrar.Archive;
import com.github.junrar.exception.RarException;
import com.github.junrar.impl.FileVolumeManager;
import com.github.junrar.rarfile.FileHeader;

import de.tudarmstadt.ukp.dkpro.core.api.datasets.ActionDescription;
import de.tudarmstadt.ukp.dkpro.core.api.datasets.ArtifactDescription;
import de.tudarmstadt.ukp.dkpro.core.api.datasets.DatasetDescription;
import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.DatasetDescriptionImpl;
import de.tudarmstadt.ukp.dkpro.core.api.datasets.internal.util.AntFileFilter;

/**
 * Action which explodes (extracts) an archive artifact into a subfolder of the dataset.
 */
public class Explode
    extends Action_ImplBase
{
    @Override
    public void apply(ActionDescription aAction, DatasetDescription aDataset,
            ArtifactDescription aPack, Path aCachedFile)
        throws Exception
    {
        DatasetDescriptionImpl dsi = (DatasetDescriptionImpl) aDataset;
        Map<String, Object> cfg = aAction.getConfiguration();

        // Sometimes, we have to explode a file that was created as the result of exploding the
        // main artifact. Thus, we can override the target.
        Path targetFile = cfg.containsKey("file")
                ? dsi.getOwner().resolve(dsi).resolve((String) cfg.get("file"))
                : aCachedFile;

        // Apache Commons Compress does not handle RAR files, so we handle them separately
        if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".rar")) {
            extractRar(aAction, targetFile, dsi.getOwner().resolve(dsi));
        }
        else if (targetFile.toString().toLowerCase(Locale.ENGLISH).endsWith(".7z")) {
            // 7z does not support streaming in Apache Commons Compress
            extract7z(aAction, targetFile, dsi.getOwner().resolve(dsi));
        }
        else {
            // Auto-detect the archive format using Apache Commons Compress
            try (InputStream is = new BufferedInputStream(Files.newInputStream(targetFile))) {
                InputStream uncompressed;

                try {
                    uncompressed = new BufferedInputStream(
                            new CompressorStreamFactory().createCompressorInputStream(is));
                }
                catch (CompressorException e) {
                    // If the compressor is not detected, we may be dealing with an archive format
                    // that compresses internally, e.g. ZIP.
                    uncompressed = is;
                }

                ArchiveInputStream archive = new ArchiveStreamFactory()
                        .createArchiveInputStream(uncompressed);

                extract(aAction, targetFile, archive, dsi.getOwner().resolve(dsi));
            }
        }
    }
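
    // The keys read from the action configuration in apply() are "file" (explode a file produced
    // by a previous action instead of the cached artifact), "strip" (number of leading path
    // segments to drop from each entry), and "includes"/"excludes" (Ant-style path patterns).
    // As a rough, hypothetical illustration only - the exact layout of dataset description files
    // is an assumption here, not something this class defines - an explode action might be
    // configured along these lines:
    //
    //   actions:
    //     - action: explode
    //       configuration:
    //         strip: 1
    //         includes: ["**/*.txt"]
    //         excludes: ["**/README*"]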

    private void extract7z(ActionDescription aAction, Path aCachedFile, Path aTarget)
        throws IOException, RarException
    {
        // We always extract archives into a subfolder. Figure out the name of the folder.
        String base = getBase(aCachedFile.getFileName().toString());

        Map<String, Object> cfg = aAction.getConfiguration();
        int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0;

        AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")),
                coerceToList(cfg.get("excludes")));

        try (SevenZFile archive = new SevenZFile(aCachedFile.toFile())) {
            SevenZArchiveEntry entry = archive.getNextEntry();
            while (entry != null) {
                String name = stripLeadingFolders(entry.getName(), strip);

                if (name == null) {
                    // Stripped to null - nothing left to extract - skip to the next entry
                    entry = archive.getNextEntry();
                    continue;
                }

                if (filter.accept(name)) {
                    Path out = aTarget.resolve(base).resolve(name);
                    if (entry.isDirectory()) {
                        Files.createDirectories(out);
                    }
                    else {
                        Files.createDirectories(out.getParent());
                        try (OutputStream os = Files.newOutputStream(out)) {
                            InputStream is = new SevenZEntryInputStream(archive, entry);
                            IOUtils.copyLarge(is, os);
                        }
                    }
                }

                entry = archive.getNextEntry();
            }
        }
    }

    private void extractRar(ActionDescription aAction, Path aCachedFile, Path aTarget)
        throws IOException, RarException
    {
        // We always extract archives into a subfolder. Figure out the name of the folder.
        String base = getBase(aCachedFile.getFileName().toString());

        Map<String, Object> cfg = aAction.getConfiguration();
        int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0;

        AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")),
                coerceToList(cfg.get("excludes")));

        try (Archive archive = new Archive(new FileVolumeManager(aCachedFile.toFile()))) {
            FileHeader fh = archive.nextFileHeader();
            while (fh != null) {
                String name = stripLeadingFolders(fh.getFileNameString(), strip);

                if (name == null) {
                    // Stripped to null - nothing left to extract - skip to the next entry
                    fh = archive.nextFileHeader();
                    continue;
                }

                if (filter.accept(name)) {
                    Path out = aTarget.resolve(base).resolve(name);
                    if (fh.isDirectory()) {
                        Files.createDirectories(out);
                    }
                    else {
                        Files.createDirectories(out.getParent());
                        try (OutputStream os = Files.newOutputStream(out)) {
                            archive.extractFile(fh, os);
                        }
                    }
                }

                fh = archive.nextFileHeader();
            }
        }
    }
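
    // A small worked example of the entry handling shared by the extract methods (the entry names
    // are hypothetical): with strip = 1, an entry "corpus/en/train.txt" is reduced by
    // stripLeadingFolders(..) to "en/train.txt"; if the include/exclude filter accepts that name,
    // it is written to <target>/<base>/en/train.txt, where <base> is derived from the archive
    // file name by getBase(), e.g. getBase("corpus.tar.gz") yields "corpus". An entry consisting
    // only of the stripped folder (e.g. "corpus") is reduced to null and skipped.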

    private void extract(ActionDescription aAction, Path aArchive, ArchiveInputStream aAStream,
            Path aTarget)
        throws IOException
    {
        // We always extract archives into a subfolder. Figure out the name of the folder.
        String base = getBase(aArchive.getFileName().toString());

        Map<String, Object> cfg = aAction.getConfiguration();
        int strip = cfg.containsKey("strip") ? (int) cfg.get("strip") : 0;

        AntFileFilter filter = new AntFileFilter(coerceToList(cfg.get("includes")),
                coerceToList(cfg.get("excludes")));

        ArchiveEntry entry = null;
        while ((entry = aAStream.getNextEntry()) != null) {
            String name = stripLeadingFolders(entry.getName(), strip);

            if (name == null) {
                // Stripped to null - nothing left to extract - continue with the next entry
                continue;
            }

            if (filter.accept(name)) {
                Path out = aTarget.resolve(base).resolve(name);
                if (entry.isDirectory()) {
                    Files.createDirectories(out);
                }
                else {
                    Files.createDirectories(out.getParent());
                    Files.copy(aAStream, out);
                }
            }
        }
    }

    private String stripLeadingFolders(String aName, int aLevels)
    {
        if (aLevels > 0) {
            Path p = Paths.get(aName);
            if (p.getNameCount() <= aLevels) {
                return null;
            }
            else {
                p = p.subpath(aLevels, p.getNameCount());
                aName = p.toString();
                return aName;
            }
        }
        else {
            return aName;
        }
    }

    public static String getBase(String aFilename)
    {
        // We always extract archives into a subfolder. Figure out the name of the folder.
        String base = aFilename;
        while (base.contains(".")) {
            base = FilenameUtils.removeExtension(base);
        }
        return base;
    }

    @SuppressWarnings("unchecked")
    public static List<String> coerceToList(Object aRaw)
    {
        List<String> cooked;
        if (aRaw == null) {
            return null;
        }
        else if (aRaw instanceof String) {
            cooked = asList((String) aRaw);
        }
        else if (aRaw instanceof List) {
            cooked = (List<String>) aRaw;
        }
        else {
            throw new IllegalArgumentException("Cannot coerce to String list: [" + aRaw + "]");
        }
        return cooked;
    }

    private static class SevenZEntryInputStream
        extends InputStream
    {
        private SevenZFile archive;
        private SevenZArchiveEntry entry;
        // Use a long counter - entry sizes may exceed Integer.MAX_VALUE
        private long totalRead;

        public SevenZEntryInputStream(SevenZFile aArchive, SevenZArchiveEntry aEntry)
        {
            archive = aArchive;
            entry = aEntry;
        }

        @Override
        public int read() throws IOException
        {
            if (totalRead < entry.getSize()) {
                totalRead++;
                return archive.read();
            }
            else {
                return -1;
            }
        }

        @Override
        public int read(byte[] aB, int aOff, int aLen) throws IOException
        {
            if (totalRead < entry.getSize()) {
                int blocksize = (int) Math.min(aLen, entry.getSize() - totalRead);
                int read = archive.read(aB, aOff, blocksize);
                totalRead += read;
                return read;
            }
            else {
                return -1;
            }
        }
    }
}