/**
* Copyright (c) 2002-2014 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.ldbc.driver.csv.charseeker;
import java.io.*;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
/**
* Means of instantiating common {@link CharReadable} instances.
* <p/>
* There is support for compressed files as well, for those methods accepting a {@link File} argument.
* <ol>
* <li>ZIP: is both an archive and a compression format. In many cases the order of files
* is important, and for a ZIP archive with multiple files the order of the files is whatever order
* was set by the tool that created the ZIP archive. Therefore only single-file ZIP files are supported.
* The single file in the given ZIP archive will be decompressed on the fly, while reading.</li>
* <li>GZIP: is only a compression format and so will be decompressed on the fly, while reading.</li>
* </ol>
*/
public class Readables {
    /**
     * First 4 bytes of a ZIP file have this signature.
     */
    private static final int ZIP_MAGIC = 0x504b0304;
    /**
     * First 2 bytes of a GZIP file have this signature.
     */
    private static final int GZIP_MAGIC = 0x1f8b;

    private Readables() {
        throw new AssertionError("No instances allowed");
    }

    /**
     * A {@link CharReadable} with no content: every read reports end of data ({@code -1})
     * and closing it does nothing.
     */
    public static final CharReadable EMPTY = new CharReadable() {
        @Override
        public int read(char[] buffer, int offset, int length) throws IOException {
            return -1;
        }

        @Override
        public void close() throws IOException { // Nothing to close
        }
    };

    /**
     * Adapts a {@link Reader} to the {@link CharReadable} interface.
     *
     * @param reader reader to delegate {@code read} and {@code close} calls to.
     * @return a {@link CharReadable} backed by {@code reader}; closing it closes {@code reader}.
     */
    public static CharReadable wrap(final Reader reader) {
        return new CharReadable() {
            @Override
            public int read(char[] buffer, int offset, int length) throws IOException {
                return reader.read(buffer, offset, length);
            }

            @Override
            public void close() throws IOException {
                reader.close();
            }
        };
    }

    /**
     * Opens a {@link File} as a {@link CharReadable}, transparently decompressing ZIP and GZIP files
     * which are recognized by the first bytes of the file. Characters are decoded using the
     * platform default charset.
     */
    private static final RawFunction<File, CharReadable, IOException> FROM_FILE = new RawFunction<File, CharReadable, IOException>() {
        @Override
        public CharReadable apply(File file) throws IOException {
            int signature = magic(file);
            if (signature == ZIP_MAGIC) { // ZIP file
                return openZip(file);
            }
            if ((signature >>> 16) == GZIP_MAGIC) { // GZIP file, identified by its first two bytes
                return openGzip(file);
            }
            return wrap(new FileReader(file));
        }

        /**
         * Opens the single suitable entry of a ZIP archive. The {@link ZipFile} is kept open for as
         * long as the returned readable is in use and is closed together with it. If anything fails
         * before the readable is handed out, the archive is closed right away.
         */
        private CharReadable openZip(File file) throws IOException {
            final ZipFile zipFile = new ZipFile(file);
            try {
                ZipEntry entry = getSingleSuitableEntry(zipFile);
                final Reader reader = new InputStreamReader(zipFile.getInputStream(entry));
                return new CharReadable() {
                    @Override
                    public int read(char[] buffer, int offset, int length) throws IOException {
                        return reader.read(buffer, offset, length);
                    }

                    @Override
                    public void close() throws IOException {
                        // Closing only the entry stream would leave the archive's file handle open.
                        try {
                            reader.close();
                        } finally {
                            zipFile.close();
                        }
                    }
                };
            } catch (IOException | RuntimeException e) {
                closeAfterFailure(zipFile, e);
                throw e;
            }
        }

        /**
         * Opens a GZIP compressed file. GZIP isn't an archive like ZIP, so this is purely data that
         * is compressed. Although a very common way of compressing with GZIP is to use TAR which can
         * combine many files into one blob, which is then compressed. If that's the case then
         * the data will look like garbage and the reader will fail for whatever it will be used for.
         * TODO add tar support
         */
        private CharReadable openGzip(File file) throws IOException {
            FileInputStream fileStream = new FileInputStream(file);
            try {
                // The GZIPInputStream constructor reads the header and may throw, in which case
                // the underlying file stream must not be left open.
                return wrap(new InputStreamReader(new GZIPInputStream(fileStream)));
            } catch (IOException | RuntimeException e) {
                closeAfterFailure(fileStream, e);
                throw e;
            }
        }

        /**
         * Finds the one entry in the archive which is neither a directory nor rejected by
         * {@link #invalidZipEntry(String)}.
         *
         * @throws IOException if there is no such entry, or more than one.
         */
        private ZipEntry getSingleSuitableEntry(ZipFile zipFile) throws IOException {
            List<String> unsuitableEntries = new ArrayList<>();
            Enumeration<? extends ZipEntry> enumeration = zipFile.entries();
            ZipEntry found = null;
            while (enumeration.hasMoreElements()) {
                ZipEntry entry = enumeration.nextElement();
                if (entry.isDirectory() || invalidZipEntry(entry.getName())) {
                    unsuitableEntries.add(entry.getName());
                    continue;
                }
                if (found != null) {
                    throw new IOException("Multiple suitable files found in zip file " + zipFile.getName() +
                            ", at least " + found.getName() + " and " + entry.getName() +
                            ". Only a single file per zip file is supported");
                }
                found = entry;
            }
            if (found == null) {
                throw new IOException("No suitable file found in zip file " + zipFile.getName() + "." +
                        (!unsuitableEntries.isEmpty() ?
                                " Although found these unsuitable entries " + unsuitableEntries : ""));
            }
            return found;
        }

        /**
         * Reads the first 4 bytes of the file as a big-endian int.
         *
         * @return the first 4 bytes, or {@code -1} if the file is shorter than that.
         */
        private int magic(File file) throws IOException {
            try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
                return in.readInt();
            } catch (EOFException e) {
                return -1;
            }
        }
    };

    /**
     * Closes {@code resource} while handling {@code failure}, attaching any exception from the close
     * as suppressed so that the original failure is the one which gets reported.
     */
    private static void closeAfterFailure(Closeable resource, Throwable failure) {
        try {
            resource.close();
        } catch (IOException closeFailure) {
            failure.addSuppressed(closeFailure);
        }
    }

    /**
     * @return {@code true} for entry names containing {@code __MACOSX}, starting with a dot,
     * or containing a path segment starting with a dot.
     */
    private static boolean invalidZipEntry(String name) {
        return name.contains("__MACOSX") ||
                name.startsWith(".") ||
                name.contains("/.");
    }

    private static final RawFunction<CharReadable, CharReadable, IOException> IDENTITY =
            new RawFunction<CharReadable, CharReadable, IOException>() {
                @Override
                public CharReadable apply(CharReadable in) {
                    return in;
                }
            };

    /**
     * Opens a single file for reading, decompressing ZIP (single entry only) and GZIP on the fly.
     * The caller is responsible for closing the returned readable.
     *
     * @param file file to open.
     * @return a {@link CharReadable} over the (decompressed) contents of {@code file}.
     * @throws IOException if the file cannot be opened, or is a ZIP archive without exactly one
     * suitable entry.
     */
    public static CharReadable file(File file) throws IOException {
        return FROM_FILE.apply(file);
    }

    /**
     * Combines several files into one {@link MultiReadable}. Each file is opened, the same way as
     * by {@link #file(File)}, when the returned iterator is asked for it.
     *
     * @param files files to read, at least one.
     * @throws IllegalStateException if no files are given.
     */
    public static CharReadable multipleFiles(File... files) {
        return new MultiReadable(iterator(files, FROM_FILE));
    }

    /**
     * Combines several already opened sources into one {@link MultiReadable}.
     *
     * @param sources sources to read, at least one.
     * @throws IllegalStateException if no sources are given.
     */
    public static CharReadable multipleSources(CharReadable... sources) {
        return new MultiReadable(iterator(sources, IDENTITY));
    }

    /**
     * Combines the files supplied by an iterator into one {@link MultiReadable}. Each file is
     * opened, the same way as by {@link #file(File)}, when it is requested from the iterator.
     *
     * @param files files to read.
     */
    public static CharReadable multipleFiles(Iterator<File> files) {
        return new MultiReadable(iterator(files, FROM_FILE));
    }

    /**
     * Combines the sources supplied by an iterator into one {@link MultiReadable}.
     *
     * @param sources sources to read.
     */
    public static CharReadable multipleSources(RawIterator<CharReadable, IOException> sources) {
        return new MultiReadable(sources);
    }

    /**
     * Adapts an {@link Iterator} into a {@link RawIterator} which converts each item as it is
     * requested. {@code hasNext} and {@code remove} are delegated to {@code items}.
     */
    private static <IN, OUT> RawIterator<OUT, IOException> iterator(final Iterator<IN> items,
            final RawFunction<IN, OUT, IOException> converter) {
        return new RawIterator<OUT, IOException>() {
            @Override
            public boolean hasNext() {
                return items.hasNext();
            }

            @Override
            public OUT next() throws IOException {
                return converter.apply(items.next());
            }

            @Override
            public void remove() {
                items.remove();
            }
        };
    }

    /**
     * Adapts an array into a {@link RawIterator} which converts each item as it is requested.
     * Removal is not supported.
     *
     * @throws IllegalStateException if {@code items} is empty.
     */
    private static <IN, OUT> RawIterator<OUT, IOException> iterator(final IN[] items,
            final RawFunction<IN, OUT, IOException> converter) {
        if (items.length == 0) {
            throw new IllegalStateException("No source items specified");
        }
        return new RawIterator<OUT, IOException>() {
            private int cursor;

            @Override
            public boolean hasNext() {
                return cursor < items.length;
            }

            @Override
            public OUT next() throws IOException {
                if (!hasNext()) {
                    throw new IllegalStateException();
                }
                return converter.apply(items[cursor++]);
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }
}