/**
* Copyright (c) 2008-2009 Mark Logic Corporation. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* The use of the Apache License does not indicate that this project is
* affiliated with the Apache Software Foundation.
*/
package com.marklogic.recordloader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
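/**
* Input handler that queues one loader per zip entry, gzip file and plain
* file found among the configured inputs (recursing into directories), and
* falls back to reading standard input when no inputs were specified.
*
* @author Michael Blakeley, michael.blakeley@marklogic.com
*
*/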
public class DefaultInputHandler extends AbstractInputHandler {
// Filter applied when listing directory contents; assigned in handleFiles().
private FileFilter filter;
// Inputs queued as plain files or directories: the configured input path,
// plus any command-line input that ends in neither the zip suffix nor ".gz".
private ArrayList<File> plainFiles = new ArrayList<File>();
// Command-line inputs whose name ends with Configuration.ZIP_SUFFIX.
private ArrayList<File> zipFiles = new ArrayList<File>();
// Command-line inputs whose name ends with ".gz".
private ArrayList<File> gzFiles = new ArrayList<File>();
// True once an input path or at least one input was specified, even if
// none of them passed checkPath(); run() uses it to fail instead of
// silently falling back to standard input.
private boolean hadInputs;
// Number of loaders handed to the pool so far; incremented by submit().
private int inputCount = 0;
// Copied from config.getFileSizeLimit() at the start of run(); when
// positive, plain files longer than this many bytes are skipped.
private long sizeLimit;
/**
 * Gathers the configured inputs and queues a loader for each of them.
 * <p>
 * Zip files are queued first, then gzip files, then plain files and
 * directories. If inputs were specified but none of them could be used, a
 * {@link FatalException} is raised. If no inputs were specified at all, a
 * single loader reading standard input is queued instead.
 *
 * @throws LoaderException if queuing the inputs fails with an I/O error
 * @throws FatalException if inputs were specified but none were found
 * @see com.marklogic.recordloader.InputHandlerInterface#run()
 */
public void run() throws LoaderException, FatalException {
    sizeLimit = config.getFileSizeLimit();
    configureInputs();

    final int zipCount = zipFiles.size();
    final int gzCount = gzFiles.size();
    final int plainCount = plainFiles.size();
    logger.fine("zipFiles.size = " + zipCount);
    logger.fine("gzFiles.size = " + gzCount);
    logger.fine("plainFiles.size = " + plainCount);

    final boolean foundAny = zipCount > 0 || gzCount > 0
            || plainCount > 0;

    if (!foundAny) {
        if (hadInputs) {
            throw new FatalException(
                    "input files specified, but none found");
        }

        // nothing was specified: read a single stream from standard input
        if (config.getThreadCount() > 1) {
            logger.warning("Will not use multiple threads!");
            // one input means one worker is all that can ever be busy
            pool.setCorePoolSize(1);
            pool.setMaximumPoolSize(1);
        }
        // standard input has no file name to derive identifiers from
        if (config.isUseFilenameIds()) {
            logger.warning("Ignoring configured "
                    + Configuration.ID_NAME_KEY + "="
                    + config.getIdNodeName() + " for standard input");
            config.setUseAutomaticIds();
        }
        getFactory();
        handleStandardInput();
        return;
    }

    getFactory();
    if (config.isFirstLoop()) {
        logger.info("populating queue");
    }
    try {
        // zip entries go first, then gzip files, then everything else
        handleZipFiles();
        handleGzFiles();
        handleFiles();
        if (config.isFirstLoop()) {
            logger.info("queued " + inputCount + " loader(s)");
        }
    } catch (ZipException e) {
        throw new LoaderException(e);
    } catch (IOException e) {
        throw new LoaderException(e);
    }
}
/**
 * Installs the filter used when listing directories and then queues
 * everything in {@link #plainFiles}.
 * <p>
 * The filter accepts every directory (so recursion can continue) and any
 * regular file whose name either matches the configured input pattern or
 * ends with {@link Configuration#ZIP_SUFFIX}.
 *
 * @throws IOException if a path cannot be resolved or an input cannot be
 *         opened
 * @throws LoaderException if a loader cannot be created
 */
private void handleFiles() throws IOException, LoaderException {
    filter = new FileFilter() {
        public boolean accept(File _f) {
            if (_f.isDirectory()) {
                return true;
            }
            // The zip-suffix test must sit behind the isFile() check too:
            // previously "a || (b && c) || d" let any non-file entry whose
            // name ended in the zip suffix slip through.
            if (!_f.isFile()) {
                return false;
            }
            String name = _f.getName();
            return name.matches(config.getInputPattern())
                    || name.endsWith(Configuration.ZIP_SUFFIX);
        }
    };
    handleFiles(plainFiles);
}
/**
 * Queues the given files, recursing into directories.
 * <p>
 * Directories are listed through {@link #filter} and their contents are
 * handled recursively. Files whose canonical path ends with
 * {@link Configuration#ZIP_SUFFIX} are handed to the zip handler. When
 * {@link #sizeLimit} is positive, larger files are skipped. Everything
 * else is submitted as a plain-file loader.
 *
 * @param _files files and directories to queue
 * @throws IOException if a canonical path cannot be resolved or a zip
 *         file cannot be read
 * @throws LoaderException if a loader cannot be created
 */
private void handleFiles(ArrayList<File> _files) throws IOException,
        LoaderException {
    for (File file : _files) {
        String canonicalPath = file.getCanonicalPath();

        if (file.isDirectory()) {
            logger.fine("directory " + canonicalPath);
            File[] dirList = file.listFiles(filter);
            if (null == dirList) {
                // listFiles returns null on an I/O error or when the
                // directory cannot be listed; skip it rather than fail
                // with a NullPointerException
                logger.warning("skipping " + canonicalPath
                        + ": directory cannot be listed.");
                continue;
            }
            if (dirList.length > 0) {
                logger.info("queuing contents of " + canonicalPath
                        + ": " + dirList.length);
                ArrayList<File> newlist = new ArrayList<File>(
                        dirList.length);
                for (File child : dirList) {
                    newlist.add(child);
                }
                logger.finer("queuing " + newlist.size() + " items");
                handleFiles(newlist);
            } else {
                logger.fine("skipping " + canonicalPath
                        + ": no matches");
            }
            continue;
        }

        if (canonicalPath.endsWith(Configuration.ZIP_SUFFIX)) {
            // inefficient, but how many zip files will you queue?
            ArrayList<File> zipList = new ArrayList<File>();
            zipList.add(file);
            handleZipFiles(zipList);
            continue;
        }

        // a non-positive limit means "no limit"
        if (0 < sizeLimit && file.length() > sizeLimit) {
            logger.info("skipping " + canonicalPath
                    + ": larger than " + sizeLimit + " B");
            continue;
        }

        // plain file - add to the queue
        submit(canonicalPath, factory.newLoader(file));
    }
}
/**
 * Queues one loader per gzip file in {@link #gzFiles}.
 * <p>
 * Files named <code>*.tar.gz</code> or <code>*.tgz</code> are assumed to
 * be tar archives and are skipped with a warning. Each remaining file is
 * opened and wrapped in a {@link GZIPInputStream} that is handed to the
 * loader. If the file cannot be queued (for example, it is not valid gzip
 * data), the stream opened here is closed before the error propagates.
 *
 * @throws IOException if a file cannot be opened or is not gzip data
 * @throws LoaderException if a loader cannot be created
 */
private void handleGzFiles() throws IOException, LoaderException {
    if (null == gzFiles) {
        return;
    }

    for (File file : gzFiles) {
        String name = file.getName();
        if (name.endsWith(".tar.gz") || name.endsWith(".tgz")) {
            // assume a tar file
            logger.warning("skipping unsupported tar file "
                    + file.getCanonicalPath());
            continue;
        }

        String path = file.getPath();
        FileInputStream raw = new FileInputStream(file);
        GZIPInputStream gz = null;
        boolean queued = false;
        try {
            // the constructor reads the gzip header and may throw
            gz = new GZIPInputStream(raw);
            submit(path, factory.newLoader(gz, name, path));
            queued = true;
        } finally {
            if (!queued) {
                // nobody else owns the stream yet, so release it here
                try {
                    if (null != gz) {
                        gz.close();
                    } else {
                        raw.close();
                    }
                } catch (IOException e) {
                    // do not mask the original failure
                    logger.fine("error closing " + path + ": " + e);
                }
            }
        }
    }
}
/**
 * Hands a loader to the pool and records it in the queued-input count.
 *
 * @param _path label for the input, used only in the log message
 * @param _loader loader to submit to the pool
 */
private void submit(String _path, LoaderInterface _loader) {
    pool.submit(_loader);
    inputCount += 1;
    final String message = "queued " + inputCount + ": " + _path;
    logger.fine(message);
}
/**
 * Queues a single loader that reads from {@link System#in}.
 *
 * @throws LoaderException if the loader cannot be created
 * @throws SecurityException declared by the original signature
 */
private void handleStandardInput() throws LoaderException,
        SecurityException {
    logger.info("Reading from standard input...");
    final String label = "standard input";
    submit(label, factory.newLoader(System.in));
}
/**
 * Queues the entries of every file in {@link #zipFiles}, if there are any.
 *
 * @throws ZipException if a zip file cannot be opened
 * @throws IOException if a zip file cannot be read
 * @throws LoaderException if a loader cannot be created
 */
private void handleZipFiles() throws ZipException, IOException,
        LoaderException {
    final boolean haveZips = null != zipFiles && !zipFiles.isEmpty();
    if (haveZips) {
        handleZipFiles(zipFiles);
    }
}
/**
 * Queues one loader per matching entry of each given zip file.
 * <p>
 * Directory entries and entries whose full name does not match the
 * configured input pattern are skipped. Each zip is opened through a
 * {@link ZipReference}: one reference is taken for the duration of the
 * queuing, one more for every submitted entry, and the queuing reference
 * is released once all entries of that zip have been examined.
 * <p>
 * NOTE(review): if getInputStream, newLoader or submit throws inside the
 * entry loop, the closeReference() call after the loop is not reached;
 * confirm whether the caller or the monitor cleans up in that case.
 *
 * @param zipFiles zip files to open and queue (shadows the field)
 * @throws ZipException if a zip file cannot be opened
 * @throws IOException if a zip file or one of its entries cannot be read
 * @throws LoaderException if a loader cannot be created
 */
private void handleZipFiles(ArrayList<File> zipFiles)
        throws ZipException, IOException, LoaderException {
    final String pattern = config.getInputPattern();
    int queuedArchives = 0;

    for (File archive : zipFiles) {
        ZipReference zip;
        try {
            zip = new ZipReference(archive, logger);
        } catch (ZipException e) {
            // user-friendly error message
            logger.warning("Error opening " + archive.getCanonicalPath()
                    + ": " + e + " " + e.getMessage());
            throw e;
        }

        // hold the zip open for as long as we are queuing from it
        zip.addReference();
        Enumeration<? extends ZipEntry> entries = zip.entries();
        int entryTotal = zip.size();
        String archivePath = archive.getCanonicalPath();
        logger.fine("queuing " + entryTotal + " entries from zip file "
                + archivePath);

        String zipName = zip.getName();
        // monitor will track the references for us
        // TODO teach the Callable to track the zip references?
        monitor.add(zip, zipName);

        int queuedEntries = 0;
        while (entries.hasMoreElements()) {
            ZipEntry entry = entries.nextElement();
            logger.fine("found zip entry " + entry);
            // getName returns the full entry path
            String entryName = entry.getName();

            if (entry.isDirectory()) {
                logger.finer("skipping directory entry " + entryName);
                continue;
            }
            if (!entryName.matches(pattern)) {
                logger.info("skipping " + entryName);
                continue;
            }

            // each queued entry keeps the zip alive through its own
            // reference, taken before the loader is submitted
            zip.addReference();
            submit(zipName + "/" + entryName, factory.newLoader(
                    zip.getInputStream(entry), zipName, entryName));
            queuedEntries++;
            if (0 == queuedEntries % 1000) {
                logger.finer("queued " + queuedEntries
                        + " entries from zip file " + archivePath);
            }
        }

        logger.fine("queued " + queuedEntries
                + " entries from zip file " + archivePath);
        // done queuing: give back the reference taken above
        zip.closeReference();

        if (queuedEntries < 1) {
            // nothing from this one; it does not leak, since the
            // reference just closed was the last one
            logger.info("no entries queued from " + zipName);
        } else {
            queuedArchives++;
            if (0 == queuedArchives % 100) {
                logger.info("queued " + queuedArchives + " zip files");
            }
        }
    }
}
/**
 * Sorts the configured inputs into {@link #plainFiles}, {@link #zipFiles}
 * and {@link #gzFiles}.
 * <p>
 * The configured input path, if any, is always treated as a plain input.
 * Each remaining input is classified by its name suffix. Inputs that fail
 * {@link #checkPath(File)} are dropped, but {@link #hadInputs} is still
 * set so the caller can tell "nothing usable" from "nothing specified".
 */
private void configureInputs() {
    // handle input-path property, if any
    final String inputPath = config.getInputPath();
    if (null != inputPath) {
        hadInputs = true;
        File pathFile = new File(inputPath);
        if (checkPath(pathFile)) {
            logger.info("adding " + inputPath);
            plainFiles.add(pathFile);
        }
    }

    if (inputs.length > 0) {
        hadInputs = true;
    }

    for (String input : inputs) {
        File candidate = new File(input);
        if (!checkPath(candidate)) {
            continue;
        }

        // pick the destination list from the input's suffix
        ArrayList<File> destination;
        if (input.endsWith(Configuration.ZIP_SUFFIX)) {
            destination = zipFiles;
        } else if (input.endsWith(".gz")) {
            destination = gzFiles;
        } else {
            destination = plainFiles;
        }
        destination.add(candidate);
    }
}
/**
 * Checks that an input exists and is readable, logging a warning when it
 * is not.
 *
 * @param _file input to check
 * @return true if the file exists and can be read, false otherwise
 */
private boolean checkPath(File _file) {
    String problem = null;
    if (!_file.exists()) {
        problem = ": file does not exist.";
    } else if (!_file.canRead()) {
        problem = ": file cannot be read.";
    }

    if (null == problem) {
        return true;
    }
    logger.warning("skipping " + _file.getPath() + problem);
    return false;
}
}