package edu.isi.bmkeg.lapdf.uima.cr; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.log4j.Logger; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.collection.CollectionReader_ImplBase; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.uimafit.descriptor.ConfigurationParameter; import org.uimafit.factory.ConfigurationParameterFactory; import edu.isi.bmkeg.utils.ISI_UIMA_PDFUtils; public class DirectoryCollectionReader extends CollectionReader_ImplBase { private static Logger logger = Logger.getLogger(DirectoryCollectionReader.class); private final boolean DEBUG = false; public static final String ITEMS_TO_SKIP = ConfigurationParameterFactory.createConfigurationParameterName( DirectoryCollectionReader.class, "itemsToSkip"); @ConfigurationParameter(mandatory=false, description="this number indicates the number of items to skip before beginning processing") private int itemsToSkip; public static final String END_INDEX = ConfigurationParameterFactory.createConfigurationParameterName( DirectoryCollectionReader.class, "endIndex"); @ConfigurationParameter(mandatory=false, description="this number indicates the termination point") private int endIndex; /** * The directory that files will be read from. If a file is used as the * input parameter, then only that file will be processed. */ public static final String DIRECTORY = ConfigurationParameterFactory.createConfigurationParameterName( DirectoryCollectionReader.class,"directory"); @ConfigurationParameter(mandatory=false, description="This is the input directory.") private String directory; /** * A boolean flag which determines if the files from subdirectories are * processed, or if only the files in the top-level directory are used. */ public static final String DIR_RECURSION = ConfigurationParameterFactory.createConfigurationParameterName( DirectoryCollectionReader.class,"dirRecursion" ); @ConfigurationParameter(mandatory=false, description="This is a flag to decide whether to recurse into subdirs.") private Boolean dirRecursion; /** * Enables the user to filter based on file suffix. The user must include * the "." as part of the suffix. */ public static final String FILE_SUFFIX = ConfigurationParameterFactory.createConfigurationParameterName( DirectoryCollectionReader.class,"fileSuffix" ); @ConfigurationParameter(mandatory=false, description="This is the file suffix to match files agains and process.") private String fileSuffix; private int numberOfFilesProcessed; private List<String> fileSuffixesToProcess; protected List<File> filesToProcess; protected Iterator<File> fileIterator; /** * In the initialization of this collection reader, the file names for all * files to be processed are collected */ @Override public void initialize() throws ResourceInitializationException { fileSuffixesToProcess = new ArrayList<String>(); numberOfFilesProcessed = 0; /* get input parameters from descriptor file */ directory = (String) getConfigParameterValue(DIRECTORY); itemsToSkip = (Integer) getConfigParameterValue(ITEMS_TO_SKIP); endIndex = (Integer) getConfigParameterValue(END_INDEX); dirRecursion = ((Boolean) getConfigParameterValue(DIR_RECURSION)) .booleanValue(); fileSuffix = (String) getConfigParameterValue(FILE_SUFFIX); fileSuffixesToProcess.add(fileSuffix); /* initialize list to hold files to process */ filesToProcess = new ArrayList<File>(); /* Recurse through directories to get files to process */ System.err.println("Initializing DirectoryOfFilesCollectionReader on directory: " + directory); File root = new File(directory); if (root.isFile()) { filesToProcess.add(root); } else if (root.isDirectory()) { processDirectory(root, filesToProcess); } else { error("Invalid input detected. Document collection root is neither a file nor a directory."); } fileIterator = filesToProcess.iterator(); if(itemsToSkip>0){ int i = itemsToSkip; while(i>0){ fileIterator.next(); i--; } logger.info("Skipping "+itemsToSkip+" PDF files"); } System.err.println("CR initialization complete. # files to process: " + filesToProcess.size()); //filesToProcess = null; } /* recurse through a directory tree, adding files to the filesToProcess list */ protected void processDirectory(File dir, List<File> fileList) { String[] files = dir.list(); for (String file : files) { File f = new File(dir.getAbsolutePath() + "/" + file); /* * If it is an html file, then add it to the filesToProcess list, if * it is a directory, then process the directory. */ if (f.isFile()) { if (checkForValidSuffix(f)) { fileList.add(f); } } else if (f.isDirectory() && dirRecursion) { processDirectory(f, fileList); } else if (f.isDirectory()) { // do nothing.. the recurseIntoDirectoryStructure flag is set to // false } else { System.err .println("Error. Expecting a file or directory, but encountered something else... " + f.getPath()); } } } protected boolean checkForValidSuffix(File f) { if (fileSuffixesToProcess.size() == 0) { /* if no suffixes were specified, then we process every file type */ return true; } else { for (String suffix : fileSuffixesToProcess) { if (f.getName().endsWith(suffix)) { return true; } } } return false; } /** * @see com.ibm.uima.collection.CollectionReader#getNext(com.ibm.uima.cas.CAS) */ public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } String name=fileIterator.next().getAbsolutePath(); numberOfFilesProcessed++; ISI_UIMA_PDFUtils.setDocumentSecondaryIDs(jcas, name); } /** * @see com.ibm.uima.arg0collection.base_cpm.BaseCollectionReader#hasNext() */ public boolean hasNext() throws IOException, CollectionException { if(endIndex>0){ if(numberOfFilesProcessed < filesToProcess.size() && numberOfFilesProcessed+itemsToSkip<endIndex){ return true; }else{ return false; } }else{ if(numberOfFilesProcessed < filesToProcess.size()){ return true; } } return false; } public void close() throws IOException { } private void error(String message) { System.err.println("ERROR -- DirectoryOfFilesCollectionReader: " + message); } @SuppressWarnings("unused") private void warn(String message) { System.err.println("WARNING -- DirectoryOfFilesCollectionReader: " + message); } @SuppressWarnings("unused") private void debug(String message) { if (DEBUG) { System.err.println("DEBUG -- DirectoryOfFilesCollectionReader: " + message); } } @Override public Progress[] getProgress() { // TODO Auto-generated method stub return null; } }