/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.api.io; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.tools.ant.DirectoryScanner; import org.apache.tools.ant.types.resources.FileResource; import org.apache.tools.ant.types.resources.FileResourceIterator; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.CasCollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; /** * Base class for file system collection readers. Uses an Ant FileSet to conveniently walk the * file system. * <p> * Example of a hypothetic <code>FooReader</code> that should read only files ending in * <code>.foo</code> from in the directory <code>foodata</code> or any subdirectory thereof: * <pre> * CollectionReader reader = createReader(FooReader.class, * FileSetCollectionReaderBase.PARAM_LANGUAGE, "en", * FileSetCollectionReaderBase.PARAM_SOURCE_LOCATION, "some/path", * FileSetCollectionReaderBase.PARAM_PATTERNS, "[+]foodata/**/*.foo" ); * </pre> * @since 1.0.6 * @deprecated use {@link ResourceCollectionReaderBase} instead. */ @Deprecated public abstract class FileSetCollectionReaderBase extends CasCollectionReader_ImplBase { public static final String INCLUDE_PREFIX = "[+]"; public static final String EXCLUDE_PREFIX = "[-]"; /** * Location from which the input is read. * * @deprecated use {@link #PARAM_SOURCE_LOCATION} */ @Deprecated public static final String PARAM_PATH = ComponentParameters.PARAM_SOURCE_LOCATION; /** * Location from which the input is read. */ public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION; @ConfigurationParameter(name=PARAM_SOURCE_LOCATION, mandatory=false) private File sourceLocation; /** * A set of Ant-like include/exclude patterns. A pattern starts with {@link #INCLUDE_PREFIX [+]} * if it is an include pattern and with {@link #EXCLUDE_PREFIX [-]} if it is an exclude pattern. * The wildcard <code>/**/</code> can be used to address any number of sub-directories. * The wildcard {@code *} can be used to a address a part of a name. */ public static final String PARAM_PATTERNS = "patterns"; @ConfigurationParameter(name=PARAM_PATTERNS, mandatory=true) private String[] patterns; /** * Use the default excludes. */ public static final String PARAM_USE_DEFAULT_EXCLUDES = "useDefaultExcludes"; @ConfigurationParameter(name=PARAM_USE_DEFAULT_EXCLUDES, mandatory=true, defaultValue="true") private boolean useDefaultExcludes; /** * The language. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name=PARAM_LANGUAGE, mandatory=false) private String language; /** * States whether the matching is done case sensitive. (default: true) */ public static final String PARAM_CASE_SENSITIVE= "caseSensitive"; @ConfigurationParameter(name=PARAM_CASE_SENSITIVE, mandatory=false, defaultValue="true") private boolean caseSensitive; private DirectoryScanner directoryScanner; private int completed; private Iterator<FileResource> fileSetIterator; @SuppressWarnings("unchecked") @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); // Configure the FileSet. directoryScanner = new DirectoryScanner(); if (sourceLocation != null) { directoryScanner.setBasedir(sourceLocation); } // Configure case sensitivity directoryScanner.setCaseSensitive(caseSensitive); // Parse the patterns and inject them into the FileSet List<String> includes = new ArrayList<String>(); List<String> excludes = new ArrayList<String>(); for (String pattern : patterns) { if (pattern.startsWith(INCLUDE_PREFIX)) { includes.add(pattern.substring(INCLUDE_PREFIX.length())); } else if (pattern.startsWith(EXCLUDE_PREFIX)) { excludes.add(pattern.substring(EXCLUDE_PREFIX.length())); } else if (pattern.matches("^\\[.\\].*")) { throw new ResourceInitializationException(new IllegalArgumentException( "Patterns have to start with " + INCLUDE_PREFIX + " or " + EXCLUDE_PREFIX + ".")); } else { includes.add(pattern); } } // These should be the same as documented here: http://ant.apache.org/manual/dirtasks.html if (useDefaultExcludes) { excludes.add("**/*~"); excludes.add("**/#*#"); excludes.add("**/.#*"); excludes.add("**/%*%"); excludes.add("**/._*"); excludes.add("**/CVS"); excludes.add("**/CVS/**"); excludes.add("**/.cvsignore"); excludes.add("**/SCCS"); excludes.add("**/SCCS/**"); excludes.add("**/vssver.scc"); excludes.add("**/.svn"); excludes.add("**/.svn/**"); excludes.add("**/.DS_Store"); excludes.add("**/.git"); excludes.add("**/.git/**"); excludes.add("**/.gitattributes"); excludes.add("**/.gitignore"); excludes.add("**/.gitmodules"); excludes.add("**/.hg"); excludes.add("**/.hg/**"); excludes.add("**/.hgignore"); excludes.add("**/.hgsub"); excludes.add("**/.hgsubstate"); excludes.add("**/.hgtags"); excludes.add("**/.bzr"); excludes.add("**/.bzr/**"); excludes.add("**/.bzrignore"); } directoryScanner.setIncludes(includes.toArray(new String[includes.size()])); directoryScanner.setExcludes(excludes.toArray(new String[excludes.size()])); directoryScanner.scan(); // Get the iterator that will be used to actually traverse the FileSet. fileSetIterator = new FileResourceIterator(null, sourceLocation, directoryScanner.getIncludedFiles()); getLogger().info("Found [" + getIncludedFilesCount() + "] files to be read"); } protected int getIncludedFilesCount() { return directoryScanner.getIncludedFilesCount(); } protected Iterator<FileResource> getFileSetIterator() { return fileSetIterator; } protected FileResource nextFile() { try { return fileSetIterator.next(); } finally { completed++; } } @Override public Progress[] getProgress() { return new Progress[] { new ProgressImpl(completed, getIncludedFilesCount(), "file") }; } @Override public boolean hasNext() throws IOException, CollectionException { return fileSetIterator.hasNext(); } /** * Initialize the {@link DocumentMetaData}. This must be called before setting the document * text, otherwise the end feature of this annotation will not be set correctly. * * @param aCas * the CAS. * @param aFile * the file from which the CAS is initialized. * @param aQualifier * a qualifier if multiple CASes are generated from the same file. */ protected void initCas(CAS aCas, FileResource aFile, String aQualifier) { String qualifier = aQualifier != null ? "#"+aQualifier : ""; try { // Set the document metadata DocumentMetaData docMetaData = DocumentMetaData.create(aCas); File file = aFile.getFile(); docMetaData.setDocumentTitle(file.getName()); docMetaData.setDocumentUri(file.toURI().toString()+qualifier); docMetaData.setDocumentId(aFile.getName()+qualifier); if (aFile.getBaseDir() != null) { docMetaData.setDocumentBaseUri(sourceLocation.toURI().toString()); docMetaData.setCollectionId(aFile.getBaseDir().getPath()); } // Set the document language aCas.setDocumentLanguage(language); } catch (CASException e) { // This should not happen. throw new RuntimeException(e); } } public String getLanguage() { return language; } }