/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.api.io; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.CasCollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; import org.springframework.core.io.support.PathMatchingResourcePatternResolver; import org.springframework.core.io.support.ResourcePatternResolver; import org.springframework.util.AntPathMatcher; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; /** * Base class for collection readers that plan to access resources on the file system or in the * classpath or basically anywhere where Spring can resolve them. ANT-style patterns are supported * to include or exclude particular resources. * <p> * Example of a hypothetic <code>FooReader</code> that should read only files ending in * <code>.foo</code> from in the directory <code>foodata</code> or any subdirectory thereof: * * <pre> * CollectionReader reader = createReader(FooReader.class, * FooReader.PARAM_LANGUAGE, "en", * FooReader.PARAM_SOURCE_LOCATION, "some/path", * FooReader.PARAM_PATTERNS, "[+]foodata/**/*.foo"); * </pre> * <p> * The list of resources returned is sorted, so for the same set of resources, they are always * returned in the same order. * * @see <a href="http://ant.apache.org/manual/dirtasks.html#patterns">Documentation of <b>ant</b> * patterns</a> * * @since 1.0.6 */ public abstract class ResourceCollectionReaderBase extends CasCollectionReader_ImplBase { protected static final String JAR_PREFIX = "jar:file:"; public static final String INCLUDE_PREFIX = "[+]"; public static final String EXCLUDE_PREFIX = "[-]"; /** * Location from which the input is read. * * @deprecated use {@link #PARAM_SOURCE_LOCATION} */ @Deprecated public static final String PARAM_PATH = ComponentParameters.PARAM_SOURCE_LOCATION; /** * Location from which the input is read. */ public static final String PARAM_SOURCE_LOCATION = ComponentParameters.PARAM_SOURCE_LOCATION; @ConfigurationParameter(name = PARAM_SOURCE_LOCATION, mandatory = false) private String sourceLocation; /** * A set of Ant-like include/exclude patterns. A pattern starts with {@link #INCLUDE_PREFIX [+]} * if it is an include pattern and with {@link #EXCLUDE_PREFIX [-]} if it is an exclude pattern. * The wildcard <code>/**/</code> can be used to address any number of sub-directories. * The wildcard {@code *} can be used to a address a part of a name. */ public static final String PARAM_PATTERNS = ComponentParameters.PARAM_PATTERNS; @ConfigurationParameter(name = PARAM_PATTERNS, mandatory = false) private String[] patterns; /** * Use the default excludes. */ public static final String PARAM_USE_DEFAULT_EXCLUDES = "useDefaultExcludes"; @ConfigurationParameter(name = PARAM_USE_DEFAULT_EXCLUDES, mandatory = true, defaultValue = "true") private boolean useDefaultExcludes; /** * Include hidden files and directories. */ public static final String PARAM_INCLUDE_HIDDEN = "includeHidden"; @ConfigurationParameter(name = PARAM_INCLUDE_HIDDEN, mandatory = true, defaultValue = "false") private boolean includeHidden; /** * Name of optional configuration parameter that contains the language of the documents in the * input directory. If specified, this information will be added to the CAS. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) private String language; /** * Name of optional external (UIMA) resource that contains the Locator for a (Spring) * ResourcePatternResolver implementation for locating (spring) resources. */ public static final String KEY_RESOURCE_RESOLVER = "resolver"; @ExternalResource(key = KEY_RESOURCE_RESOLVER, mandatory = false) private final ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); /** * The frequency with which read documents are logged. Default: 1 (log every document). * <p> * Set to 0 or negative values to deactivate logging. */ public static final String PARAM_LOG_FREQ = "logFreq"; @ConfigurationParameter(name = PARAM_LOG_FREQ, mandatory = true, defaultValue = "1") private int logFreq; private int completed; private Collection<Resource> resources; private Iterator<Resource> resourceIterator; private ProgressMeter progress; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); if ((patterns == null || patterns.length == 0) && StringUtils.isBlank(sourceLocation)) { throw new IllegalArgumentException( "Either a source location, pattern, or both must be specified."); } // if an ExternalResourceLocator providing a custom ResourcePatternResolver // has been specified, use it, by default use PathMatchingResourcePatternresolver // If there are no patterns, then look for a pattern in the location itself. // If the source location contains a wildcard, split it up into a base and a pattern if (patterns == null) { int asterisk = sourceLocation.indexOf('*'); int colon = sourceLocation.indexOf(':'); if (asterisk != -1 && asterisk > colon) { // asterisk < colon in a case such as "classpath*:file.txt" int separator = Math.max(Math.max( sourceLocation.lastIndexOf(File.separatorChar, asterisk), sourceLocation.lastIndexOf('/', asterisk)), sourceLocation.lastIndexOf(':', asterisk)); if (separator != -1) { // If there is a separator before the asterisk use it to separate into // base and pattern. This is meant to catch cases such as "dir/foo*.txt" of // file:foo*.txt patterns = new String[] { INCLUDE_PREFIX + sourceLocation.substring(separator+1) }; sourceLocation = sourceLocation.substring(0, separator+1); } else { patterns = new String[] { INCLUDE_PREFIX + sourceLocation }; sourceLocation = ""; } } } // Parse the patterns and inject them into the FileSet List<String> includes = new ArrayList<String>(); List<String> excludes = getDefaultExcludes(); if (patterns != null) { for (String pattern : patterns) { if (pattern.startsWith(INCLUDE_PREFIX)) { includes.add(pattern.substring(INCLUDE_PREFIX.length())); } else if (pattern.startsWith(EXCLUDE_PREFIX)) { excludes.add(pattern.substring(EXCLUDE_PREFIX.length())); } else if (pattern.matches("^\\[.\\].*")) { throw new ResourceInitializationException(new IllegalArgumentException( "Patterns have to start with " + INCLUDE_PREFIX + " or " + EXCLUDE_PREFIX + ".")); } else { includes.add(pattern); } } } try { if (sourceLocation == null) { ListIterator<String> i = includes.listIterator(); while (i.hasNext()) { i.set(locationToUrl(i.next())); } i = excludes.listIterator(); while (i.hasNext()) { i.set(locationToUrl(i.next())); } } else { sourceLocation = locationToUrl(sourceLocation); } resources = scan(getSourceLocation(), includes, excludes); progress = new ProgressMeter(resources.size()); // Get the iterator that will be used to actually traverse the FileSet. resourceIterator = resources.iterator(); getLogger().info("Found [" + resources.size() + "] resources to be read"); } catch (IOException e) { throw new ResourceInitializationException(e); } } protected List<String> getDefaultExcludes() { List<String> excludes = new ArrayList<String>(); // These should be the same as documented here: http://ant.apache.org/manual/dirtasks.html if (useDefaultExcludes) { excludes.add("**/*~"); excludes.add("**/#*#"); excludes.add("**/.#*"); excludes.add("**/%*%"); excludes.add("**/._*"); excludes.add("**/CVS"); excludes.add("**/CVS/**"); excludes.add("**/.cvsignore"); excludes.add("**/SCCS"); excludes.add("**/SCCS/**"); excludes.add("**/vssver.scc"); excludes.add("**/.svn"); excludes.add("**/.svn/**"); excludes.add("**/.DS_Store"); excludes.add("**/.git"); excludes.add("**/.git/**"); excludes.add("**/.gitattributes"); excludes.add("**/.gitignore"); excludes.add("**/.gitmodules"); excludes.add("**/.hg"); excludes.add("**/.hg/**"); excludes.add("**/.hgignore"); excludes.add("**/.hgsub"); excludes.add("**/.hgsubstate"); excludes.add("**/.hgtags"); excludes.add("**/.bzr"); excludes.add("**/.bzr/**"); excludes.add("**/.bzrignore"); } return excludes; } /** * Make sure the given location is an URL. E.g. adds "file:" if necessary. * * @param aLocation * the location. * @return an URL. * @throws MalformedURLException * if the location cannot be converted to a valid URL. */ protected String locationToUrl(String aLocation) throws MalformedURLException { String location = aLocation; if (isUnmarkedFileLocation(aLocation)) { location = new File(location).toURI().toURL().toString(); } else if (location.startsWith(JAR_PREFIX) && !location.contains("!")) { // If we write something like "jar:file:/my/archive.zip", append the required "!" location += "!"; } return location; } /** * Checks if a location refers to a local file but does not start with "file:" * * @param aLocation * the location. * @return if "file:" needs to be added to make the location explicit. */ private boolean isUnmarkedFileLocation(String aLocation) { // On Windows systems, an absolute path contains a colon at offset 1. If the offset is // 2 or greater, the colon likely is a scheme separator, not a drive letter separator. return aLocation.indexOf(':') < 2; } protected Collection<Resource> getResources() { return resources; } protected Iterator<Resource> getResourceIterator() { return resourceIterator; } protected Resource nextFile() { try { Resource res = resourceIterator.next(); progress.setDone(completed); if (logFreq > 0 && completed % logFreq == 0) { getLogger().info(String.format("%s: %s", progress, res.location)); } return res; } finally { completed++; } } protected String getSourceLocation() { return sourceLocation; } protected boolean isSingleLocation() { return patterns == null; } /** * Get the base location used by the reader. This location always ends in a / if it is set at * all. If there is no base, an empty string is returned. * * @return the base location used by the reader. */ protected String getBase() { return getBase(getSourceLocation()); } protected String getBase(String aBase) { boolean singleLocation = patterns == null; String base; if (aBase != null) { base = aBase; // If this is a real base location, then add a "/" if there is none if (!singleLocation) { if (!base.endsWith("/") && !base.endsWith(":")) { base += "/"; } } } else { base = ""; } return base; } @Override public Progress[] getProgress() { return new Progress[] { new ProgressImpl(completed, resources.size(), "file") }; } protected ResourcePatternResolver getResolver() { return resolver; } @Override public boolean hasNext() throws IOException, CollectionException { return resourceIterator.hasNext(); } protected Collection<Resource> scan(String aBase, Collection<String> aIncludes, Collection<String> aExcludes) throws IOException { boolean singleLocation = isSingleLocation(); String base = getBase(aBase); getLogger().info("Scanning [" +base + "]"); Collection<String> includes; Collection<String> excludes; if (aIncludes == null || aIncludes.size() == 0) { if (!singleLocation) { includes = Collections.singleton("**/*"); } else { includes = Collections.singleton(""); } } else { includes = aIncludes; } if (aExcludes == null || aExcludes.size() == 0) { excludes = Collections.emptySet(); } else { excludes = aExcludes; } AntPathMatcher matcher = new AntPathMatcher(); List<Resource> result = new ArrayList<Resource>(); // Collect the bases only if we need them later on. If no base is set, then getUri() will // not work because the base ("") may be resolved to one or more JAR locations and getUri() // internally expects file locations Set<String> rsBases = new HashSet<String>(); if (base.length() > 0 && !singleLocation) { // E.g. a classpath location may resolve to multiple locations. Thus we collect all the // locations to which the base resolves. org.springframework.core.io.Resource[] rBases = resolver.getResources(base); for (org.springframework.core.io.Resource rBase : rBases) { URI uri = getUri(rBase, false); if (uri != null) { rsBases.add(uri.toString()); } } } // Now we process the include patterns one after the other for (String include : includes) { // We resolve the resources for each base+include combination. org.springframework.core.io.Resource[] resourceList = resolver.getResources(base + include); nextResource: for (org.springframework.core.io.Resource resource : resourceList) { URI uResource = getUri(resource, true); if (uResource == null) { continue; } String sResource = uResource.toString(); // Determine the resolved base for this location String matchBase = null; if (base.length() > 0 && !singleLocation) { for (String b : rsBases) { if (!sResource.startsWith(b)) { continue; } // This is the base... at least we define it as being the base. // FIXME there may be other bases. Have to define a policy if most or least // specific base should be chosen. matchBase = b; break; } if (matchBase == null) { // This should not happen... throw new IllegalStateException("No base found for location [" + sResource + "]"); } } else { // If no base is set, no need to go through the trouble of finding one. matchBase = base; } // To figure out if the resolved location is excluded, we try to find the part // of the location that was determined by the include pattern by substracting the // resolved base locations one after the other and looking if the result is // matched by the exclude. if (excludes != null) { for (String exclude : excludes) { String rest = sResource.substring(matchBase.length()); if (matcher.match(exclude, rest)) { if (getLogger().isDebugEnabled()) { getLogger().debug("Excluded: " + sResource); } continue nextResource; } } } // If the resource was not excluded, we add it to the results. String p = sResource.substring(matchBase.length()); String loc = base + p; if (isSingleLocation()) { // If it was a single location, then use the parent folder as base p = StringUtils.substringAfterLast(matchBase, "/"); matchBase = StringUtils.substringBeforeLast(matchBase, "/") + '/'; } Resource r = new Resource(loc, base, resource.getURI(), matchBase, p, resource); result.add(r); } } Collections.sort(result, new Comparator<Resource>() { @Override public int compare(Resource aO1, Resource aO2) { return aO1.location.compareTo(aO2.location); } }); if (singleLocation && result.isEmpty()) { throw new FileNotFoundException( "Resource not found or not a file: [" + aBase + "]. Please specify a file or use a pattern. Directories without patterns are " + "not valid."); } return result; } /** * Get the URI of the given resource. * * @param aResource * a resource * @param aFileOrDir * if true try to return only files, if false try to return only dirs * @return the URI of the resource * @throws IOException * if an I/O error occurs. */ private URI getUri(org.springframework.core.io.Resource aResource, boolean aFileOrDir) throws IOException { try { final File file = aResource.getFile(); // Exclude hidden files/dirs if requested if (file.isHidden() && !this.includeHidden) { return null; } // Return only dirs or files... if ((file.getPath().length() == 0) || (aFileOrDir && file.isFile()) || (!aFileOrDir && file.isDirectory())) { return aResource.getFile().toURI(); } else { return null; } } catch (final IOException e) { return aResource.getURI(); } catch (final UnsupportedOperationException e) { return aResource.getURI(); } } /** * Initialize the {@link DocumentMetaData}. This must be called before setting the document * text, otherwise the end feature of this annotation will not be set correctly. * * @param aCas * the CAS. * @param aResource * the resource from which the CAS is initialized. */ protected void initCas(CAS aCas, Resource aResource) { initCas(aCas, aResource, null); } /** * Initialize the {@link DocumentMetaData}. This must be called before setting the document * text, otherwise the end feature of this annotation will not be set correctly. * * @param aCas * the CAS. * @param aResource * the resource from which the CAS is initialized. * @param aQualifier * a qualifier if multiple CASes are generated from the same file. */ protected void initCas(CAS aCas, Resource aResource, String aQualifier) { String qualifier = aQualifier != null ? "#" + aQualifier : ""; try { // Set the document metadata DocumentMetaData docMetaData = DocumentMetaData.create(aCas); docMetaData.setDocumentTitle(new File(aResource.getPath()).getName()); docMetaData.setDocumentUri(aResource.getResolvedUri().toString() + qualifier); docMetaData.setDocumentId(aResource.getPath() + qualifier); if (aResource.getBase() != null) { docMetaData.setDocumentBaseUri(aResource.getResolvedBase()); docMetaData.setCollectionId(aResource.getResolvedBase()); } // Set the document language aCas.setDocumentLanguage(language); } catch (CASException e) { // This should not happen. throw new RuntimeException(e); } } public String getLanguage() { return language; } /** */ public static class Resource { private final String location; private final String base; private final URI resolvedUri; private final String resolvedBase; private final String path; private final org.springframework.core.io.Resource resource; public Resource(String aLocation, String aBase, URI aResolvedUri, String aResolvedBaseUri, String aPath, org.springframework.core.io.Resource aResource) { super(); location = aLocation; base = aBase; resolvedUri = aResolvedUri; resolvedBase = aResolvedBaseUri; path = aPath; resource = aResource; } public String getLocation() { return location; } public String getBase() { return base; } public URI getResolvedUri() { return resolvedUri; } public String getResolvedBase() { return resolvedBase; } public String getPath() { return path; } public org.springframework.core.io.Resource getResource() { return resource; } public InputStream getInputStream() throws IOException { return resource.getInputStream(); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((resolvedUri == null) ? 0 : resolvedUri.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } Resource other = (Resource) obj; if (resolvedUri == null) { if (other.resolvedUri != null) { return false; } } else if (!resolvedUri.equals(other.resolvedUri)) { return false; } return true; } @Override public String toString() { return location; } } }