/*
Leech - crawling capabilities for Apache Tika
Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.parser;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.SynchronousQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.mail.URLName;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.DirectoryCrawlerContext;
import de.dfki.km.leech.detect.DatasourceMediaTypes;
import de.dfki.km.leech.io.URLStreamProvider;
import de.dfki.km.leech.util.OSUtils;
/**
* A CrawlerParser implementation that can crawl file system directories. Configure it by specifying a {@link CrawlerContext} and a
* {@link DirectoryCrawlerContext} object inside the {@link ParseContext} object for the crawl.
*
* @author Christian Reuschling, Dipl.Ing.(BA)
*/
public class DirectoryCrawlerParser extends CrawlerParser
{
protected static class OneAfterOneIterator implements Iterator<MultiValueHashMap<String, Object>>
{
static final MultiValueHashMap<String, Object> m_noMoreLeftMarker = new MultiValueHashMap<String, Object>();
protected MultiValueHashMap<String, Object> m_nextElement;
protected SynchronousQueue<MultiValueHashMap<String, Object>> m_synchronousQueue = new SynchronousQueue<MultiValueHashMap<String, Object>>();
public void addNextElement(MultiValueHashMap<String, Object> nextElement)
{
try
{
m_synchronousQueue.put(nextElement);
}
catch (InterruptedException e)
{
Logger.getLogger(DirectoryCrawlerParser.OneAfterOneIterator.class.getName()).log(Level.SEVERE, "Error", e);
}
}
@Override
public boolean hasNext()
{
try
{
m_nextElement = m_synchronousQueue.take();
if(m_noMoreLeftMarker == m_nextElement) return false;
return true;
}
catch (InterruptedException e)
{
Logger.getLogger(DirectoryCrawlerParser.OneAfterOneIterator.class.getName()).log(Level.SEVERE, "Error", e);
}
return false;
}
@Override
public MultiValueHashMap<String, Object> next()
{
return m_nextElement;
}
@Override
public void remove()
{
throw new UnsupportedOperationException();
}
}
private static final long serialVersionUID = 1824851369780822093L;
protected Leech m_leech;
/**
* Checks whether this file is inside the configured constraints (hidden files, symbolic links, etc) or not
*
* @param fFile2Check the file to check whether it is in the configured constraints
* @param crawlerContext the context object with the general constraints
* @param directoryCrawlerContext the context Object with the directory related constraints
*
* @return null in the case the file is outside the constraints, a canonical file object of the given file otherwise
*/
protected File checkIfInConstraints(File fFile2Check, CrawlerContext crawlerContext, DirectoryCrawlerContext directoryCrawlerContext)
{
File finalFile = fFile2Check;
try
{
// determine absolute and canonical paths
String strAbsolutePath = fFile2Check.getAbsolutePath();
String strCanonicalPath = fFile2Check.getCanonicalPath();
// optionally skip symbolic links
if(!directoryCrawlerContext.getFollowSymbolicLinks() && !strAbsolutePath.equals(strCanonicalPath))
{
if(crawlerContext.getVerbose())
Logger.getLogger(DirectoryCrawlerParser.class.getName()).info(
"File " + fFile2Check.toURI() + " is a symbolic link that should be ignored. Skipping.");
return null;
}
// create the canonical File
finalFile = new File(strCanonicalPath);
}
catch (IOException e)
{
Logger.getLogger(DirectoryCrawlerParser.class.getName()).log(Level.WARNING,
"Unable to resolve file to its canonical form, continuing with original file: " + fFile2Check, e);
}
if(!crawlerContext.getURLFilter().accept(finalFile.toURI().toString()))
{
if(crawlerContext.getVerbose())
Logger.getLogger(CrawlerParser.class.getName()).info(
"File " + finalFile.toURI() + " is outside the URL constraints for this data source. Skipping.");
return null;
}
if(directoryCrawlerContext.getIgnoreHiddenFiles() && finalFile.isHidden())
{
// in the case this file is hidden, we also ignore it silently
return null;
}
// Dont crawl into MacOSX bundles.
if(OSUtils.isMacOSXBundle(finalFile)) return null;
if(!finalFile.canRead())
{
if(crawlerContext.getVerbose())
Logger.getLogger(DirectoryCrawlerParser.class.getName()).info("Can't read file " + finalFile.toURI() + ". Skipping.");
return null;
}
return finalFile;
}
@Override
protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws Exception
{
// was crawlen wir hier eigentlich?
String strDirName = metadata.get(Metadata.SOURCE);
// und wie?
final CrawlerContext crawlerContext = context.get(CrawlerContext.class, new CrawlerContext());
final DirectoryCrawlerContext directoryCrawlerContext = context.get(DirectoryCrawlerContext.class, new DirectoryCrawlerContext());
final File fDir = new File(new URL(strDirName).toURI());
if(!fDir.isDirectory()) throw new IllegalStateException("' " + strDirName + "' is no directory");
final File fFinalDir = checkIfInConstraints(fDir, crawlerContext, directoryCrawlerContext);
if(fFinalDir == null) return new LinkedList<MultiValueHashMap<String, Object>>().iterator();
final OneAfterOneIterator oneAfterOneIterator = new OneAfterOneIterator();
// wir übernehmen das Konzept aus Aperture mit dem FileFilter. Des isch ned schlecht.Spart memory.
Thread listFilesThread = new Thread(new Runnable()
{
boolean m_bStopWasRequested = false;
@Override
public void run()
{
fFinalDir.listFiles(new FileFilter()
{
@Override
public boolean accept(File fSubFile2Check)
{
// we always return false - this saves memory
if(crawlerContext.stopRequested())
{
if(!m_bStopWasRequested) m_bStopWasRequested = true;
return false;
}
File fCheckedFile = checkIfInConstraints(fSubFile2Check, crawlerContext, directoryCrawlerContext);
if(fCheckedFile == null) return false;
MultiValueHashMap<String, Object> hsEntityInformation = new MultiValueHashMap<String, Object>();
hsEntityInformation.add("fileObject", fCheckedFile);
hsEntityInformation.add(CrawlerParser.SOURCEID, fCheckedFile.getAbsolutePath());
oneAfterOneIterator.addNextElement(hsEntityInformation);
return false;
}
});
// wir markieren das Ende mit einer eigens dafür erstellten Konstante
if(!m_bStopWasRequested && !crawlerContext.stopRequested())
oneAfterOneIterator.addNextElement(OneAfterOneIterator.m_noMoreLeftMarker);
}
}, "DirectoryCrawlserParser listFiles");
listFilesThread.start();
return oneAfterOneIterator;
}
@Override
public Set<MediaType> getSupportedTypes(ParseContext context)
{
return Collections.singleton(DatasourceMediaTypes.DIRECTORY);
}
@Override
protected void processCurrentDataEntity(InputStream stream, Metadata metadata, ContentHandler handler2use4recursiveCall, ParseContext context)
throws Exception
{
// NOP - wie don't process directories - we only process the files inside
}
@Override
protected void processSubDataEntity(MultiValueHashMap<String, Object> subDataEntityInformation, Metadata metadata,
ContentHandler handler2use4recursiveCall, ParseContext context) throws Exception
{
File fSubfile = (File) subDataEntityInformation.getFirst("fileObject");
URLName url = new URLName(fSubfile.toURI().toURL());
metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
InputStream stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
try
{
if(m_leech == null) m_leech = new Leech();
Parser parser = m_leech.getParser();
parser.parse(stream, handler2use4recursiveCall, metadata, context);
}
finally
{
if(stream != null) stream.close();
}
}
}