/* * Leech - crawling capabilities for Apache Tika * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free * Software Foundation, either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.io; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.net.URL; import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Date; import java.util.Set; import javax.mail.URLName; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; public class FileURLStreamProvider extends URLStreamProvider { @Override public Metadata addFirstMetadata(URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext) throws Exception { if(metadata2fill == null) metadata2fill = new Metadata(); // wenn das Teil schon gefüllt ist, dann machen wir gar nix if(!(metadata2fill.get(Metadata.SOURCE) == null || metadata2fill.get(Metadata.MODIFIED) == null || metadata2fill.get(IncrementalCrawlingHistory.dataEntityId) == null || metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null || metadata2fill .get(Metadata.RESOURCE_NAME_KEY) == null)) { // alle sind bereits gesetzt return metadata2fill; } // es fehlt mindestens eines, das wir hier dazupacken wollen - wir machen alles neu File file = new File(new URL(url2getMetadata.toString()).toURI()); // Für Leech metadata2fill.set(Metadata.SOURCE, file.toURI().toURL().toString()); // Optional metadata2fill.set(Metadata.MODIFIED, new SimpleDateFormat("yyyy.MM.dd HH:mm:ss:SSS").format(new Date(file.lastModified()))); // Für das inkrementelle indexieren String strEntityId; try { strEntityId = file.getCanonicalPath(); } catch (IOException e) { strEntityId = file.getAbsolutePath(); } metadata2fill.set(IncrementalCrawlingHistory.dataEntityId, strEntityId); metadata2fill.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(file.lastModified())); // Für Tika metadata2fill.set(Metadata.RESOURCE_NAME_KEY, strEntityId); return metadata2fill; } @Override public TikaInputStream getStream(URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception { URL asUrl = new URL(url2getStream.toString()); File ourFile = new File(asUrl.toURI()); if(ourFile.isDirectory()) return TikaInputStream.get(new ByteArrayInputStream("leech sucks - hopefully :)".getBytes("UTF-8"))); return TikaInputStream.get(ourFile); } @Override public Set<String> getSupportedProtocols() { return Collections.singleton("file"); } }