/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.parser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import java.util.logging.Logger;
import javax.mail.URLName;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.processes.StopWatch;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.HtmlCrawlerContext;
import de.dfki.km.leech.io.URLStreamProvider;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory.Exist;
import de.dfki.km.leech.util.UrlUtil;
/**
 * A {@link CrawlerParser} implementation that can crawl html files. The content of the html file is simply delegated to {@link HtmlParser}, then all links are
 * extracted with {@link LinkContentHandler} and recursively processed again with Leech. Configure it by specifying a {@link CrawlerContext} and a
 * {@link HtmlCrawlerContext} object inside the {@link ParseContext} object for the crawl.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class HtmlCrawlerParser extends CrawlerParser
{
    private static final long serialVersionUID = -8214006342702249257L;

    /** The media types this parser reports from {@link #getSupportedTypes(ParseContext)}. */
    protected static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.text("html"),
            MediaType.application("xhtml+xml"), MediaType.application("vnd.wap.xhtml+xml"), MediaType.application("x-asp"))));

    /** Leech instance whose parser handles the linked documents; created lazily on the first sub entity that has to be processed. */
    protected Leech m_leech;

    /** The plain Tika html parser used for content extraction and for link extraction. */
    protected HtmlParser m_tikaHtmlParser = new HtmlParser();



    /**
     * Checks whether this URL is inside the configured constraints (the 'remote links from local files' switch and the URL filter of the crawler context) or not.
     *
     * @param strContainerURL the url of the current container, used to decide whether the container is a local file (starts with "file:"). A null value is treated
     *            as 'not a local file'
     * @param strURL2Check the URL to check whether it is in the configured constraints. Must not be null if crawlerContext is given
     * @param crawlerContext the context object with the general constraints. If null, no constraints exist and every URL is accepted
     * @param htmlCrawlerContext the context object specific for the html parser. If null, a default {@link HtmlCrawlerContext} is consulted instead
     *
     * @return true in the case the URL is inside the constraints, false otherwise
     */
    protected boolean checkIfInConstraints(String strContainerURL, String strURL2Check, CrawlerContext crawlerContext, HtmlCrawlerContext htmlCrawlerContext)
    {
        if(crawlerContext == null) return true;

        // is the container a local file while the link points somewhere else?
        boolean bLocalContainer = strContainerURL != null && strContainerURL.startsWith("file:");
        if(bLocalContainer && !strURL2Check.startsWith("file:"))
        {
            // same fallback as in getSubDataEntitiesInformation(..): without an explicit html context the defaults apply
            HtmlCrawlerContext htmlContext2use = htmlCrawlerContext != null ? htmlCrawlerContext : new HtmlCrawlerContext();

            if(!htmlContext2use.getFollowRemoteLinksIfLocalFileCrawl())
            {
                if(crawlerContext.getVerbose())
                    Logger.getLogger(CrawlerParser.class.getName()).info(
                            "URL " + strURL2Check + " is a remote link and thus will not followed while crawling a local html file (as configured). Skipping.");

                return false;
            }
        }

        if(!crawlerContext.getURLFilter().accept(strURL2Check))
        {
            if(crawlerContext.getVerbose())
                Logger.getLogger(CrawlerParser.class.getName()).info("URL " + strURL2Check + " is outside the URL constraints for this data source. Skipping.");

            return false;
        }

        return true;
    }



    /**
     * Parses the given html stream with the Tika html parser, collects all links found by a {@link LinkContentHandler} and returns one information entry per
     * distinct link that lies inside the configured constraints. Each entry carries the normalized URL as String under {@link CrawlerParser#SOURCEID} and as
     * {@link URLName} under the key "url".
     *
     * @param stream the html content of the current container
     * @param handler not used by this implementation
     * @param metadata the metadata of the current container; {@link Metadata#SOURCE} is read as the container URL
     * @param context the parse context, queried for {@link CrawlerContext} and {@link HtmlCrawlerContext}
     *
     * @return an iterator over the information entries of all links that should be followed
     *
     * @throws Exception if parsing the html stream fails. Problems with single links do not abort the method, such links are skipped
     */
    @Override
    protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream stream, ContentHandler handler, Metadata metadata,
            ParseContext context) throws Exception
    {
        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        HtmlCrawlerContext htmlCrawlerContext = context.get(HtmlCrawlerContext.class, new HtmlCrawlerContext());
        String strContainerURL = metadata.get(Metadata.SOURCE);

        // the links are read with Tika's own link handler, so we benefit from improvements made there
        LinkContentHandler linkContentHandler = new LinkContentHandler();
        m_tikaHtmlParser.parse(stream, linkContentHandler, metadata, context);

        // a set, so that every link target is followed only once per page
        HashSet<URLName> hsLinkzAndI = new HashSet<URLName>();
        for (Link link : linkContentHandler.getLinks())
        {
            String strLinkUri = link.getUri();
            if(StringUtils.nullOrWhitespace(strLinkUri)) continue;

            try
            {
                // new URL(String) rejects everything that is not an absolute URL with a known protocol - such links end up in the catch block
                String strExternalForm = new URL(strLinkUri).toExternalForm();
                strExternalForm = UrlUtil.normalizeURL(new URLName(strExternalForm)).toString();

                // we only follow the links that are inside the user constraints
                if(checkIfInConstraints(strContainerURL, strExternalForm, crawlerContext, htmlCrawlerContext)) hsLinkzAndI.add(new URLName(strExternalForm));
            }
            catch (Exception e)
            {
                // best effort: a link that can not be processed is skipped, but no longer without any trace
                Logger.getLogger(CrawlerParser.class.getName()).fine("Link '" + strLinkUri + "' could not be processed and will be ignored: " + e);
            }
        }

        LinkedList<MultiValueHashMap<String, Object>> llDataEntityInfos = new LinkedList<MultiValueHashMap<String, Object>>();
        for (URLName url4link : hsLinkzAndI)
        {
            // NOTE(review): the URLs were already normalized above; this second call is kept as it was - presumably it is a no-op, confirm with UrlUtil
            URLName normalizedUrl = UrlUtil.normalizeURL(url4link);

            MultiValueHashMap<String, Object> entityInfo = new MultiValueHashMap<String, Object>();
            entityInfo.add(CrawlerParser.SOURCEID, normalizedUrl.toString());
            entityInfo.add("url", normalizedUrl);

            llDataEntityInfos.add(entityInfo);
        }

        return llDataEntityInfos.iterator();
    }



    /**
     * Gets the media types this parser supports.
     *
     * @param context the parse context (not evaluated)
     *
     * @return the unmodifiable set {@link #SUPPORTED_TYPES}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context)
    {
        return SUPPORTED_TYPES;
    }



    /**
     * Parses the html document. If the metadata carries no {@link Metadata#SOURCE} entry, the document is handed to the standard Tika html parser only, which means
     * that its content is extracted but its links are not crawled. Otherwise the crawling logic of the super class is used.
     *
     * @param stream the html content
     * @param handler the handler that receives the extracted content
     * @param metadata the metadata of the document
     * @param context the parse context
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException
    {
        // If we don't have enough metadata, we fall back to the standard Tika html parser. This is the case for an embedded html file, e.g. an html file
        // inside a zip. Those are not crawled, their content is simply extracted.
        String strSource = metadata.get(Metadata.SOURCE);
        if(StringUtils.nullOrWhitespace(strSource))
        {
            m_tikaHtmlParser.parse(stream, handler, metadata, context);

            return;
        }

        super.parse(stream, handler, metadata, context);
    }



    /**
     * Extracts the content of the current page with the Tika html parser, unless the page is marked as {@link IncrementalCrawlingParser#UNMODIFIED} in the
     * metadata. In that case nothing is done.
     *
     * @param stream the html content of the current page
     * @param metadata the metadata of the current page; {@link IncrementalCrawlingParser#DATA_ENTITY_MODIFICATION_STATE} is read
     * @param handler the handler that receives the extracted content
     * @param context the parse context
     */
    @Override
    protected void processCurrentDataEntity(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws Exception
    {
        // the content of the current page is only processed in the case it has changed (is not unmodified)
        String strDataEntityModState = metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);
        if(IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntityModState)) return;

        m_tikaHtmlParser.parse(stream, handler, metadata, context);
    }



    /**
     * Processes a single link of the current page: opens the stream behind the link and parses it with the Leech parser, which makes the crawl recursive. Links the
     * crawling history reports as already processed are not fetched again; they are only marked as {@link IncrementalCrawlingParser#PROCESSED} and passed to
     * {@link EmptyParser}.
     *
     * @param subDataEntityInformation the information entry as created by
     *            {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)}; the {@link URLName} under the key "url" is read
     * @param metadata the metadata object to use for the linked document
     * @param handler2use4recursiveCall the handler that receives the content of the linked document
     * @param context the parse context, queried for {@link CrawlerContext}
     */
    @Override
    protected void processSubDataEntity(MultiValueHashMap<String, Object> subDataEntityInformation, Metadata metadata, ContentHandler handler2use4recursiveCall,
            ParseContext context) throws Exception
    {
        URLName url = (URLName) subDataEntityInformation.getFirst("url");

        // Performance: if we already processed this URL during this crawl (judged by the URL before any redirect check), we skip here. Checking redirects takes
        // time.
        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        if(crawlerContext != null)
        {
            IncrementalCrawlingHistory crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
            if(crawlingHistory != null)
            {
                Exist exist = crawlingHistory.exists(url.toString());
                // constant first, so that a null answer from the history can not raise a NullPointerException
                if(Exist.YES_PROCESSED.equals(exist))
                {
                    metadata.set(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE, IncrementalCrawlingParser.PROCESSED);

                    // placeholder content only - presumably the EmptyParser never reads it
                    InputStream dummyStream = new ByteArrayInputStream("leech sucks - hopefully :)".getBytes("UTF-8"));
                    EmptyParser.INSTANCE.parse(dummyStream, handler2use4recursiveCall, metadata, context);

                    return;
                }
            }
        }

        // one provider lookup serves both the metadata and the stream
        URLStreamProvider streamProvider = URLStreamProvider.getURLStreamProvider(url);
        metadata = streamProvider.addFirstMetadata(url, metadata, context);
        InputStream stream = streamProvider.getStream(url, metadata, context);

        try
        {
            if(m_leech == null) m_leech = new Leech();

            Parser parser = m_leech.getParser();
            parser.parse(stream, handler2use4recursiveCall, metadata, context);
        }
        finally
        {
            if(stream != null) stream.close();
        }
    }
}