/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.sax;
import java.util.logging.Logger;
import org.apache.tika.metadata.Metadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
/**
* A DataSinkContentHandler that simply prints out the data she recieves. You can also optionally specify another contentHandler that should be wrapped - in this case the
* data will be printed out and then everything will be delegated to the wrapped contentHandler
*
* @author Christian Reuschling, Dipl.Ing.(BA)
*/
public class PrintlnContentHandler extends DataSinkContentHandlerDecorator
{
static public enum Verbosity {
all, fulltext, metadata, nothing, title, titlePlusFulltext, titlePlusMetadata
}
protected boolean m_showOnlyErrors = false;
protected Verbosity m_verbosity = Verbosity.all;
public PrintlnContentHandler()
{
super();
}
public PrintlnContentHandler(DataSinkContentHandler wrappedDataSinkContentHandler)
{
super();
m_wrappedDataSinkContentHandler = wrappedDataSinkContentHandler;
}
public PrintlnContentHandler(Metadata metadata)
{
super(metadata);
}
public PrintlnContentHandler(Metadata metadata, DataSinkContentHandler wrappedDataSinkContentHandler)
{
super(metadata);
m_wrappedDataSinkContentHandler = wrappedDataSinkContentHandler;
}
public PrintlnContentHandler(Metadata metadata, Verbosity granularity)
{
super(metadata);
m_verbosity = granularity;
}
public PrintlnContentHandler(Metadata metadata, Verbosity granularity, DataSinkContentHandler wrappedDataSinkContentHandler)
{
super(metadata);
m_verbosity = granularity;
m_wrappedDataSinkContentHandler = wrappedDataSinkContentHandler;
}
public PrintlnContentHandler(Verbosity granularity)
{
super();
m_verbosity = granularity;
}
public PrintlnContentHandler(Verbosity granularity, DataSinkContentHandler wrappedDataSinkContentHandler)
{
super();
m_verbosity = granularity;
m_wrappedDataSinkContentHandler = wrappedDataSinkContentHandler;
}
public Verbosity getVerbosity()
{
return m_verbosity;
}
public boolean isShowOnlyErrors()
{
return m_showOnlyErrors;
}
@Override
public void processErrorData(Metadata metadata)
{
StringBuilder strbMessage = new StringBuilder();
if(m_verbosity != Verbosity.nothing) strbMessage.append("## PrintlnContentHandler ERROR data ##########################\n");
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.title || m_verbosity == Verbosity.titlePlusMetadata || m_verbosity == Verbosity.titlePlusFulltext)
{
String strInfo = metadata.get(IncrementalCrawlingHistory.dataEntityId);
if(strInfo == null) strInfo = metadata.get(Metadata.SOURCE);
if(strInfo == null) strInfo = metadata.get(Metadata.RESOURCE_NAME_KEY);
strbMessage.append(strInfo).append("\n");
}
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.metadata || m_verbosity == Verbosity.titlePlusMetadata)
{
// errorMessage
// errorStacktrace
strbMessage.append("## metadata:\n");
for (String strFieldName : metadata.names())
{
for (String strValue : metadata.getValues(strFieldName))
strbMessage.append(strFieldName + ": '" + strValue + "'\n");
}
}
if(m_verbosity != Verbosity.nothing) strbMessage.append("\n");
if(m_verbosity != Verbosity.nothing) Logger.getLogger(PrintlnContentHandler.class.getName()).info(strbMessage.toString());
if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processErrorData(metadata);
}
@Override
public void processModifiedData(Metadata metadata, String strFulltext)
{
if(!m_showOnlyErrors)
{
StringBuilder strbMessage = new StringBuilder();
if(m_verbosity != Verbosity.nothing) strbMessage.append("## PrintlnContentHandler MODIFIED data ##########################\n");
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.title || m_verbosity == Verbosity.titlePlusMetadata || m_verbosity == Verbosity.titlePlusFulltext)
{
String strInfo = metadata.get(IncrementalCrawlingHistory.dataEntityId);
if(strInfo == null) strInfo = metadata.get(Metadata.SOURCE);
if(strInfo == null) strInfo = metadata.get(Metadata.RESOURCE_NAME_KEY);
strbMessage.append(strInfo).append("\n");
}
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.metadata || m_verbosity == Verbosity.titlePlusMetadata)
{
strbMessage.append("## metadata:\n");
for (String strFieldName : metadata.names())
{
for (String strValue : metadata.getValues(strFieldName))
strbMessage.append(strFieldName + ": '" + strValue + "'\n");
}
}
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.fulltext || m_verbosity == Verbosity.titlePlusFulltext)
{
strFulltext = strFulltext.replaceAll("[\\n\\s]+", " ");
strFulltext = strFulltext.substring(0, Math.min(strFulltext.length(), 2345));
strbMessage.append("## fulltext (without newlines, reduced whitespace, fixed length): \n" + strFulltext).append("\n");
}
if(m_verbosity != Verbosity.nothing) strbMessage.append("\n");
if(m_verbosity != Verbosity.nothing) Logger.getLogger(PrintlnContentHandler.class.getName()).info(strbMessage.toString());
}
if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processModifiedData(metadata, strFulltext);
}
@Override
public void processNewData(Metadata metadata, String strFulltext)
{
if(!m_showOnlyErrors)
{
StringBuilder strbMessage = new StringBuilder();
if(m_verbosity != Verbosity.nothing) strbMessage.append("## PrintlnContentHandler - NEW data ##########################\n");
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.title || m_verbosity == Verbosity.titlePlusMetadata || m_verbosity == Verbosity.titlePlusFulltext)
{
String strInfo = metadata.get(IncrementalCrawlingHistory.dataEntityId);
if(strInfo == null) strInfo = metadata.get(Metadata.SOURCE);
if(strInfo == null) strInfo = metadata.get(Metadata.RESOURCE_NAME_KEY);
strbMessage.append(strInfo).append("\n");
}
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.metadata || m_verbosity == Verbosity.titlePlusMetadata)
{
strbMessage.append("## metadata:\n");
for (String strFieldName : metadata.names())
{
for (String strValue : metadata.getValues(strFieldName))
strbMessage.append(strFieldName + ": '" + strValue + "'\n");
}
}
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.fulltext || m_verbosity == Verbosity.titlePlusFulltext)
{
strFulltext = strFulltext.replaceAll("[\\n\\s]+", " ");
strFulltext = strFulltext.substring(0, Math.min(strFulltext.length(), 2345));
strbMessage.append("## fulltext (without newlines, reduced whitespace, fixed length): \n" + strFulltext).append("\n");
}
if(m_verbosity != Verbosity.nothing) strbMessage.append("\n");
if(m_verbosity != Verbosity.nothing) Logger.getLogger(PrintlnContentHandler.class.getName()).info(strbMessage.toString());
}
if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processNewData(metadata, strFulltext);
}
@Override
public void processRemovedData(Metadata metadata)
{
if(!m_showOnlyErrors)
{
StringBuilder strbMessage = new StringBuilder();
if(m_verbosity != Verbosity.nothing) strbMessage.append("## PrintlnContentHandler REMOVED data ##########################\n");
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.title || m_verbosity == Verbosity.titlePlusMetadata || m_verbosity == Verbosity.titlePlusFulltext)
{
String strInfo = metadata.get(IncrementalCrawlingHistory.dataEntityId);
if(strInfo == null) strInfo = metadata.get(Metadata.SOURCE);
if(strInfo == null) strInfo = metadata.get(Metadata.RESOURCE_NAME_KEY);
strbMessage.append(strInfo).append("\n");
}
if(m_verbosity == Verbosity.all || m_verbosity == Verbosity.metadata || m_verbosity == Verbosity.titlePlusMetadata)
{
strbMessage.append("## metadata:\n");
for (String strFieldName : metadata.names())
{
for (String strValue : metadata.getValues(strFieldName))
strbMessage.append(strFieldName + ": '" + strValue + "'\n");
}
}
if(m_verbosity != Verbosity.nothing) strbMessage.append("\n");
if(m_verbosity != Verbosity.nothing) Logger.getLogger(PrintlnContentHandler.class.getName()).info(strbMessage.toString());
}
if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processRemovedData(metadata);
}
public PrintlnContentHandler setVerbosity(Verbosity granularity)
{
m_verbosity = granularity;
return this;
}
public PrintlnContentHandler setShowOnlyErrors(boolean showOnlyErrors)
{
m_showOnlyErrors = showOnlyErrors;
return this;
}
}