/* * Leech - crawling capabilities for Apache Tika * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.sax; import java.io.StringWriter; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; import de.dfki.km.leech.parser.CrawlerParser; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser; import de.dfki.km.leech.util.UrlUtil; /** * A ContentHandler implementation to store data with a hook/push-interface. This is for crawling datasources recursively. Implement the processData methods and process * your data as you wish.<br> * <br> * This handler deals with the data entity modification state entries inside the metadata offered from {@link IncrementalCrawlingParser} and {@link CrawlerParser} (in * case of an error). * * @author Christian Reuschling, Dipl.Ing.(BA) */ public abstract class DataSinkContentHandler extends ContentHandlerDecorator { protected int m_iWriteLimit = -1; protected Metadata m_metadata = new Metadata(); protected StringWriter m_writer; /** * Creates a new {@link DataSinkContentHandler}.<br> * CAUTION:Note that the internal metadata object has to be the same than the one given to the parser that works with this contentHandler. Use * {@link #setMetaData(Metadata)} or one of the Leech methods with the DataSinkContentHandlers. In the second case Leech will make sure that the metadata objects will * be set correctly. */ public DataSinkContentHandler() { } /** * Creates a content handler that writes XHTML body character events to an internal string buffer, and forwards it together with the metadata object to a * callback/processing method.<br> * CAUTION:Note that the internal metadata object has to be the same than the one given to the parser that works with this contentHandler. Use * {@link #setMetaData(Metadata)} or one of the Leech methods with the DataSinkContentHandlers. In the second case Leech will make sure that the metadata objects will * be set correctly. * <p> * <p> * The internal string buffer is bounded at the given number of characters. If this write limit is reached, then a {@link SAXException} is thrown. * * @param writeLimit maximum number of characters to include in the string, or -1 to disable the write limit */ public DataSinkContentHandler(int writeLimit) { m_iWriteLimit = writeLimit; } /** * Creates a content handler that writes XHTML body character events to an internal string buffer, and forwards it together with the metadata object to a * callback/processing method. * <p> * The internal string buffer is bounded at 6 * 1024 * 1024 characters. If this write limit is reached, then a {@link SAXException} is thrown. * * @param metadata the metadata object given to the parser object that works with this ContentHandler. This is to forward this reference to the processing method, so * make sure that both objects holds the same object */ public DataSinkContentHandler(Metadata metadata) { m_metadata = metadata; } /** * Creates a content handler that writes XHTML body character events to an internal string buffer, and forwards it together with the metadata object to a * callback/processing method. * <p> * <p> * The internal string buffer is bounded at the given number of characters. If this write limit is reached, then a {@link SAXException} is thrown. * * @param writeLimit maximum number of characters to include in the string, or -1 to disable the write limit * @param metadata the metadata object given to the parser object that works with this ContentHandler. This is to forward this reference to the processing method, so * make sure that both objects holds the same object */ public DataSinkContentHandler(Metadata metadata, int writeLimit) { m_iWriteLimit = writeLimit; m_metadata = metadata; } /** * This method will be invoked by the leech class at the end of the parse method. You can perform some shutdown stuff after the crawl if you implement this method. */ public abstract void crawlFinished(); @Override public void endDocument() throws SAXException { super.endDocument(); String strDataEntitiyModState = m_metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE); // wir entfernen die Dinge, die wir gar nicht drin haben wollen m_metadata.remove(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE); m_metadata.remove(CrawlerParser.CURRENT_CRAWLING_DEPTH); // und passen auf, daß nicht noch Passwörter in einer URL stehen String strBadAttName = IncrementalCrawlingHistory.dataEntityId; String[] straUrlsWithPwd = m_metadata.getValues(strBadAttName); m_metadata.remove(strBadAttName); for (String strPossiblePwdUrlString : straUrlsWithPwd) m_metadata.add(strBadAttName, UrlUtil.urlNameWithoutPassword(strPossiblePwdUrlString)); strBadAttName = Metadata.SOURCE; straUrlsWithPwd = m_metadata.getValues(strBadAttName); m_metadata.remove(strBadAttName); for (String strPossiblePwdUrlString : straUrlsWithPwd) m_metadata.add(strBadAttName, UrlUtil.urlNameWithoutPassword(strPossiblePwdUrlString)); strBadAttName = Metadata.RESOURCE_NAME_KEY; straUrlsWithPwd = m_metadata.getValues(strBadAttName); m_metadata.remove(strBadAttName); for (String strPossiblePwdUrlString : straUrlsWithPwd) m_metadata.add(strBadAttName, UrlUtil.urlNameWithoutPassword(strPossiblePwdUrlString)); if(IncrementalCrawlingParser.MODIFIED.equals(strDataEntitiyModState)) { processModifiedData(m_metadata, this.toString()); } else if(IncrementalCrawlingParser.REMOVED.equals(strDataEntitiyModState)) { // these are set because of the dummy stream m_metadata.remove(HttpHeaders.CONTENT_ENCODING); m_metadata.remove(HttpHeaders.CONTENT_TYPE); processRemovedData(m_metadata); } else if(IncrementalCrawlingParser.ERROR.equals(strDataEntitiyModState)) { processErrorData(m_metadata); } else if(IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntitiyModState)) { // these are set because of the dummy stream m_metadata.remove(HttpHeaders.CONTENT_ENCODING); m_metadata.remove(HttpHeaders.CONTENT_TYPE); processUnmodifiedData(m_metadata); } else if(IncrementalCrawlingParser.PROCESSED.equals(strDataEntitiyModState)) { // these are set because of the dummy stream m_metadata.remove(HttpHeaders.CONTENT_ENCODING); m_metadata.remove(HttpHeaders.CONTENT_TYPE); processProcessedData(m_metadata); } else processNewData(m_metadata, this.toString()); // da wir diesen handler über die rekursiven Aufrufe wiederverwenden möchten, setzen wir hier die members zurück. Das metadata-Object wird im // CrawlerParser zurückgesetzt if(m_writer != null) m_writer.getBuffer().delete(0, m_writer.getBuffer().length()); } /** * This is invoked if we have an entity that was crawled at another crawl in the past, according to the crawling history, and was not modified, according to the * dataEntityContentFingerprint. * * @param metadata some metadata (at least an identifying Id) to deal with the entity */ abstract public void processUnmodifiedData(Metadata metadata); /** * This is invoked if we have an entity that was processed in this crawl yet. This is if we have somehow a double entry, or if we have cycles, e.g. during a web * crawl, where we sometimes come back to a link we started from. * * @param metadata some metadata (at least an identifying Id) to deal with the entity */ abstract public void processProcessedData(Metadata metadata); public Metadata getMetaData() { return m_metadata; } protected void init() { m_writer = new StringWriter(); setContentHandler(new BodyContentHandler(new WriteOutContentHandler(m_writer, m_iWriteLimit))); } /** * Will be invoked in the case a data entity causes an error during indexing. * * @param metadata some metadata (at least an identifying Id) to deal with the error entity */ public abstract void processErrorData(Metadata metadata); /** * Will be invoked in the case a data entity was modified since the last crawl. * * @param metadata the metadata of the data entity * @param strFulltext the full body text of the data entity */ public abstract void processModifiedData(Metadata metadata, String strFulltext); /** * Will be invoked in the case a new data entity was found. * * @param metadata the metadata of the data entity * @param strFulltext the full body text of the data entity */ public abstract void processNewData(Metadata metadata, String strFulltext); /** * Will be invoked in the case a data entity was removed since the last crawl. * * @param metadata some metadata (at least an identifying Id) to deal with the removed entity */ public abstract void processRemovedData(Metadata metadata); public void setMetaData(Metadata metadata) { m_metadata = metadata; } @Override public void startDocument() throws SAXException { init(); super.startDocument(); } }