/* * Leech - crawling capabilities for Apache Tika * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.sax; import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.LinkedList; import java.util.Map.Entry; import java.util.logging.Logger; import org.apache.tika.metadata.Metadata; import de.dfki.inquisition.collections.CollectionUtilz; import de.dfki.inquisition.collections.MultiValueTreeMap; import de.dfki.inquisition.processes.StopWatch; import de.dfki.inquisition.text.StringUtils; /** * A ContentHandler wrapper/decorator that counts the new, modified removed and error entities during a crawl. For new, modified and error entities she also counts the * according content types as detail information.<br> * <br> * Usage:<br> * <code> * CrawlerContext crawlerContext = new CrawlerContext().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br><br> * CrawlReportContentHandler reportContentHandler = new CrawlReportContentHandler(new PrintlnContentHandler(Granularity.titlePlusFulltext));<br> * leech.parse(new File("resource/testData_short"), reportContentHandler, crawlerContext.createParseContext());<br><br> * System.out.println(reportContentHandler.getReport());<br> * </code> * * @author Christian Reuschling, Dipl.Ing.(BA) * */ public class CrawlReportContentHandler extends DataSinkContentHandlerDecorator { public static class CrawlReport { public HashMap<String, Integer> hsErrorType2EntityCount = new HashMap<String, Integer>(); public HashMap<String, Integer> hsModifiedType2EntityCount = new HashMap<String, Integer>(); public HashMap<String, Integer> hsNewType2EntityCount = new HashMap<String, Integer>(); public int iErrorEntities = 0; public int iModifiedEntities = 0; public int iNewEntities = 0; public int iProcessedEntities = 0; public int iRemovedEntities = 0; public int iUnModifiedEntities = 0; public long lastModifiedEntityProcessingTime; public long lastNewEntityProcessingTime; public long lastRemovedEntityProcessingTime; public long lfirstEntityStartTime = -1; public long lLastEntityEndTime = -1; public long lModifiedEntitiesProcessingTime = 0; public long lNewEntitiesProcessingTime = 0; public long lRemovedEntitiesProcessingTime = 0; boolean bSomeHandled = false; @Override public String toString() { StringBuilder strbReport = new StringBuilder(); strbReport.append("Report: "); if(lfirstEntityStartTime != -1) strbReport.append("First handled data entity at ").append(new SimpleDateFormat().format(new Date(lfirstEntityStartTime))).append(", "); int iEntities = iModifiedEntities + iNewEntities + iRemovedEntities + iErrorEntities; strbReport.append(StringUtils.beautifyNumber(iEntities)).append(" processed entities"); if(lfirstEntityStartTime != -1) { long lDuration = lLastEntityEndTime - lfirstEntityStartTime; strbReport.append(", duration ").append(StopWatch.formatTimeDistance(lDuration)); if(lDuration > 0) { long lMilliSecondsPerEntity = Math.round((double) lDuration / (double) iEntities); strbReport.append(", ").append(StopWatch.formatTimeDistance(lMilliSecondsPerEntity)).append("/entity"); double dEntitiesPerMilliSecond = (double) iEntities / (double) lDuration; strbReport.append(", ").append(StringUtils.beautifyNumber(Math.round(dEntitiesPerMilliSecond * 1000))).append("/s"); strbReport.append(", ").append(StringUtils.beautifyNumber(Math.round(dEntitiesPerMilliSecond * 1000 * 60))).append("/m"); strbReport.append(", ").append(StringUtils.beautifyNumber(Math.round(dEntitiesPerMilliSecond * 1000 * 60 * 60))).append("/h"); strbReport.append(", ").append(StringUtils.beautifyNumber(Math.round(dEntitiesPerMilliSecond * 1000 * 60 * 60 * 24))).append("/d"); } } strbReport.append("\n"); strbReport.append("New data entities: ").append(StringUtils.beautifyNumber(iNewEntities)); if(iNewEntities > 0) strbReport.append(" (in average ").append(StopWatch.formatTimeDistance(lNewEntitiesProcessingTime / iNewEntities)) .append(" to handle. Last entity took " + StopWatch.formatTimeDistance(lastNewEntityProcessingTime) + ")"); strbReport.append("\n"); MultiValueTreeMap<Integer, String> tmEntityCount2Type = new MultiValueTreeMap<>(Collections.reverseOrder(), LinkedList.class); for (Entry<String, Integer> newType2EntityCount : hsNewType2EntityCount.entrySet()) tmEntityCount2Type.add(newType2EntityCount.getValue(), newType2EntityCount.getKey()); StringBuilder strbTmp = new StringBuilder(); for (Entry<Integer, String> entityCount2Type : tmEntityCount2Type.entryList()) strbTmp.append(", ").append(entityCount2Type.getValue()).append(":").append(StringUtils.beautifyNumber(entityCount2Type.getKey())); strbTmp.replace(0, 1, ""); strbReport.append(strbTmp); if(strbTmp.length() > 0) strbReport.append("\n"); strbReport.append("Modified data entities: ").append(StringUtils.beautifyNumber(iModifiedEntities)); if(iModifiedEntities > 0) strbReport.append(" (in average ").append(StopWatch.formatTimeDistance(lModifiedEntitiesProcessingTime / iModifiedEntities)) .append(" to handle. Last entity took " + StopWatch.formatTimeDistance(lastModifiedEntityProcessingTime) + ")"); strbReport.append("\n"); tmEntityCount2Type = new MultiValueTreeMap<>(Collections.reverseOrder(), LinkedList.class); for (Entry<String, Integer> modifiedType2EntityCount : hsModifiedType2EntityCount.entrySet()) tmEntityCount2Type.add(modifiedType2EntityCount.getValue(), modifiedType2EntityCount.getKey()); strbTmp = new StringBuilder(); for (Entry<Integer, String> entityCount2Type : tmEntityCount2Type.entryList()) strbTmp.append(", ").append(entityCount2Type.getValue()).append(":").append(StringUtils.beautifyNumber(entityCount2Type.getKey())); strbTmp.replace(0, 1, ""); strbReport.append(strbTmp); if(strbTmp.length() > 0) strbReport.append("\n"); strbReport.append("Removed data entities: ").append(StringUtils.beautifyNumber(iRemovedEntities)); if(iRemovedEntities > 0) strbReport.append(" (in average ").append(StopWatch.formatTimeDistance(lRemovedEntitiesProcessingTime / iRemovedEntities)) .append(" to handle. Last entity took " + StopWatch.formatTimeDistance(lastRemovedEntityProcessingTime) + ")"); strbReport.append("\n"); strbReport.append("Unmodified data entities: ").append(StringUtils.beautifyNumber(iUnModifiedEntities)); strbReport.append("\n"); strbReport.append("Double data entities: ").append(StringUtils.beautifyNumber(iProcessedEntities)); strbReport.append("\n"); strbReport.append("Error data entities: ").append(StringUtils.beautifyNumber(iErrorEntities)).append("\n"); tmEntityCount2Type = new MultiValueTreeMap<>(Collections.reverseOrder(), LinkedList.class); for (Entry<String, Integer> errorType2EntityCount : hsErrorType2EntityCount.entrySet()) tmEntityCount2Type.add(errorType2EntityCount.getValue(), errorType2EntityCount.getKey()); strbTmp = new StringBuilder(); for (Entry<Integer, String> entityCount2Type : tmEntityCount2Type.entryList()) strbTmp.append(", ").append(entityCount2Type.getValue()).append(":").append(StringUtils.beautifyNumber(entityCount2Type.getKey())); strbTmp.replace(0, 1, ""); strbReport.append(strbTmp); if(strbTmp.length() > 0) strbReport.append("\n"); return strbReport.toString(); } } protected CrawlReport m_crawlReport = new CrawlReport(); protected long m_lastReportTime = -1; protected long m_lCyclicReportMilliseconds = -1; public CrawlReportContentHandler(DataSinkContentHandler wrappedDataSinkContentHandler) { m_wrappedDataSinkContentHandler = wrappedDataSinkContentHandler; } @Override public void crawlFinished() { if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.crawlFinished(); Logger.getLogger(CrawlReportContentHandler.class.getName()).info("Crawl finished:\n" + getReport().toString()); } public CrawlReport getReport() { return m_crawlReport; } @Override public void processErrorData(Metadata metadata) { if(m_crawlReport.lfirstEntityStartTime == -1 || m_crawlReport.bSomeHandled == false) { m_crawlReport.lfirstEntityStartTime = System.currentTimeMillis(); m_crawlReport.bSomeHandled = true; } m_crawlReport.iErrorEntities++; String[] strTypes = metadata.getValues("Content-Type"); if(strTypes == null || strTypes.length == 0) strTypes = CollectionUtilz.createArray("unknown"); for (String strType : strTypes) { int iIndex = strType.indexOf(";"); if(iIndex != -1) strType = strType.substring(0, iIndex); Integer iCount4Type = m_crawlReport.hsErrorType2EntityCount.get(strType); if(iCount4Type == null) iCount4Type = 0; iCount4Type++; m_crawlReport.hsErrorType2EntityCount.put(strType, iCount4Type); } if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processErrorData(metadata); m_crawlReport.lLastEntityEndTime = System.currentTimeMillis(); printReportIfItsTime(); } @Override public void processModifiedData(Metadata metadata, String strFulltext) { if(m_crawlReport.lfirstEntityStartTime == -1 || m_crawlReport.bSomeHandled == false) { m_crawlReport.lfirstEntityStartTime = System.currentTimeMillis(); m_crawlReport.bSomeHandled = true; } m_crawlReport.iModifiedEntities++; String[] strTypes = metadata.getValues("Content-Type"); if(strTypes == null || strTypes.length == 0) strTypes = CollectionUtilz.createArray("unknown"); for (String strType : strTypes) { int iIndex = strType.indexOf(";"); if(iIndex != -1) strType = strType.substring(0, iIndex); Integer iCount4Type = m_crawlReport.hsModifiedType2EntityCount.get(strType); if(iCount4Type == null) iCount4Type = 0; iCount4Type++; m_crawlReport.hsModifiedType2EntityCount.put(strType, iCount4Type); } long lStart = System.currentTimeMillis(); if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processModifiedData(metadata, strFulltext); long lDuration = System.currentTimeMillis() - lStart; m_crawlReport.lModifiedEntitiesProcessingTime += lDuration; m_crawlReport.lastModifiedEntityProcessingTime = lDuration; m_crawlReport.lLastEntityEndTime = System.currentTimeMillis(); printReportIfItsTime(); } @Override public void processNewData(Metadata metadata, String strFulltext) { if(m_crawlReport.lfirstEntityStartTime == -1 || m_crawlReport.bSomeHandled == false) { m_crawlReport.lfirstEntityStartTime = System.currentTimeMillis(); m_crawlReport.bSomeHandled = true; } m_crawlReport.iNewEntities++; String[] strTypes = metadata.getValues("Content-Type"); if(strTypes == null || strTypes.length == 0) strTypes = CollectionUtilz.createArray("unknown"); for (String strType : strTypes) { int iIndex = strType.indexOf(";"); if(iIndex != -1) strType = strType.substring(0, iIndex); Integer iCount4Type = m_crawlReport.hsNewType2EntityCount.get(strType); if(iCount4Type == null) iCount4Type = 0; iCount4Type++; m_crawlReport.hsNewType2EntityCount.put(strType, iCount4Type); } long lStart = System.currentTimeMillis(); if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processNewData(metadata, strFulltext); long lDuration = System.currentTimeMillis() - lStart; m_crawlReport.lNewEntitiesProcessingTime += lDuration; m_crawlReport.lastNewEntityProcessingTime = lDuration; m_crawlReport.lLastEntityEndTime = System.currentTimeMillis(); printReportIfItsTime(); } @Override public void processProcessedData(Metadata metadata) { m_crawlReport.iProcessedEntities++; if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processProcessedData(metadata); printReportIfItsTime(); } @Override public void processRemovedData(Metadata metadata) { if(m_crawlReport.lfirstEntityStartTime == -1 || m_crawlReport.bSomeHandled == false) { m_crawlReport.lfirstEntityStartTime = System.currentTimeMillis(); m_crawlReport.bSomeHandled = true; } m_crawlReport.iRemovedEntities++; long lStart = System.currentTimeMillis(); if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processRemovedData(metadata); long lDuration = System.currentTimeMillis() - lStart; m_crawlReport.lRemovedEntitiesProcessingTime += lDuration; m_crawlReport.lastRemovedEntityProcessingTime = lDuration; m_crawlReport.lLastEntityEndTime = System.currentTimeMillis(); printReportIfItsTime(); } @Override public void processUnmodifiedData(Metadata metadata) { m_crawlReport.iUnModifiedEntities++; if(m_wrappedDataSinkContentHandler != null) m_wrappedDataSinkContentHandler.processUnmodifiedData(metadata); printReportIfItsTime(); } /** * Sets everything counted yet to zero. Then you can use this object for another crawl */ public void reset() { m_crawlReport = new CrawlReport(); } /** * Sets whether or not a time-based cyclic report will be generated. The time is not a hard criteria, the method will print the report when new content arrives and * the last report was longer ago than everMilliseconds * * @param everyMilliseconds in the case this value is <0, cyclic report printing will be disabled. Otherwise, every <everyMilliseconds> milliseconds a report will be * println'd * * @return this */ public CrawlReportContentHandler setCyclicReportPrintln(long everyMilliseconds) { m_lCyclicReportMilliseconds = everyMilliseconds; return this; } protected void printReportIfItsTime() { if(m_lCyclicReportMilliseconds < 0) return; if(m_lastReportTime < 0) { m_lastReportTime = System.currentTimeMillis(); return; } if(System.currentTimeMillis() >= m_lastReportTime + m_lCyclicReportMilliseconds) { Logger.getLogger(CrawlReportContentHandler.class.getName()).info(m_crawlReport.toString()); m_lastReportTime = System.currentTimeMillis(); } } }