/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Types;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.optimaize.langdetect.DetectedLanguage;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.batch.fs.FSProperties;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.db.ColInfo;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.eval.tokens.AnalyzerManager;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.CommonTokenResult;
import org.apache.tika.eval.tokens.TokenCounter;
import org.apache.tika.eval.tokens.TokenIntPair;
import org.apache.tika.eval.tokens.TokenStatistics;
import org.apache.tika.eval.util.LanguageIDWrapper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.utils.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class AbstractProfiler extends FileResourceConsumer {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);

    private static final String[] EXTRACT_EXTENSIONS = {
            ".json",
            ".txt",
            ""
    };

    private static final String[] COMPRESSION_EXTENSIONS = {
            "",
            ".bz2",
            ".gzip",
            ".zip",
    };

    static final long NON_EXISTENT_FILE_LENGTH = -1L;

    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
            new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
    );

    public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
            new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
            new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
    );

    public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
            new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
            new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
    );
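
    //The *_ID columns in the reference tables above hold enum ordinals
    //(e.g. ExtractReaderException.TYPE in writeExtractException() and
    //EXCEPTION_TYPE in getExceptionStrings() below).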

    public static final String TRUE = Boolean.toString(true);
    public static final String FALSE = Boolean.toString(false);

    protected static final AtomicInteger ID = new AtomicInteger();

    private final static String UNKNOWN_EXTENSION = "unk";
    //make this configurable
    private final static String DIGEST_KEY = "X-TIKA:digest:MD5";

    private static CommonTokenCountManager commonTokenCountManager;
    private String lastExtractExtension = null;

    AnalyzerManager analyzerManager;
    TokenCounter tokenCounter;

    public enum EXCEPTION_TYPE {
        RUNTIME,
        ENCRYPTION,
        ACCESS_PERMISSION,
        UNSUPPORTED_VERSION,
    }

    /**
     * If information was gathered from the log file about
     * a parse error
     */
    public enum PARSE_ERROR_TYPE {
        OOM,
        TIMEOUT
    }

    public static TableInfo MIME_TABLE = new TableInfo("mimes",
            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
    );

    private final static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");

    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path

    int maxContentLength = 10000000;
    int maxContentLengthForLangId = 50000;
    int maxTokens = 200000;

    //these remove runtime info from the stacktraces so
    //that actual causes can be counted.
    private final static Pattern CAUSED_BY_SNIPPER =
            Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");

    private final static Pattern ACCESS_PERMISSION_EXCEPTION =
            Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
    private final static Pattern ENCRYPTION_EXCEPTION =
            Pattern.compile("org\\.apache\\.tika\\.exception\\.EncryptedDocumentException");

    private TikaConfig config = TikaConfig.getDefaultConfig();//TODO: allow configuration
    final LanguageIDWrapper langIder;
    protected IDBWriter writer;

    /**
     * @param p path to the common_tokens directory.  If this is null, try to load from the classpath.
     * @throws IOException
     */
    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
        commonTokenCountManager = new CommonTokenCountManager(p, defaultLangCode);
    }
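
    //Usage sketch for loadCommonTokens() above (hypothetical path, for
    //illustration only): load the common tokens once, before any profiler
    //starts consuming files, e.g.
    //    AbstractProfiler.loadCommonTokens(Paths.get("resources/common_tokens"), "en");
    //or pass null to fall back to the lists bundled on the classpath.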

    public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
                            IDBWriter writer) {
        super(fileQueue);
        this.writer = writer;
        langIder = new LanguageIDWrapper();
        initAnalyzersAndTokenCounter(maxTokens);
    }

    private void initAnalyzersAndTokenCounter(int maxTokens) {
        try {
            analyzerManager = AnalyzerManager.newInstance(maxTokens);
            tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Truncate the content string to this length if it is longer.
     *
     * @param maxContentLength
     */
    public void setMaxContentLength(int maxContentLength) {
        this.maxContentLength = maxContentLength;
    }

    /**
     * Truncate the content string to this length for language id if it is longer.
     *
     * @param maxContentLengthForLangId
     */
    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
        this.maxContentLengthForLangId = maxContentLengthForLangId;
    }

    /**
     * Add a LimitTokenCountFilterFactory if maxTokens is &gt; -1.
     *
     * @param maxTokens
     */
    public void setMaxTokens(int maxTokens) {
        this.maxTokens = maxTokens;
        initAnalyzersAndTokenCounter(maxTokens);
    }

    protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
                                         String filePath, ExtractReaderException.TYPE type) throws IOException {
        Map<Cols, String> data = new HashMap<>();
        data.put(Cols.CONTAINER_ID, containerId);
        data.put(Cols.FILE_PATH, filePath);
        data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
        writer.writeRow(extractExceptionTable, data);
    }

    protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
                                    String fileId, String containerId,
                                    List<Integer> numAttachments, TableInfo profileTable) {
        Map<Cols, String> data = new HashMap<>();
        data.put(Cols.ID, fileId);
        data.put(Cols.CONTAINER_ID, containerId);
        data.put(Cols.MD5, m.get(DIGEST_KEY));

        if (i < numAttachments.size()) {
            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
        }
        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));

        Integer nPages = m.getInt(PagedText.N_PAGES);
        if (nPages != null) {
            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
        }

        //if this is the outer (container) document
        if (i == 0) {
            data.put(Cols.IS_EMBEDDED, FALSE);
            data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
        } else {
            data.put(Cols.IS_EMBEDDED, TRUE);
            data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
        }
        String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
        ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
        data.put(Cols.FILE_EXTENSION, ext);

        long srcFileLen = getSourceFileLength(m);
        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
            data.put(Cols.LENGTH, Long.toString(srcFileLen));
        } else {
            data.put(Cols.LENGTH, "");
        }

        String content = getContent(m);
        if (content == null || content.trim().length() == 0) {
            data.put(Cols.HAS_CONTENT, FALSE);
        } else {
            data.put(Cols.HAS_CONTENT, TRUE);
        }
        getFileTypes(m, data);
        try {
            writer.writeRow(profileTable, data);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
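
    //Illustrative (hypothetical) row written by writeProfileData() above for
    //an embedded attachment (i > 0):
    //    ID=12, CONTAINER_ID=7, IS_EMBEDDED=true, FILE_NAME="image1.png",
    //    FILE_EXTENSION="png", NUM_METADATA_VALUES=14, HAS_CONTENT=false, ...
    //The container document itself is written with i == 0 and IS_EMBEDDED=false.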
"" : ext.toLowerCase(Locale.US); data.put(Cols.FILE_EXTENSION, ext); long srcFileLen = getSourceFileLength(m); if (srcFileLen > NON_EXISTENT_FILE_LENGTH) { data.put(Cols.LENGTH, Long.toString(srcFileLen)); } else { data.put(Cols.LENGTH, ""); } int numMetadataValues = countMetadataValues(m); data.put(Cols.NUM_METADATA_VALUES, Integer.toString(numMetadataValues)); data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m)); String content = getContent(m); if (content == null || content.trim().length() == 0) { data.put(Cols.HAS_CONTENT, FALSE); } else { data.put(Cols.HAS_CONTENT, TRUE); } getFileTypes(m, data); try { writer.writeRow(profileTable, data); } catch (IOException e) { throw new RuntimeException(e); } } private static String getFileName(String path) { if (path == null) { return ""; } //filenameUtils checks for a null byte in the path. //it will throw an IllegalArgumentException if there is a null byte. //given that we're recording names and not using them on a file path //we should ignore this. try { return FilenameUtils.getName(path); } catch (IllegalArgumentException e) { LOG.warn("{} in {}", e.getMessage(), path); } path = path.replaceAll("\u0000", " "); try { return FilenameUtils.getName(path); } catch (IllegalArgumentException e) { LOG.warn("Again: {} in {}", e.getMessage(), path); } //give up return ""; } protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) { Map<Cols, String> data = new HashMap<>(); getExceptionStrings(m, data); if (data.keySet().size() > 0) { try { data.put(Cols.ID, fileId); writer.writeRow(exceptionTable, data); } catch (IOException e) { throw new RuntimeException(e); } } } /** * Checks to see if metadata is null or content is empty (null or only whitespace). * If any of these, then this does no processing, and the fileId is not * entered into the content table. * * @param fileId * @param m * @param fieldName * @param contentsTable */ protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException { if (m == null) { return; } Map<Cols, String> data = new HashMap<>(); String content = getContent(m, maxContentLength, data); if (content == null || content.trim().length() == 0) { return; } tokenCounter.clear(fieldName); tokenCounter.add(fieldName, content); data.put(Cols.ID, fileId); data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length())); langid(m, data); String langid = data.get(Cols.LANG_ID_1); langid = (langid == null) ? 
"" : langid; writeTokenCounts(data, fieldName, tokenCounter); CommonTokenResult commonTokenResult = null; try { commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName)); } catch (IOException e) { LOG.error("{}", e.getMessage(), e); } data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode()); data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens())); TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName); data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens())); data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens())); data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens())); data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy())); SummaryStatistics summStats = tokenStatistics.getSummaryStatistics(); data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum())); data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean())); data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation())); unicodeBlocks(m, data); try { writer.writeRow(contentsTable, data); } catch (IOException e) { throw new RuntimeException(e); } } String getTime(Metadata m) { String elapsed = "-1"; String v = m.get(RecursiveParserWrapper.PARSE_TIME_MILLIS); if (v != null) { return v; } return elapsed; } int countMetadataValues(Metadata m) { if (m == null) { return 0; } int i = 0; for (String n : m.names()) { i += m.getValues(n).length; } return i; } void getExceptionStrings(Metadata metadata, Map<Cols, String> data) { String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"); if (fullTrace == null) { fullTrace = metadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION); } if (fullTrace != null) { //check for "expected" exceptions...exceptions //that can't be fixed. //Do not store trace for "expected" exceptions Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace); if (matcher.find()) { data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal())); return; } matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace); if (matcher.find()) { data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal())); return; } data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal())); data.put(Cols.ORIG_STACK_TRACE, fullTrace); //TikaExceptions can have object ids, as in the "@2b1ea6ee" in: //org.apache.tika.exception.TikaException: TIKA-198: Illegal //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee //For reporting purposes, let's snip off the object id so that we can more //easily count exceptions. 

    /**
     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN}
     * whether the string was truncated.
     *
     * @param metadata
     * @param maxLength
     * @param data
     * @return
     */
    protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
        String c = getContent(metadata);
        if (maxLength > -1 && c.length() > maxLength) {
            c = c.substring(0, maxLength);
            data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
        }
        return c;
    }

    protected static String getContent(Metadata metadata) {
        if (metadata == null) {
            return "";
        }
        String c = metadata.get(RecursiveParserWrapper.TIKA_CONTENT);
        if (c == null) {
            return "";
        }
        return c;
    }

    void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
        String content = getContent(metadata);
        if (content.length() < 200) {
            return;
        }
        String s = content;
        if (content.length() > maxContentLengthForLangId) {
            s = content.substring(0, maxContentLengthForLangId);
        }
        Map<String, Integer> m = new HashMap<>();
        Reader r = new StringReader(s);
        try {
            int c = r.read();
            while (c != -1) {
                Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
                String blockString = (block == null) ? "NULL" : block.toString();
                Integer i = m.get(blockString);
                if (i == null) {
                    i = 0;
                }
                i++;
                m.put(blockString, i);
                c = r.read();
            }
        } catch (IOException e) {
            LOG.warn("IOException", e);
        }

        List<Pair<String, Integer>> pairs = new ArrayList<>();
        for (Map.Entry<String, Integer> e : m.entrySet()) {
            pairs.add(Pair.of(e.getKey(), e.getValue()));
        }
        Collections.sort(pairs, new Comparator<Pair<String, Integer>>() {
            @Override
            public int compare(Pair<String, Integer> o1, Pair<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 20 && i < pairs.size(); i++) {
            if (i > 0) {
                sb.append(" | ");
            }
            sb.append(pairs.get(i).getKey()).append(": ").append(pairs.get(i).getValue());
        }
        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
    }

    void langid(Metadata metadata, Map<Cols, String> data) {
        String content = getContent(metadata);
        if (content.length() < 50) {
            return;
        }
        String s = content;
        if (content.length() > maxContentLengthForLangId) {
            s = content.substring(0, maxContentLengthForLangId);
        }
        List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
        if (probabilities.size() > 0) {
            data.put(Cols.LANG_ID_1, getLangString(probabilities.get(0)));
            data.put(Cols.LANG_ID_PROB_1,
                    Double.toString(probabilities.get(0).getProbability()));
        }
        if (probabilities.size() > 1) {
            data.put(Cols.LANG_ID_2, getLangString(probabilities.get(1)));
            data.put(Cols.LANG_ID_PROB_2,
                    Double.toString(probabilities.get(1).getProbability()));
        }
    }

    private String getLangString(DetectedLanguage detectedLanguage) {
        //so that we have a mapping between lang id and common-tokens file names
        String lang = detectedLanguage.getLocale().getLanguage();
        if ("zh".equals(lang)) {
            if (detectedLanguage.getLocale().getRegion().isPresent()) {
                lang += "-" + detectedLanguage.getLocale().getRegion().get().toLowerCase(Locale.US);
            } else {
                //hope for the best
                lang += "-cn";
            }
        }
        return lang;
    }

    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
        if (metadata == null) {
            return;
        }
        String type = metadata.get(Metadata.CONTENT_TYPE);
        if (type == null) {
            return;
        }
        int mimeId = writer.getMimeId(type);
        output.put(Cols.MIME_ID, Integer.toString(mimeId));
    }
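
    //Note: mime ids in getFileTypes() above are assigned by the IDBWriter
    //implementation; the mimes table (MIME_TABLE, defined near the top of
    //this class) stores the id-to-content-type mapping.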

    void writeTokenCounts(Map<Cols, String> data, String field, TokenCounter tokenCounter) {
        int i = 0;
        StringBuilder sb = new StringBuilder();
        TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(field);
        for (TokenIntPair t : tokenStatistics.getTopN()) {
            if (i++ > 0) {
                sb.append(" | ");
            }
            sb.append(t.getToken()).append(": ").append(t.getValue());
        }
        data.put(Cols.TOP_N_TOKENS, sb.toString());
    }

    public void closeWriter() throws IOException {
        writer.close();
    }

    /**
     * @param metadata
     * @param extracts
     * @return the EvalFilePaths for a file when crawling an extract directory
     */
    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
                                                     Path extracts) {
        String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
        Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
        Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
        //just try slapping the relextractfilepath on the extractdir
        Path extractFile = extracts.resolve(relExtractFilePath);
        if (!Files.isRegularFile(extractFile)) {
            //if that doesn't work, try to find the right extract file.
            //This is necessary if crawling extractsA and trying to find a file in
            //extractsB that is not in the same format: json vs txt or compressed
            extractFile = findFile(extracts, relativeSourceFilePath);
        }
        return new EvalFilePaths(relativeSourceFilePath, extractFile);
    }

    //call this if the crawler is crawling through the src directory
    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
                                                 Path extracts) {
        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
        Path extractFile = findFile(extracts, relativeSourceFilePath);
        Path inputFile = srcDir.resolve(relativeSourceFilePath);
        long srcLen = NON_EXISTENT_FILE_LENGTH;
        //try to get the length of the source file in case there was an error
        //in both extracts
        try {
            srcLen = Files.size(inputFile);
        } catch (IOException e) {
            LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
        }
        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
    }

    /**
     * @param extractRootDir
     * @param relativeSourceFilePath
     * @return the extract file, or null if one couldn't be found
     */
    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
        String relSrcFilePathString = relativeSourceFilePath.toString();
        if (lastExtractExtension != null) {
            Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
            if (Files.isRegularFile(candidate)) {
                return candidate;
            }
        }
        for (String ext : EXTRACT_EXTENSIONS) {
            for (String compress : COMPRESSION_EXTENSIONS) {
                Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
                if (Files.isRegularFile(candidate)) {
                    lastExtractExtension = ext + compress;
                    return candidate;
                }
            }
        }
        return null;
    }
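
    //Example lookup order (illustrative) in findFile() above for the source
    //file "docs/report.pdf":
    //    report.pdf.json, report.pdf.json.bz2, report.pdf.json.gzip, report.pdf.json.zip,
    //    report.pdf.txt, report.pdf.txt.bz2, ..., report.pdf, report.pdf.bz2, ...
    //all resolved against extractRootDir/docs/; the most recently successful
    //suffix is cached in lastExtractExtension and tried first on later calls.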

    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
        if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
            return fps.getSourceFileLength();
        }
        return getSourceFileLength(metadataList);
    }

    long getSourceFileLength(List<Metadata> metadataList) {
        if (metadataList == null || metadataList.size() < 1) {
            return NON_EXISTENT_FILE_LENGTH;
        }
        return getSourceFileLength(metadataList.get(0));
    }

    long getSourceFileLength(Metadata m) {
        String lenString = m.get(Metadata.CONTENT_LENGTH);
        if (lenString == null) {
            return NON_EXISTENT_FILE_LENGTH;
        }
        try {
            return Long.parseLong(lenString);
        } catch (NumberFormatException e) {
            //swallow
        }
        return NON_EXISTENT_FILE_LENGTH;
    }

    protected long getFileLength(Path p) {
        if (p != null && Files.isRegularFile(p)) {
            try {
                return Files.size(p);
            } catch (IOException e) {
                //swallow
            }
        }
        return NON_EXISTENT_FILE_LENGTH;
    }

    /**
     * @param list
     * @return an empty list if the input list is empty or null
     */
    static List<Integer> countAttachments(List<Metadata> list) {
        List<Integer> ret = new ArrayList<>();
        if (list == null || list.size() == 0) {
            return ret;
        }
        //container document attachment count = list.size()-1
        ret.add(list.size() - 1);

        Map<String, Integer> counts = new HashMap<>();
        for (int i = 1; i < list.size(); i++) {
            String path = list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
            if (path == null) {
                //shouldn't ever happen
                continue;
            }
            String[] parts = path.split("/");
            StringBuilder parent = new StringBuilder();
            for (int end = 1; end < parts.length - 1; end++) {
                parent.setLength(0);
                join("/", parent, parts, 1, end);
                String parentPath = parent.toString();
                Integer count = counts.get(parentPath);
                if (count == null) {
                    count = 1;
                } else {
                    count++;
                }
                counts.put(parentPath, count);
            }
        }

        for (int i = 1; i < list.size(); i++) {
            Integer count = counts.get(list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
            if (count == null) {
                count = 0;
            }
            ret.add(i, count);
        }
        return ret;
    }

    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
        for (int i = start; i <= end; i++) {
            sb.append(delimiter);
            sb.append(parts[i]);
        }
    }
}