/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Types;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.optimaize.langdetect.DetectedLanguage;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.batch.fs.FSProperties;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.db.ColInfo;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.eval.tokens.AnalyzerManager;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.CommonTokenResult;
import org.apache.tika.eval.tokens.TokenCounter;
import org.apache.tika.eval.tokens.TokenIntPair;
import org.apache.tika.eval.tokens.TokenStatistics;
import org.apache.tika.eval.util.LanguageIDWrapper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.utils.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class AbstractProfiler extends FileResourceConsumer {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);

    private static final String[] EXTRACT_EXTENSIONS = {
            ".json",
            ".txt",
            ""
    };

    private static final String[] COMPRESSION_EXTENSIONS = {
            "",
            ".bz2",
            ".gzip",
            ".zip",
    };

    static final long NON_EXISTENT_FILE_LENGTH = -1L;

    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
            new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
    );

    public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
            new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
            new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
    );

    public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
            new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
            new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
    );
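
    //The *_ID columns in the reference tables above hold enum ordinals
    //(e.g. ExtractReaderException.TYPE in writeExtractException() and
    //EXCEPTION_TYPE in getExceptionStrings() below).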

    public static final String TRUE = Boolean.toString(true);
    public static final String FALSE = Boolean.toString(false);

    protected static final AtomicInteger ID = new AtomicInteger();

    private final static String UNKNOWN_EXTENSION = "unk";
    //make this configurable
    private final static String DIGEST_KEY = "X-TIKA:digest:MD5";

    private static CommonTokenCountManager commonTokenCountManager;
    private String lastExtractExtension = null;

    AnalyzerManager analyzerManager;
    TokenCounter tokenCounter;

    public enum EXCEPTION_TYPE {
        RUNTIME,
        ENCRYPTION,
        ACCESS_PERMISSION,
        UNSUPPORTED_VERSION,
    }

    /**
     * If information was gathered from the log file about
     * a parse error
     */
    public enum PARSE_ERROR_TYPE {
        OOM,
        TIMEOUT
    }

    public static TableInfo MIME_TABLE = new TableInfo("mimes",
            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
    );

    private final static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");

    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path

    int maxContentLength = 10000000;
    int maxContentLengthForLangId = 50000;
    int maxTokens = 200000;

    //these remove runtime info from the stacktraces so
    //that actual causes can be counted.
    private final static Pattern CAUSED_BY_SNIPPER =
            Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");

    private final static Pattern ACCESS_PERMISSION_EXCEPTION =
            Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
    private final static Pattern ENCRYPTION_EXCEPTION =
            Pattern.compile("org\\.apache\\.tika\\.exception\\.EncryptedDocumentException");

    private TikaConfig config = TikaConfig.getDefaultConfig();//TODO: allow configuration
    final LanguageIDWrapper langIder;
    protected IDBWriter writer;

    /**
     * @param p path to the common_tokens directory.  If this is null, try to load from the classpath.
     * @throws IOException
     */
    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
        commonTokenCountManager = new CommonTokenCountManager(p, defaultLangCode);
    }
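
    //Usage sketch for loadCommonTokens() above (hypothetical path, for
    //illustration only): load the common tokens once, before any profiler
    //starts consuming files, e.g.
    //    AbstractProfiler.loadCommonTokens(Paths.get("resources/common_tokens"), "en");
    //or pass null to fall back to the lists bundled on the classpath.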

    public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
                            IDBWriter writer) {
        super(fileQueue);
        this.writer = writer;
        langIder = new LanguageIDWrapper();
        initAnalyzersAndTokenCounter(maxTokens);
    }

    private void initAnalyzersAndTokenCounter(int maxTokens) {
        try {
            analyzerManager = AnalyzerManager.newInstance(maxTokens);
            tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Truncate the content string to this length if it is longer.
     *
     * @param maxContentLength
     */
    public void setMaxContentLength(int maxContentLength) {
        this.maxContentLength = maxContentLength;
    }

    /**
     * Truncate the content string to this length for language id if it is longer.
     *
     * @param maxContentLengthForLangId
     */
    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
        this.maxContentLengthForLangId = maxContentLengthForLangId;
    }

    /**
     * Add a LimitTokenCountFilterFactory if maxTokens is &gt; -1.
     *
     * @param maxTokens
     */
    public void setMaxTokens(int maxTokens) {
        this.maxTokens = maxTokens;
        initAnalyzersAndTokenCounter(maxTokens);
    }

    protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
                                         String filePath, ExtractReaderException.TYPE type) throws IOException {
        Map<Cols, String> data = new HashMap<>();
        data.put(Cols.CONTAINER_ID, containerId);
        data.put(Cols.FILE_PATH, filePath);
        data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
        writer.writeRow(extractExceptionTable, data);
    }

    protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
                                    String fileId, String containerId,
                                    List<Integer> numAttachments, TableInfo profileTable) {
        Map<Cols, String> data = new HashMap<>();
        data.put(Cols.ID, fileId);
        data.put(Cols.CONTAINER_ID, containerId);
        data.put(Cols.MD5, m.get(DIGEST_KEY));

        if (i < numAttachments.size()) {
            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
        }
        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
        data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));

        Integer nPages = m.getInt(PagedText.N_PAGES);
        if (nPages != null) {
            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
        }

        //if this is the outer (container) document
        if (i == 0) {
            data.put(Cols.IS_EMBEDDED, FALSE);
            data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
        } else {
            data.put(Cols.IS_EMBEDDED, TRUE);
            data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
        }
        String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
        ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
        data.put(Cols.FILE_EXTENSION, ext);

        long srcFileLen = getSourceFileLength(m);
        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
            data.put(Cols.LENGTH, Long.toString(srcFileLen));
        } else {
            data.put(Cols.LENGTH, "");
        }

        String content = getContent(m);
        if (content == null || content.trim().length() == 0) {
            data.put(Cols.HAS_CONTENT, FALSE);
        } else {
            data.put(Cols.HAS_CONTENT, TRUE);
        }
        getFileTypes(m, data);
        try {
            writer.writeRow(profileTable, data);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
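
    //Illustrative (hypothetical) row written by writeProfileData() above for
    //an embedded attachment (i > 0):
    //    ID=12, CONTAINER_ID=7, IS_EMBEDDED=true, FILE_NAME="image1.png",
    //    FILE_EXTENSION="png", NUM_METADATA_VALUES=14, HAS_CONTENT=false, ...
    //The container document itself is written with i == 0 and IS_EMBEDDED=false.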
"" : ext.toLowerCase(Locale.US); data.put(Cols.FILE_EXTENSION, ext); long srcFileLen = getSourceFileLength(m); if (srcFileLen > NON_EXISTENT_FILE_LENGTH) { data.put(Cols.LENGTH, Long.toString(srcFileLen)); } else { data.put(Cols.LENGTH, ""); } int numMetadataValues = countMetadataValues(m); data.put(Cols.NUM_METADATA_VALUES, Integer.toString(numMetadataValues)); data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m)); String content = getContent(m); if (content == null || content.trim().length() == 0) { data.put(Cols.HAS_CONTENT, FALSE); } else { data.put(Cols.HAS_CONTENT, TRUE); } getFileTypes(m, data); try { writer.writeRow(profileTable, data); } catch (IOException e) { throw new RuntimeException(e); } } private static String getFileName(String path) { if (path == null) { return ""; } //filenameUtils checks for a null byte in the path. //it will throw an IllegalArgumentException if there is a null byte. //given that we're recording names and not using them on a file path //we should ignore this. try { return FilenameUtils.getName(path); } catch (IllegalArgumentException e) { LOG.warn("{} in {}", e.getMessage(), path); } path = path.replaceAll("\u0000", " "); try { return FilenameUtils.getName(path); } catch (IllegalArgumentException e) { LOG.warn("Again: {} in {}", e.getMessage(), path); } //give up return ""; } protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) { Map<Cols, String> data = new HashMap<>(); getExceptionStrings(m, data); if (data.keySet().size() > 0) { try { data.put(Cols.ID, fileId); writer.writeRow(exceptionTable, data); } catch (IOException e) { throw new RuntimeException(e); } } } /** * Checks to see if metadata is null or content is empty (null or only whitespace). * If any of these, then this does no processing, and the fileId is not * entered into the content table. * * @param fileId * @param m * @param fieldName * @param contentsTable */ protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException { if (m == null) { return; } Map<Cols, String> data = new HashMap<>(); String content = getContent(m, maxContentLength, data); if (content == null || content.trim().length() == 0) { return; } tokenCounter.clear(fieldName); tokenCounter.add(fieldName, content); data.put(Cols.ID, fileId); data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length())); langid(m, data); String langid = data.get(Cols.LANG_ID_1); langid = (langid == null) ? 
"" : langid; writeTokenCounts(data, fieldName, tokenCounter); CommonTokenResult commonTokenResult = null; try { commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName)); } catch (IOException e) { LOG.error("{}", e.getMessage(), e); } data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode()); data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens())); TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName); data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens())); data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens())); data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens())); data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy())); SummaryStatistics summStats = tokenStatistics.getSummaryStatistics(); data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum())); data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean())); data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation())); unicodeBlocks(m, data); try { writer.writeRow(contentsTable, data); } catch (IOException e) { throw new RuntimeException(e); } } String getTime(Metadata m) { String elapsed = "-1"; String v = m.get(RecursiveParserWrapper.PARSE_TIME_MILLIS); if (v != null) { return v; } return elapsed; } int countMetadataValues(Metadata m) { if (m == null) { return 0; } int i = 0; for (String n : m.names()) { i += m.getValues(n).length; } return i; } void getExceptionStrings(Metadata metadata, Map<Cols, String> data) { String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"); if (fullTrace == null) { fullTrace = metadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION); } if (fullTrace != null) { //check for "expected" exceptions...exceptions //that can't be fixed. //Do not store trace for "expected" exceptions Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace); if (matcher.find()) { data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal())); return; } matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace); if (matcher.find()) { data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal())); return; } data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal())); data.put(Cols.ORIG_STACK_TRACE, fullTrace); //TikaExceptions can have object ids, as in the "@2b1ea6ee" in: //org.apache.tika.exception.TikaException: TIKA-198: Illegal //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee //For reporting purposes, let's snip off the object id so that we can more //easily count exceptions. 

    /**
     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN}
     * whether the string was truncated.
     *
     * @param metadata
     * @param maxLength
     * @param data
     * @return
     */
    protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
        String c = getContent(metadata);
        if (maxLength > -1 && c.length() > maxLength) {
            c = c.substring(0, maxLength);
            data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
        }
        return c;
    }

    protected static String getContent(Metadata metadata) {
        if (metadata == null) {
            return "";
        }
        String c = metadata.get(RecursiveParserWrapper.TIKA_CONTENT);
        if (c == null) {
            return "";
        }
        return c;
    }

    void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
        String content = getContent(metadata);
        if (content.length() < 200) {
            return;
        }
        String s = content;
        if (content.length() > maxContentLengthForLangId) {
            s = content.substring(0, maxContentLengthForLangId);
        }
        Map<String, Integer> m = new HashMap<>();
        Reader r = new StringReader(s);
        try {
            int c = r.read();
            while (c != -1) {
                Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
                String blockString = (block == null) ? "NULL" : block.toString();
                Integer i = m.get(blockString);
                if (i == null) {
                    i = 0;
                }
                i++;
                m.put(blockString, i);
                c = r.read();
            }
        } catch (IOException e) {
            LOG.warn("IOException", e);
        }

        List<Pair<String, Integer>> pairs = new ArrayList<>();
        for (Map.Entry<String, Integer> e : m.entrySet()) {
            pairs.add(Pair.of(e.getKey(), e.getValue()));
        }
        Collections.sort(pairs, new Comparator<Pair<String, Integer>>() {
            @Override
            public int compare(Pair<String, Integer> o1, Pair<String, Integer> o2) {
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 20 && i < pairs.size(); i++) {
            if (i > 0) {
                sb.append(" | ");
            }
            sb.append(pairs.get(i).getKey()).append(": ").append(pairs.get(i).getValue());
        }
        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
    }

    void langid(Metadata metadata, Map<Cols, String> data) {
        String content = getContent(metadata);
        if (content.length() < 50) {
            return;
        }
        String s = content;
        if (content.length() > maxContentLengthForLangId) {
            s = content.substring(0, maxContentLengthForLangId);
        }
        List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
        if (probabilities.size() > 0) {
            data.put(Cols.LANG_ID_1, getLangString(probabilities.get(0)));
            data.put(Cols.LANG_ID_PROB_1,
                    Double.toString(probabilities.get(0).getProbability()));
        }
        if (probabilities.size() > 1) {
            data.put(Cols.LANG_ID_2, getLangString(probabilities.get(1)));
            data.put(Cols.LANG_ID_PROB_2,
                    Double.toString(probabilities.get(1).getProbability()));
        }
    }

    private String getLangString(DetectedLanguage detectedLanguage) {
        //so that we have a mapping between lang id and common-tokens file names
        String lang = detectedLanguage.getLocale().getLanguage();
        if ("zh".equals(lang)) {
            if (detectedLanguage.getLocale().getRegion().isPresent()) {
                lang += "-" + detectedLanguage.getLocale().getRegion().get().toLowerCase(Locale.US);
            } else {
                //hope for the best
                lang += "-cn";
            }
        }
        return lang;
    }

    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
        if (metadata == null) {
            return;
        }
        String type = metadata.get(Metadata.CONTENT_TYPE);
        if (type == null) {
            return;
        }
        int mimeId = writer.getMimeId(type);
        output.put(Cols.MIME_ID, Integer.toString(mimeId));
    }
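
    //Note: mime ids in getFileTypes() above are assigned by the IDBWriter
    //implementation; the mimes table (MIME_TABLE, defined near the top of
    //this class) stores the id-to-content-type mapping.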

    void writeTokenCounts(Map<Cols, String> data, String field, TokenCounter tokenCounter) {
        int i = 0;
        StringBuilder sb = new StringBuilder();
        TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(field);
        for (TokenIntPair t : tokenStatistics.getTopN()) {
            if (i++ > 0) {
                sb.append(" | ");
            }
            sb.append(t.getToken()).append(": ").append(t.getValue());
        }
        data.put(Cols.TOP_N_TOKENS, sb.toString());
    }

    public void closeWriter() throws IOException {
        writer.close();
    }

    /**
     * @param metadata
     * @param extracts
     * @return the EvalFilePaths for a file when crawling an extract directory
     */
    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
                                                     Path extracts) {
        String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
        Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
        Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
        //just try slapping the relextractfilepath on the extractdir
        Path extractFile = extracts.resolve(relExtractFilePath);
        if (!Files.isRegularFile(extractFile)) {
            //if that doesn't work, try to find the right extract file.
            //This is necessary if crawling extractsA and trying to find a file in
            //extractsB that is not in the same format: json vs txt or compressed
            extractFile = findFile(extracts, relativeSourceFilePath);
        }
        return new EvalFilePaths(relativeSourceFilePath, extractFile);
    }

    //call this if the crawler is crawling through the src directory
    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
                                                 Path extracts) {
        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
        Path extractFile = findFile(extracts, relativeSourceFilePath);
        Path inputFile = srcDir.resolve(relativeSourceFilePath);
        long srcLen = NON_EXISTENT_FILE_LENGTH;
        //try to get the length of the source file in case there was an error
        //in both extracts
        try {
            srcLen = Files.size(inputFile);
        } catch (IOException e) {
            LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
        }
        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
    }

    /**
     * @param extractRootDir
     * @param relativeSourceFilePath
     * @return the extract file, or null if one couldn't be found
     */
    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
        String relSrcFilePathString = relativeSourceFilePath.toString();
        if (lastExtractExtension != null) {
            Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
            if (Files.isRegularFile(candidate)) {
                return candidate;
            }
        }
        for (String ext : EXTRACT_EXTENSIONS) {
            for (String compress : COMPRESSION_EXTENSIONS) {
                Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
                if (Files.isRegularFile(candidate)) {
                    lastExtractExtension = ext + compress;
                    return candidate;
                }
            }
        }
        return null;
    }
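
    //Example lookup order (illustrative) in findFile() above for the source
    //file "docs/report.pdf":
    //    report.pdf.json, report.pdf.json.bz2, report.pdf.json.gzip, report.pdf.json.zip,
    //    report.pdf.txt, report.pdf.txt.bz2, ..., report.pdf, report.pdf.bz2, ...
    //all resolved against extractRootDir/docs/; the most recently successful
    //suffix is cached in lastExtractExtension and tried first on later calls.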

    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
        if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
            return fps.getSourceFileLength();
        }
        return getSourceFileLength(metadataList);
    }

    long getSourceFileLength(List<Metadata> metadataList) {
        if (metadataList == null || metadataList.size() < 1) {
            return NON_EXISTENT_FILE_LENGTH;
        }
        return getSourceFileLength(metadataList.get(0));
    }

    long getSourceFileLength(Metadata m) {
        String lenString = m.get(Metadata.CONTENT_LENGTH);
        if (lenString == null) {
            return NON_EXISTENT_FILE_LENGTH;
        }
        try {
            return Long.parseLong(lenString);
        } catch (NumberFormatException e) {
            //swallow
        }
        return NON_EXISTENT_FILE_LENGTH;
    }

    protected long getFileLength(Path p) {
        if (p != null && Files.isRegularFile(p)) {
            try {
                return Files.size(p);
            } catch (IOException e) {
                //swallow
            }
        }
        return NON_EXISTENT_FILE_LENGTH;
    }

    /**
     * @param list
     * @return an empty list if the input list is empty or null
     */
    static List<Integer> countAttachments(List<Metadata> list) {
        List<Integer> ret = new ArrayList<>();
        if (list == null || list.size() == 0) {
            return ret;
        }
        //container document attachment count = list.size()-1
        ret.add(list.size() - 1);

        Map<String, Integer> counts = new HashMap<>();
        for (int i = 1; i < list.size(); i++) {
            String path = list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
            if (path == null) {
                //shouldn't ever happen
                continue;
            }
            String[] parts = path.split("/");
            StringBuilder parent = new StringBuilder();
            for (int end = 1; end < parts.length - 1; end++) {
                parent.setLength(0);
                join("/", parent, parts, 1, end);
                String parentPath = parent.toString();
                Integer count = counts.get(parentPath);
                if (count == null) {
                    count = 1;
                } else {
                    count++;
                }
                counts.put(parentPath, count);
            }
        }

        for (int i = 1; i < list.size(); i++) {
            Integer count = counts.get(list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
            if (count == null) {
                count = 0;
            }
            ret.add(i, count);
        }
        return ret;
    }

    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
        for (int i = start; i <= end; i++) {
            sb.append(delimiter);
            sb.append(parts[i]);
        }
    }
}