XText.java example

Explorer

Xponents-master
- Basics
  - src
    - main
      - java
        org
        opensextant
        ConfigException.java
        data
        Country.java
        DocInput.java
        GeoBase.java
        Geocoding.java
        Language.java
        LatLon.java
        Place.java
        Taxon.java
        TextInput.java
        extraction
        ExtractionException.java
        ExtractionMetrics.java
        ExtractionResult.java
        Extractor.java
        MatchFilter.java
        NormalizationException.java
        TextEntity.java
        TextMatch.java
        processing
        Parameters.java
        ProcessingException.java
        util
        AnyFilenameFilter.java
        FileUtility.java
        GeodeticUtility.java
        GeonamesUtility.java
        TextUtils.java
    - test
      - java
        MetricsTest.java
        TestGeoUtils.java
        TestGeonamesLanguages.java
        TestGeonamesMeta.java
        TestTextUtils.java
- Examples
  - src
    - main
      - java
        org
        opensextant
        examples
        BasicGeoTemporalProcessing.java
        TaxonomicTagger.java
        WebCrawl.java
        twitter
        MicroMessage.java
        Tweet.java
        TweetGeocoder.java
- Extraction
  - src
    - main
      - java
        org
        opensextant
        extraction
        SolrMatcherSupport.java
        SolrTaggerRequest.java
        extractors
        geo
        BoundaryObserver.java
        CountryCount.java
        CountryObserver.java
        GazetteerMatcher.java
        GazetteerUpdateProcessorFactory.java
        LocationObserver.java
        PlaceCandidate.java
        PlaceCount.java
        PlaceEvidence.java
        PlaceGeocoder.java
        ScoredPlace.java
        SolrGazetteer.java
        TagFilter.java
        rules
        ContextualOrganizationRule.java
        CoordinateAssociationRule.java
        CountryRule.java
        GeocodeRule.java
        LocationChooserRule.java
        MajorPlaceRule.java
        NameCodeRule.java
        NameRule.java
        NonsenseFilter.java
        PersonNameFilter.java
        ProvinceAssociationRule.java
        xtax
        TaxonMatch.java
        TaxonMatcher.java
        output
        AbstractFormatter.java
        CSVFormatter.java
        FormatterFactory.java
        GDBFormatter.java
        GISDataFormatter.java
        GISDataModel.java
        GeoCSVFormatter.java
        KMLFormatter.java
        OpenSextantSchema.java
        ResultsFormatter.java
        ShapefileFormatter.java
        WKTFormatter.java
        processing
        ResultsUtility.java
        XtractorGroup.java
        progress
        ProgressListener.java
        ProgressMonitor.java
        ProgressMonitorBase.java
        util
        SolrProxy.java
        SolrUtil.java
    - test
      - java
        org
        opensextant
        extractors
        test
        TestExtraction.java
        TestGazFactory.java
        TestGazMatcher.java
        TestGazetteer.java
        TestGazetteerConflationKey.java
        TestPersonFilter.java
        TestPlaceGeocoder.java
        TestPlaceGeocoderLanguages.java
        TestStopFilters.java
        TestUtils.java
        TestXTax.java
- MapReduce
  - src
    - main
      - java
        org
        opensextant
        mapreduce
        AbstractMapper.java
        GeoTaggerMapper.java
        KeywordTaggerMapper.java
        Log4JUtils.java
        LoggingUtilities.java
        XponentsTaggerDemo.java
    - test
      - java
        org
        apache
        solr
        core
        CoreContainer.java
- Patterns
  - src
    - main
      - java
        org
        opensextant
        extractors
        flexpat
        AbstractFlexPat.java
        PatternTestCase.java
        RegexPattern.java
        RegexPatternManager.java
        TextMatchResult.java
        poli
        PatternsOfLife.java
        PoliMatch.java
        PoliPatternManager.java
        TestCase.java
        data
        MACAddress.java
        Money.java
        TelephoneNumber.java
        xcoord
        DMSFilter.java
        DMSOrdinate.java
        GeocoordMatch.java
        GeocoordMatchFilter.java
        GeocoordNormalization.java
        GeocoordPattern.java
        GeocoordPrecision.java
        GeocoordTestCase.java
        Hemisphere.java
        MGRSFilter.java
        MGRSParser.java
        PatternManager.java
        PrecisionScales.java
        UTMParser.java
        XConstants.java
        XCoord.java
        xtemporal
        DateMatch.java
        DateNormalization.java
        DateTimePattern.java
        PatternManager.java
        TestCase.java
        XTConstants.java
        XTemporal.java
    - test
      - java
        org
        opensextant
        extractors
        test
        DateNormalizationTest.java
        PrecisionScalesTest.java
        TestPoLi.java
        TestPoLiReporter.java
        TestXCoord.java
        TestXCoordReporter.java
        TestXTemporal.java
        TestXTemporalReporter.java
- XText
  - examples
  - src
    - main
      - java
        org
        opensextant
        xtext
        Content.java
        ConversionListener.java
        ConvertedDocument.java
        Converter.java
        ExclusionFilter.java
        PathManager.java
        XText.java
        collectors
        ArchiveNavigator.java
        CollectionListener.java
        Collector.java
        mailbox
        DefaultMailCrawl.java
        MailClient.java
        MailConfig.java
        NTLMAuth.java
        OutlookPSTCrawler.java
        sharepoint
        DefaultSharepointCrawl.java
        SPLink.java
        SharepointClient.java
        web
        CrawlFilter.java
        DefaultWebCrawl.java
        HyperLink.java
        WebClient.java
        converters
        ConverterAdapter.java
        DefaultConverter.java
        EmbeddedContentConverter.java
        ImageMetadataConverter.java
        MessageConverter.java
        TextTranscodingConverter.java
        TikaHTMLConverter.java
        WebArchiveConverter.java
    - test
      - java
        org
        opensextant
        xtext
        converters
        test
        MessageConverterTest.java
        test
        Decomposer.java
        ImageGroper.java
        MailClientTest.java
        SharepointClientTest.java
        SharepointCrawlTest.java
        TestPST.java
        TestSPLinks.java
        TestTikaPST.java
        Tests.java
        TextTranscodingTest.java
        WebLinkTest.java
- Xlayer
  - src
    - main
      - java
        org
        opensextant
        xlayer
        Transforms.java
        XlayerClient.java
        server
        RequestParameters.java
        TaggerResource.java
        XlayerApp.java
        xgeo
        XlayerRestlet.java
        XlayerServer.java
        XponentsGeotagger.java
    - test
      - java
        XlayerClientTest.java

/**
 *
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
// _____                                ____                     __                       __
///\  __`\                             /\  _`\                  /\ \__                   /\ \__
//\ \ \/\ \   _____      __     ___    \ \,\L\_\      __   __  _\ \ ,_\     __       ___ \ \ ,_\
// \ \ \ \ \ /\ '__`\  /'__`\ /' _ `\   \/_\__ \    /'__`\/\ \/'\\ \ \/   /'__`\   /' _ `\\ \ \/
//  \ \ \_\ \\ \ \L\ \/\  __/ /\ \/\ \    /\ \L\ \ /\  __/\/>  </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
//   \ \_____\\ \ ,__/\ \____\\ \_\ \_\   \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
//    \/_____/ \ \ \/  \/____/ \/_/\/_/    \/_____/ \/____/\//\/_/  \/__/ \/__/\/_/ \/_/\/_/ \/__/
//            \ \_\
//             \/_/
//
//   OpenSextant XText
// *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
// */
package org.opensextant.xtext;

import static org.apache.commons.lang3.StringUtils.isBlank;
import gnu.getopt.LongOpt;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import javax.activation.MimeType;
import javax.activation.MimeTypeParseException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.tika.io.IOUtils;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.collectors.ArchiveNavigator;
import org.opensextant.xtext.collectors.mailbox.OutlookPSTCrawler;
import org.opensextant.xtext.converters.DefaultConverter;
import org.opensextant.xtext.converters.EmbeddedContentConverter;
import org.opensextant.xtext.converters.ImageMetadataConverter;
import org.opensextant.xtext.converters.MessageConverter;
import org.opensextant.xtext.converters.TextTranscodingConverter;
import org.opensextant.xtext.converters.TikaHTMLConverter;
import org.opensextant.xtext.converters.WebArchiveConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 * Traverse a folder and return text versions of the documents found, archiving
 * the text-only copies at an output location of your choice.
 *
 * <pre>
 *
 * if input is a file, convert. Done.
 *
 * if input is an archive, unpack in temp space, iterate over dir, convert each.
 * Done
 *
 * if input is a folder iterate over dir, convert each. Done
 * </pre>
 *
 * TEXT OUTPUT form includes a JSON document header with metadata properties
 * from the original item. These are valid elements of the conversion process.
 * We try to maintain them apart from the true, readable text of the document.
 *
 *
 * Add a ConversionListener to an XText instance to capture each converted document
 * as it comes out of the main loop for converting archives and folders.
 *
 * extractText() runs over any file type and extracts text, saving it and pushing
 * events to one optional listener.
 *
 * convertFile(File) will convert a single file, returning a ConvertedDocument
 *
 *
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
public final class XText implements ExclusionFilter, Converter {

    // SLF4J logger named after this instance's runtime class.
    private final Logger log = LoggerFactory.getLogger(getClass());
    // Set through enableHTMLScrubber(); HTML scrubbing is off unless enabled.
    private boolean scrubHTML = false;

    // Path bookkeeping used by every conversion in this class: input root,
    // conversion cache, strip-prefix path and the saving flags.
    private final PathManager paths = new PathManager();

    /**
     * Exposes the path manager this instance uses for input roots, caching and
     * output locations.
     *
     * @return the single PathManager owned by this XText instance
     */
    public PathManager getPathManager() {
        return this.paths;
    }

    /**
     * Flag to manage if children (embedded documents) are extracted or not.
     * Set through enableEmbeddedExtraction().
     */
    private boolean extractEmbedded = false;

    /**
     * Maximum text buffer size; starts at DefaultConverter.MAX_TEXT_SIZE
     * (XText default is 1 MB of text).
     */
    private int maxBuffer = DefaultConverter.MAX_TEXT_SIZE;
    /**
     * Heuristic - HTML content is likely 5x, maybe a lot more, the size of the
     * plain text it contains. So with 1 MB the target max text size, 5 MB would be
     * the largest HTML document accepted here, by default.
     *
     * NOTE(review): this field is final and computed once at construction, so a
     * later setMaxBufferSize() call does not change it.
     */
    private final int maxHTMLBuffer = 5 * maxBuffer;
    /** Largest input file size accepted by convertFile(); starts at FILE_SIZE_LIMIT. */
    private long maxFileSize = FILE_SIZE_LIMIT;

    /** Lower-case file extensions that isArchive() treats as archives. */
    protected Set<String> archiveFileTypes = new HashSet<String>();

    /**
     * Converters keyed by file extension; convertFile() looks up the lower-cased
     * extension here first.
     * NOTE(review): public, static and mutable, so it is shared by all XText instances.
     */
    public static Map<String, Converter> converters = new HashMap<String, Converter>();
    /** Fallback converter used when no extension-specific converter is registered. */
    private Converter defaultConversion;
    /** Converter used for supported types when embedded extraction is enabled. */
    private Converter embeddedConversion;
    /** Lower-case extensions the caller asked to convert; see convertFileType(). */
    private final Set<String> requestedFileTypes = new HashSet<String>();
    /** Lower-case extensions the caller asked to skip; see ignoreFileType(). */
    private final Set<String> ignoreFileTypes = new HashSet<String>();
    /** When true, files lacking an extension are not filtered out; see enableNoFileExtension(). */
    private boolean allowNoExtension = false;

    /**
     * Creates an XText instance and applies the settings established by
     * {@code defaults()}.
     */
    public XText() {
        this.defaults();
    }

    /**
     * Sets the overwrite flag on ConvertedDocument. The flag is a static field,
     * so the setting is not specific to this XText instance.
     *
     * @param b
     *            value assigned to {@code ConvertedDocument.overwrite}
     */
    public void enableOverwrite(boolean b) {
        final boolean overwriteExisting = b;
        ConvertedDocument.overwrite = overwriteExisting;
    }

    /**
     * Sets the folder used as the conversion cache by forwarding to the path
     * manager.
     *
     * @param root
     *            path of the conversion cache folder
     * @throws IOException
     *             if the path manager rejects the folder
     * @deprecated use getPathManager().setConversionCache( path )
     */
    @Deprecated
    public void setArchiveDir(String root) throws IOException {
        this.paths.setConversionCache(root);
    }

    /**
     * Overrides the maximum text buffer size.
     * The derived HTML buffer limit ({@code maxHTMLBuffer}) is a final field and
     * is not recalculated by this call.
     *
     * @param sz
     *            new maximum buffer size
     */
    public void setMaxBufferSize(int sz) {
        this.maxBuffer = sz;
    }

    /**
     * Overrides the largest file size that convertFile() will accept; larger
     * files are skipped. The initial value is {@link #FILE_SIZE_LIMIT}.
     *
     * @param sz
     *            new size limit, compared against FileUtils.sizeOf(file)
     */
    public void setMaxFileSize(int sz) {
        this.maxFileSize = sz;
    }

    /**
     * Controls whether files without a file extension are accepted.
     * When enabled, the path filter no longer rejects extension-less files.
     *
     * @param b
     *            true to accept files that have no extension
     */
    public void enableNoFileExtension(boolean b) {
        this.allowNoExtension = b;
    }

    /**
     * Turns the Tika-based HTML scrubber on or off. Default: no scrubbing.
     *
     * @param b
     *            true to scrub (scrape) HTML content
     */
    public void enableHTMLScrubber(boolean b) {
        this.scrubHTML = b;
    }

    /**
     * Enables or disables extraction of embedded child documents from the
     * documents found. Embedded extraction may yield many small sub-documents
     * (children), and such documents are not served from the conversion cache.
     *
     * @param b
     *            true to enable
     */
    public void enableEmbeddedExtraction(boolean b) {
        this.extractEmbedded = b;
    }

    /**
     * Overall switch for saving converted output; forwarded to the path manager.
     * Saving additionally requires that the caller either saves alongside the
     * input or provides an archive/cache root.
     *
     * @param b
     *            true to enable saving
     */
    public void enableSaving(boolean b) {
        this.paths.enableSaving(b);
    }

    /**
     * Add the file extension for the file type you wish to convert. If Tika
     * supports it by default it should be no problem.
     * Adding requested file types here only allows the API to know by file extension
     * what types to filter in and convert. Without a file extension, the file
     * still needs to be ingested and converted to identify the file type.
     *
     * The extension is stored lower-cased. A null extension is ignored, matching
     * the behavior of {@link #ignoreFileType(String)}.
     *
     * @param ext
     *            a file extension to convert; null is silently ignored
     */
    public void convertFileType(String ext) {
        if (ext == null) {
            // Nothing to register; avoids a NullPointerException on toLowerCase().
            return;
        }
        requestedFileTypes.add(ext.toLowerCase());
    }

    /**
     * Registers a file extension that should NOT be converted. The extension is
     * stored lower-cased. A null argument is silently ignored.
     *
     * @param ext
     *            a file extension to NOT convert
     */
    public void ignoreFileType(String ext) {
        if (ext == null) {
            return;
        }
        final String normalized = ext.toLowerCase();
        ignoreFileTypes.add(normalized);
    }

    // Optional listener notified by convertFile() for top-level (parent == null) documents; null means no callback.
    private ConversionListener postProcessor = null;

    /**
     * Registers the conversion listener: an outside application or routine that
     * does something more with each converted document. When no listener is set,
     * nothing extra happens after conversion.
     *
     * @param processor
     *            a listener that handles the documents that have been found
     */
    public void setConversionListener(ConversionListener processor) {
        this.postProcessor = processor;
    }

    // When false (default), .pst inputs are routed to convertOutlookPST(); when true they fall through to normal conversion.
    private boolean useTikaPST = false;

    /**
     * Chooses how Outlook PST files are handled. When the flag is false, PST
     * inputs go through {@code convertOutlookPST}; when true, that dedicated
     * path is bypassed.
     *
     * @param flag
     *            true to bypass the dedicated PST crawler
     */
    public void enableTikaPST(boolean flag) {
        this.useTikaPST = flag;
    }

    /**
     * Tells whether a path names an archive, judged only by its file extension.
     * The extension is lower-cased and looked up in {@code archiveFileTypes}.
     *
     * @param fpath
     *            file path or file name to test
     * @return true if the extension is a registered archive type; false when no
     *         extension can be determined
     */
    public boolean isArchive(String fpath) {
        final String extension = FilenameUtils.getExtension(fpath);
        return extension != null && archiveFileTypes.contains(extension.toLowerCase());
    }

    /**
     * Tells whether a path names an Outlook PST file, judged by its extension.
     *
     * @param fpath
     *            file path or file name to test
     * @return true if the extension is "pst" in any letter case
     */
    public boolean isPST(String fpath) {
        final String extension = FilenameUtils.getExtension(fpath);
        return isPSTExtension(extension);
    }

    /**
     * Tests whether a file extension denotes an Outlook PST file.
     *
     * @param ext
     *            extension without the leading dot; may be null
     * @return true if ext equals "pst" ignoring case; false for null
     */
    public static boolean isPSTExtension(String ext) {
        // equalsIgnoreCase(null) is false, so null needs no separate branch.
        return "pst".equalsIgnoreCase(ext);
    }

    // Sum of conversion_time over the documents passed to trackStatistics().
    protected long total_conv_time = 0;
    // Integer average of conversion time; only refreshed by reportStatistics().
    protected int average_conv_time = 0;
    // Number of trackStatistics() calls, including calls made with a null document.
    protected int total_conversions = 0;

    /**
     * Records overall counts and conversion times for documents converted.
     * Every call counts as one conversion; only non-null documents contribute
     * to the accumulated time. This may not account for error'd documents.
     *
     * @param d
     *            ConvertedDocument, possibly null
     */
    protected void trackStatistics(ConvertedDocument d) {
        total_conversions++;
        if (d == null) {
            return;
        }
        total_conv_time += d.conversion_time;
    }

    /**
     * Recomputes the average conversion time and logs it with the total number
     * of conversions. With zero conversions the float division yields NaN,
     * which the int cast turns into 0.
     */
    public void reportStatistics() {
        final float meanMillis = (float) total_conv_time / total_conversions;
        average_conv_time = (int) meanMillis;

        final String summary = "TOTAL of N=" + total_conversions + " documents converted"
                + "\n With an average time (ms) of " + average_conv_time;
        log.info(summary);
    }

    // Wall-clock time (System.currentTimeMillis) captured when extractText() begins.
    protected long start_time = 0;
    // Wall-clock time captured after extractText() has dispatched the conversion; not set if it throws first.
    protected long stop_time = 0;

    /**
     * Optional, convenience entry point for converting compound documents and
     * folders; this is what XText uses when run as a main program. API callers
     * may call convertFile() directly and skip the setup and assumptions made here.
     *
     * The input path is made absolute and normalized, then dispatched by kind:
     * archive, Outlook PST (unless Tika PST handling is enabled), plain file,
     * or folder. Statistics are reported at the end.
     *
     * @param filepath
     *            item from which we extract text
     * @throws IOException
     *             if the path cannot be normalized, does not exist, or conversion fails
     * @throws ConfigException
     *             if the path points into an XText cache, or conversion is misconfigured
     */
    public void extractText(String filepath) throws IOException, ConfigException {

        start_time = System.currentTimeMillis();

        log.info("Conversion.  INPUT PATH={}", filepath);
        final String absolute = new File(filepath).getAbsolutePath();
        final String normalized = FilenameUtils.normalize(absolute, true);
        if (normalized == null) {
            throw new IOException("Failed to normalize the path: " + filepath);
        }

        final File source = new File(normalized);
        if (!source.exists()) {
            throw new IOException("Non existent input FILE=" + normalized);
        }

        // Refuse to process our own cache output; the test is on the absolute path.
        if (PathManager.isXTextCache(normalized)) {
            throw new ConfigException(
                    "XText cannot be directed to extract text from its own cache files. "
                            + "Move the cache files out of ./xtext/ folders if you really need to do this.");
        }

        if (isArchive(source.getName())) {
            // Originals are collected to the "export" area and conversions saved to "output".
            // PathManager is STATEFUL while the archive is processed; nested archives found
            // during traversal can be dumped to the child export folder.
            convertArchive(source);
        } else if (isPST(source.getName()) && !useTikaPST) {
            convertOutlookPST(source);
        } else if (source.isFile()) {
            // Without a prefix set, the conversion is dumped flatly to the output area.
            paths.setInputRoot(source);
            convertFile(source);
        } else if (source.isDirectory()) {
            paths.setInputRoot(source);
            convertFolder(source);
        }

        stop_time = System.currentTimeMillis();

        if (paths.isSaving()) {
            if (!paths.isSaveWithInput()) {
                log.info("Output can be accessed at " + paths.getConversionCache());
            } else {
                log.info(
                        "Output can be accessed at from the input folder {} in 'xtext' sub-folders",
                        source.getParent());
            }
        }

        reportStatistics();
    }

    /**
     * Decides whether a File should be skipped. A file recognized by
     * PathManager as part of an XText cache (e.g. "./a/b/c/xtext/file.doc.txt")
     * is always skipped; otherwise the decision is delegated to the path-based
     * filter using the file's absolute path.
     *
     * @param input
     *            file obj
     * @return true if the file must be omitted from conversion
     */
    private boolean filterOutFile(File input) {
        return PathManager.isXTextCache(input) || filterOutFile(input.getAbsolutePath());
    }

    /**
     * Filter the type of files to ignore, based on the path alone.
     *
     * A path is filtered out when it is an XText cache location, when the file
     * name starts with a dot (hidden file), when the path contains ".svn", when
     * the base name ends with the converted-text suffix, when it has no extension
     * and extension-less files are not allowed, or when its extension was not
     * requested through convertFileType().
     *
     * @param filepath
     *            path of the candidate file
     * @return true if the file should be skipped
     */
    @Override
    public boolean filterOutFile(String filepath) {

        // Filter out any of our own xtext caches
        //
        if (PathManager.isXTextCache(filepath)) {
            return true;
        }

        // Hidden files: test the full file name. The base name of a name such as
        // ".cvs_ignore" is the empty string, so testing the base name misses it.
        String name = FilenameUtils.getName(filepath);
        if (name.startsWith(".")) {
            return true;
        }
        if (filepath.contains(".svn")) {
            return true;
        }

        // ignore '-utf8.txt' as XText likely generated them.
        //
        String n = FilenameUtils.getBaseName(filepath);
        if (n.endsWith(ConvertedDocument.CONVERTED_TEXT_EXT)) {
            return true;
        }

        String ext = FilenameUtils.getExtension(filepath);
        if (isBlank(ext)) {
            // No extension: keep the file only if the caller opted in.
            return !allowNoExtension;
        }
        return !requestedFileTypes.contains(ext.toLowerCase());
    }

    /**
     * Unpack an archive and convert items found.
     * Given (input)/A.zip
     * The zip is dearchived to
     * (input)/A_zip/
     * or (archive)/(input)/A_zip
     *
     * Items are then converted in either folder for the conversion archiving; depending on your choice of embedded vs.
     * non-embedded
     *
     * While the archive is processed the path manager's strip-prefix path points
     * at the export folder. The previous value is restored when processing ends,
     * including when it ends with an exception, so that later conversions do not
     * inherit this archive's prefix.
     * NOTE(review): the input root is set to the export folder and is not restored here.
     *
     * @param input
     *            archive file object
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err
     *
     */
    public void convertArchive(File input) throws IOException, ConfigException {

        if (!paths.verifyArchiveExport(input.getAbsolutePath())) {
            return;
        }

        File saveFolder = paths.getArchiveExportDir(input);
        String savePrefix = paths.getStipPrefixPath();

        paths.setStripPrefixPath(saveFolder.getAbsolutePath());
        try {
            paths.setInputRoot(saveFolder);

            // XText acts as both the exclusion filter and the converter for the navigator.
            ArchiveNavigator deArchiver = new ArchiveNavigator(input,
                    saveFolder.getAbsolutePath(), this, this);
            deArchiver.overwrite = ConvertedDocument.overwrite;

            log.info("\tArchive Found ({}). Expanding to {}", input, saveFolder);

            deArchiver.collect();
        } finally {
            // Done (or failed): always put the caller's prefix back.
            paths.setStripPrefixPath(savePrefix);
        }
    }

    /**
     * Crawls an Outlook PST file with OutlookPSTCrawler, using this XText
     * instance as the converter for the items found. Output goes to the archive
     * export folder the path manager assigns to the PST.
     *
     * An error is logged if saving is disabled, but the crawl is still attempted.
     * The path manager's strip-prefix path is changed for the duration of the
     * crawl and restored afterwards, including when configuration or collection
     * fails.
     * NOTE(review): the input root is set to the export folder and is not restored here.
     *
     * @param input
     *            input PST object
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err, including any failure raised while collecting the PST
     */
    public void convertOutlookPST(File input) throws ConfigException, IOException {
        if (!paths.isSaving()) {
            log.error(
                    "Warning -- PST file found, but save = true is required to parse it.  Enable saving and chose a cache folder");
        }

        OutlookPSTCrawler pst = new OutlookPSTCrawler(input);
        pst.setConverter(this);
        pst.overwriteMode = ConvertedDocument.overwrite;
        pst.incrementalMode = true;

        File saveFolder = paths.getArchiveExportDir(input);
        String savePrefix = paths.getStipPrefixPath();

        paths.setStripPrefixPath(saveFolder.getAbsolutePath());
        try {
            paths.setInputRoot(saveFolder);
            pst.setOutputPSTDir(saveFolder);
            pst.configure();

            log.info("\tPST Email Archive Found ({}). Expanding to {}", input, saveFolder);

            try {
                pst.collect();
            } catch (Exception err) {
                throw new ConfigException("Unable to fully digest PST file " + input, err);
            }
        } finally {
            // Done (or failed): always put the caller's prefix back.
            paths.setStripPrefixPath(savePrefix);
        }
    }

    /**
     * Arbitrary 32 MB limit on file size (0x2000000 = 33,554,432). Maybe this
     * should be dependent on the file type. Used as the initial value of the
     * per-instance maximum file size, which convertFile() compares against
     * FileUtils.sizeOf(file).
     */
    public static final long FILE_SIZE_LIMIT = 0x2000000;

    /**
     * Converter interface entry point, used as the proxy when traversing archives.
     *
     * Archive Navigator calls this method to convert and post-process each item,
     * which makes XText itself a super-converter, whereas the classes in the
     * converter package are stateless, simple conversions. The call is forwarded
     * to convertFile(File), which deals with saving and archiving.
     *
     * Items retrieved from Archive Navigator are deleted from their temp space.
     *
     * @param input
     *            file
     * @return the converted document, or null when the file was skipped
     * @throws ConfigException
     *             on err
     * @throws IOException
     *             on err
     */
    @Override
    public ConvertedDocument convert(File input) throws IOException, ConfigException {
        final ConvertedDocument converted = convertFile(input);
        return converted;
    }

    /**
     * Unsupported Converter interface method; it always throws. To convert text
     * held in a String rather than a File, instantiate the converter
     * implementation for the kind of data you have. E.g., for a buffer of HTML
     * content, call TikaHTMLConverter().convert( buffer ) directly.
     *
     * @param data
     *            raw data
     * @return never returns normally
     * @throws IOException
     *             always, to signal the unsupported operation
     */
    @Override
    public ConvertedDocument convert(String data) throws IOException {
        final String reason = "Unsupported interface:  To convert text or binary data directly "
                + "you must use an instance of a XText converter, e.g., TikaHTMLConverter";
        throw new IOException(reason);
    }

    /**
     * Converts a single top-level file, i.e. one that has no parent document.
     *
     * @param input
     *            the input
     * @return the converted document, or null when the file was skipped
     * @throws IOException
     *             on err
     * @throws ConfigException
     *             on err
     */
    public ConvertedDocument convertFile(File input) throws IOException, ConfigException {
        final ConvertedDocument noParent = null;
        return convertFile(input, noParent);
    }

    /**
     * Convert one file and save it off. We ignore hidden files and files in
     * hidden folders, e.g., .cvs_ignore, mycode/.svn/abc.txt
     *
     * This is the end of the line for the conversion logic; convertFile figures
     * out if it should return the cached version or attempt a conversion; it
     * also tries to save children items. As children items may require special
     * attention they are not converted -- caller can pass in ConversionListener
     * and can deal with children file objects on their end.
     *
     * Filtering rules: top-level files (no parent) pass through filterOutFile();
     * extensions registered with ignoreFileType() are always skipped; and, unless
     * extension-less files are allowed, only extensions registered with
     * convertFileType() are converted. Archives and PST files are expanded via
     * convertArchive()/convertOutlookPST() and yield null here.
     *
     * @param input
     *            child input obj to convert
     * @param parent
     *            parent in which child was found; null for a top-level file
     * @return converted document object; null if the file was filtered out, is an
     *         archive/PST container, or exceeds the maximum file size
     * @throws IOException
     *             on err, including any failure raised by the converter
     * @throws ConfigException
     *             on err
     */
    public ConvertedDocument convertFile(File input, ConvertedDocument parent) throws IOException,
            ConfigException {

        if (parent == null && filterOutFile(input)) {
            return null;
        }

        if (paths.isSaving()) {
            if (!paths.isSaveWithInput() && !paths.hasInputRoot()) {

                throw new IOException(
                        "Please set an input root; convertFile() was called in save/cache mode without having PathManager setup");
            }
        }

        String fname = input.getName();

        String ext = FilenameUtils.getExtension(fname).toLowerCase();

        // Explicitly ignored types are skipped regardless of the no-extension
        // setting; otherwise ignoreFileType() would have no effect once
        // extension-less files are allowed.
        if (ignoreFileTypes.contains(ext)) {
            return null;
        }

        // The requested-type restriction only applies when extensions are required.
        if (!allowNoExtension && !requestedFileTypes.contains(ext)) {
            return null;
        }

        log.debug("Converting FILE=" + input.getAbsolutePath());

        /*
         * Handle archives or PST files. Or other large compound single file.
         */
        if (isArchive(fname)) {
            convertArchive(input);

            // NULL here implies the actual file, A.zip does not have any text representation itself.
            // However its children do.
            return null;
        } else if (isPSTExtension(ext) && !useTikaPST) {
            convertOutlookPST(input);
            return null;
        }

        /*
         * Otherwise this is a normal file...
         */
        if (FileUtils.sizeOf(input) > maxFileSize) {
            log.info("Valid File is too large FILE=" + input.getAbsolutePath());
            return null;
        }

        // Pick a converter: extension-specific, then embedded-content, then the default.
        boolean cachable = true;
        Converter converter = converters.get(ext);
        if (converter == null) {
            if (extractEmbedded && EmbeddedContentConverter.isSupported(ext)) {
                converter = embeddedConversion;
                cachable = false; // Such content is processed every time.  Oh well...
            } else {
                converter = defaultConversion;
            }
        }

        ConvertedDocument textDoc = null;

        // ------------------
        // Retrieve previous conversions
        // ------------------
        if (cachable && !ConvertedDocument.overwrite && paths.isSaving()) {
            textDoc = paths.getCachedConversion(input);
        }

        // ------------------
        // Convert or Read object, IFF no cache exists for that object.
        // ------------------
        if (textDoc == null) {
            // Measure how long conversions take.
            long t1 = System.currentTimeMillis();

            try {
                textDoc = converter.convert(input);
            } catch (Exception convErr) {
                throw new IOException("Conversion error FILE=" + input.getPath(), convErr);
            }
            long t2 = System.currentTimeMillis();
            int duration = (int) (t2 - t1);
            if (textDoc != null) {
                // Buffer can be null. If you got this far, you are interested
                // in the file, as it passed all filters above. Return the
                // document with whatever metadata it found.
                if (paths.isSaving() && textDoc.is_converted) {
                    // Get Parent info in there.
                    if (parent != null) {
                        textDoc.setParent(parent);
                    }

                    paths.saveConversion(textDoc);

                    // Children items will be persisted in the same folder
                    // structure where the textdoc.textpath resides.
                    // That is, Email or Embedded objects that are parsed are
                    // saved in ./xtext/ folder or in the separate archive.
                    // But this must be done now, as we have all the dynamic
                    // metadata + raw artifacts; as it is all written out to
                    // disk, it will be written out together.
                    //
                    if (textDoc.hasRawChildren()) {
                        convertChildren(textDoc);

                        // 1. children saved to disk
                        // 2. children converted.
                        // 3. children attached to parent here.
                        // 'textdoc' should now be well endowed with all the
                        // children metadata.
                    }
                }
            } else {
                // Converter produced nothing: hand back a bare document for the input.
                textDoc = new ConvertedDocument(input);
            }

            textDoc.conversion_time = duration;
            if (textDoc.filetime == null) {
                textDoc.filetime = textDoc.getFiletime();
            }
        }

        /*
         * Conversion Listeners are called only for parent documents. That is
         * for an email with 4 attachments, this listener is called on the
         * parent email message, but not for the individual 4 attachments. The
         * final parent document here will have all Raw Children (bytes +
         * metadata) and Converted Children (ConvertedDocument obj) Caller will
         * have to detect if returned item via listener is a Parent with
         * Children.
         *
         * Behavior here is TBD.
         */
        if (postProcessor != null && parent == null) {
            postProcessor.handleConversion(textDoc, input.getAbsolutePath());
        }

        trackStatistics(textDoc);
        return textDoc;
    }

    /**
     * Walk a folder, descending into every sub-folder, and attempt to convert
     * each file whose extension matches the configured file filters
     * (case-insensitive). A failure on a single file is logged and the walk
     * carries on, so one bad file never sacrifices the whole job.
     *
     * @param input
     *            the folder to traverse
     * @throws IOException
     *             on err
     */
    public void convertFolder(File input) throws IOException {
        // Select files by suffix; accept every directory while recursing.
        SuffixFileFilter bySuffix = new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE);
        java.util.Iterator<File> candidates = FileUtils.listFiles(input, bySuffix,
                FileFilterUtils.trueFileFilter()).iterator();

        while (candidates.hasNext()) {
            File candidate = candidates.next();
            try {
                convertFile(candidate);
            } catch (Exception convErr) {
                // Trap per-file errors here so the remaining files still get a chance.
                log.error("Conversion error, FILE=" + candidate.getPath(), convErr);
            }
        }
    }

    /**
     * Save the raw children of a given ConvertedDocument into the parent's
     * child container folder, convert each saved item immediately, and attach
     * the resulting conversion to the parent. You should have called setParent
     * already.
     * <p>
     * Web archives are treated as a single document and are skipped entirely.
     * A failure on one child is logged and does not prevent the remaining
     * children from being processed.
     *
     * @param parentDoc
     *            parent conversion
     * @throws IOException
     *             on err
     */
    public void convertChildren(ConvertedDocument parentDoc) throws IOException {

        if (parentDoc.is_webArchive) {
            // Web Archive is a single document.  Only intent here is to convert to a single text document.
            //
            return;
        }

        parentDoc.evalParentChildContainer();
        FileUtility.makeDirectory(parentDoc.parentContainer);
        String targetPath = parentDoc.parentContainer.getAbsolutePath();

        // Canonical container path (with trailing separator), used to verify
        // that no child is written outside of the container folder.
        String containerRoot = new File(targetPath).getCanonicalPath();
        if (!containerRoot.endsWith(File.separator)) {
            containerRoot = containerRoot + File.separator;
        }

        for (Content child : parentDoc.getRawChildren()) {
            if (child.content == null) {
                log.error("Attempted to write out child object with no content {}", child.id);
                continue;
            }

            try {
                // We just assume for now Child ID is filename.
                // Alternatively, child.meta.getProperty(
                // ConvertedDocument.CHILD_ENTRY_KEY )
                // same result, just more verbose.
                //
                // Child IDs originate from embedded/attached content, so they are
                // not trusted: concat() yields null for an invalid path and accepts
                // absolute or ".." paths that would land outside the container.
                String childPath = FilenameUtils.concat(targetPath, child.id);
                if (childPath == null) {
                    log.error("Child {} has an invalid file name; skipping", child.id);
                    continue;
                }
                File childFile = new File(childPath);
                if (!childFile.getCanonicalPath().startsWith(containerRoot)) {
                    log.error("Child {} resolves outside of container {}; skipping", child.id,
                            targetPath);
                    continue;
                }

                // Write and close the child file BEFORE converting it, so the
                // converter reads a complete, released file and a close() failure
                // is handled per-child instead of aborting the whole loop.
                OutputStream io = new FileOutputStream(childFile);
                try {
                    IOUtils.write(child.content, io);
                } finally {
                    io.close();
                }

                ConvertedDocument childConv = convertFile(childFile, parentDoc);
                if (childConv != null) {
                    if (childConv.is_converted) {
                        // Push down all child metadata down to ConvertedDoc
                        for (String k : child.meta.stringPropertyNames()) {
                            String val = child.meta.getProperty(k);
                            childConv.addUserProperty(k, val);
                        }
                        // Save cached version once again.
                        childConv.saveBuffer(new File(childConv.textpath));
                    }

                    if (child.mimeType != null) {
                        try {
                            childConv.setMimeType(new MimeType(child.mimeType));
                        } catch (MimeTypeParseException e) {
                            log.warn("Invalid mime type encountered: {} ignoring.", child.mimeType);
                        }
                    }

                    parentDoc.addChild(childConv);
                }
            } catch (Exception err) {
                log.error("Failed to write out child {}, but will continue with others", child.id,
                        err);
            }
        }
    }

    /**
     * Register the stock archive types and the stock set of requested file
     * types. Per the original note this is invoked by default; to change the
     * behavior, adjust these settings before setup() is called.
     */
    public void defaults() {

        // Archive containers. ("7z" is intentionally not registered.)
        final String[] stockArchives = { "zip", "gz", "tar", "tgz", "tar.gz" };
        for (String ext : stockArchives) {
            archiveFileTypes.add(ext);
        }

        // TODO: get these from a config file.
        final String[] stockDocuments = {
                // Common documents and messages.
                "doc", "docx", "pdf", "htm", "html",
                "txt", // only for encoding conversions.
                "msg", "eml", "emlx", "ppt", "pptx", "xlsx", "xls", "rtf",

                // Testing:
                "dot", "dotx", "odt", "odf", "docm",

                // Web Archives. ("wps", MS Works, is left out: no real Tika support.)
                "mht",

                // Only photographic images are supported by default.
                // BMP, GIF, PNG, ICO, etc. must be added by caller.
                "jpg", "jpeg",

                // Limited PST support here. PST will not behave the same as other
                // files; it is closer to a Zip archive than an ordinary file.
                "pst" };
        for (String ext : stockDocuments) {
            requestedFileTypes.add(ext);
        }

        // "log" is uncommon and not registered. Caller must explicitly add
        // raw data types and archives.
    }

    /**
     * Start over: forget every registered converter and every requested file
     * type. The archive file types are not touched by this call.
     */
    public void clearSettings() {
        this.converters.clear();
        this.requestedFileTypes.clear();
    }

    /**
     * Prepare the converters and path settings for a run.
     * <p>
     * If by this point you have taken items out of the requested types, the
     * matching converters will not be set up. E.g., if you don't want text,
     * HTML, email, web archive or image conversion, those converters are never
     * instantiated. The default and embedded-content converters are always
     * created.
     * <p>
     * Requesting "html" also registers "htm" and "xhtml". Every requested type
     * also gets a "&lt;type&gt;.txt" ignore rule, and the final list of
     * requested types becomes the file filter used when walking folders.
     *
     * @throws IOException
     *             on err
     */
    public void setup() throws IOException {

        defaultConversion = new DefaultConverter(maxBuffer);
        embeddedConversion = new EmbeddedContentConverter(maxBuffer);

        paths.configure();

        // Invoke converter instances only as requested types suggest.
        // If caller has removed file types from the list, then the
        // corresponding converter is never created.

        if (requestedFileTypes.contains("txt")) {
            converters.put("txt", new TextTranscodingConverter());
        }

        if (requestedFileTypes.contains("html")) {
            // One HTML converter serves all three HTML-ish extensions.
            Converter webConv = new TikaHTMLConverter(this.scrubHTML, maxHTMLBuffer);
            converters.put("html", webConv);
            converters.put("htm", webConv);
            converters.put("xhtml", webConv);

            requestedFileTypes.add("htm");
            requestedFileTypes.add("xhtml");
        }

        // "eml" and "msg" share a single parser instance, created only if
        // at least one of them is requested.
        boolean wantsEml = requestedFileTypes.contains("eml");
        boolean wantsMsg = requestedFileTypes.contains("msg");
        if (wantsEml || wantsMsg) {
            MessageConverter emailParser = new MessageConverter();
            if (wantsEml) {
                converters.put("eml", emailParser);
            }
            if (wantsMsg) {
                converters.put("msg", emailParser);
            }
        }

        if (requestedFileTypes.contains("mht")) { /* RFC822 */
            WebArchiveConverter webArchiveParser = new WebArchiveConverter();
            converters.put("mht", webArchiveParser);
        }

        // Image extensions share one metadata converter, created on first use.
        ImageMetadataConverter imgConv = null;
        String[] imageTypes = { "jpeg", "jpg" };
        for (String img : imageTypes) {
            if (requestedFileTypes.contains(img)) {
                if (imgConv == null) {
                    imgConv = new ImageMetadataConverter();
                }
                converters.put(img, imgConv);
            }
        }

        // ALWAYS ignore our own text conversions or those of others.
        // So here all known convertable types will need a filter for their
        // conversion, e.g.,
        // pdf => ignore pdf.txt
        // doc => ignore doc.txt
        //
        for (String t : requestedFileTypes) {
            ignoreFileType(t + ".txt");
        }

        fileFilters = requestedFileTypes.toArray(new String[requestedFileTypes.size()]);
    }

    /**
     * File extensions handed to the suffix filter when a folder is walked by
     * convertFolder(). Stays null until setup() fills it from the requested
     * file types.
     */
    private String[] fileFilters = null;

    /**
     * Report the file types currently requested for conversion. Call this
     * after setup() has run, because setup() may add further supported types
     * to the set. The returned set is the live internal set, not a copy.
     *
     * @return file types as a set
     */
    public Set<String> getFileTypes() {
        return this.requestedFileTypes;
    }

    /**
     * Print the command line help to standard output. Only long options are
     * described, because the main program registers no short options.
     */
    public static void usage() {
        System.out.println();
        System.out.println("==========XText Usage=============");
        System.out
                .println("XText --input input  [--help] "
                        + "\n\t[--embed-conversion | --output folder ]   "
                        + "\n\t[--embed-children   | --export folder] "
                        + "\n\t[--clean-html]   [--strip-prefix path]   [--tika-pst]");
        System.out.println(" --help  print this message");
        System.out.println(" --input  where <input> is file or folder");
        System.out.println(" --output  where <folder> is a folder where you want to archive converted docs");
        System.out.println(" --embed-conversion embeds the saved conversions in the input folder under 'xtext/'");
        System.out.println("     NOTE: --embed-conversion has same effect as setting output to input");
        System.out.println(" --embed-children embeds the extracted children binaries in the input folder");
        System.out.println("     (NOT the conversions, the binaries from Archives, PST, etc)");
        System.out.println("     Default behavior is to extract originals to output archive.");
        System.out.println(" --export folder\tOpposite of --embed-children. Extract children and save to <folder>");
        System.out.println(" --strip-prefix path\tstrips <path> from input paths; output is saved under the remaining relative path");
        System.out.println(" --clean-html enables HTML scrubbing");
        System.out.println(" --tika-pst enables Tika-based PST conversion");
        System.out.println("========================");
    }

    /**
     * Conversion listener used by the command line variation, purely to log
     * the outcome of each conversion it is handed.
     *
     * @author ubaldino
     *
     */
    static class MainProgramListener implements ConversionListener {

        private final Logger log = LoggerFactory.getLogger(getClass());

        /**
         * Log one line per conversion: the path, whether a document object was
         * produced at all, and whether that document reports itself converted.
         *
         * @param doc
         *            the conversion result; may be null
         * @param path
         *            path of the input that was processed
         */
        @Override
        public void handleConversion(ConvertedDocument doc, String path) {
            final boolean present = (doc != null);
            final boolean converted = present && doc.is_converted;
            log.info("Converted. FILE={} Status={}, Converted={}", path, present, converted);
        }
    }

    /**
     * Command line entry point: parse the long options, configure an XText
     * instance accordingly and extract text from the given input.
     * <p>
     * Exits with status 1 after printing usage when help is requested or an
     * option cannot be parsed, and with status -1 when no input is given.
     * Conversions are saved either alongside the input (--embed-conversion)
     * or in the output folder, which defaults to "output" when not specified.
     *
     * @param args
     *            command line arguments; see usage()
     */
    public static void main(String[] args) {

        LongOpt[] options = { new LongOpt("input", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
                new LongOpt("output", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
                new LongOpt("export", LongOpt.REQUIRED_ARGUMENT, null, 'x'),
                new LongOpt("strip-prefix", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
                new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
                new LongOpt("clean-html", LongOpt.NO_ARGUMENT, null, 'H'),
                new LongOpt("embed-conversion", LongOpt.NO_ARGUMENT, null, 'e'),
                new LongOpt("embed-children", LongOpt.NO_ARGUMENT, null, 'c'),
                new LongOpt("tika-pst", LongOpt.NO_ARGUMENT, null, 'T') };

        // Only long options are registered: the short option string is empty.
        // (A former short option string was "hcex:i:o:p:".)
        gnu.getopt.Getopt opts = new gnu.getopt.Getopt("XText", args, "", options);

        String input = null;
        String output = null;
        boolean embed = false;
        boolean filter_html = false;
        boolean saveChildrenWithInput = false;
        String saveChildrenTo = null;
        String prefix = null;

        // Created before parsing because some options configure it directly.
        XText xt = new XText();

        try {
            int c;
            while ((c = opts.getopt()) != -1) {
                switch (c) {

                case 0:
                    // Long opt processed.

                    break;

                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    output = opts.getOptarg();
                    break;
                case 'H':
                    filter_html = true;
                    break;
                case 'c':
                    saveChildrenWithInput = true;
                    break;
                case 'x':
                    saveChildrenTo = opts.getOptarg();
                    break;
                case 'p':
                    prefix = opts.getOptarg();
                    break;
                case 'e':
                    embed = true;
                    System.out
                            .println("Saving conversions to Input folder.  Output folder will be ignored.");
                    break;
                case 'T':
                    xt.enableTikaPST(true);
                    break;
                case 'h':
                default:
                    // Help requested or unrecognized option.
                    XText.usage();
                    System.exit(1);
                }
            }
        } catch (Exception err) {
            XText.usage();
            System.exit(1);
        }

        if (input == null) {
            // The input is read from the --input option, not from a system property.
            System.out.println("An input argument is required, e.g., --input /Folder/...");
            System.exit(-1);
        }

        // Setting LANG=en_US in your shell.
        //
        // System.setProperty("LANG", "en_US");

        xt.enableOverwrite(true); // Given this is a test application, we will
        // overwrite every time XText is called.
        xt.enableSaving(embed || output != null);
        // Save conversions in a folder alongside the input when embedding.
        xt.getPathManager().enableSaveWithInput(embed);
        xt.enableHTMLScrubber(filter_html);
        xt.getPathManager().enableSaveChildrenWithInput(saveChildrenWithInput);

        // If user wishes to strip input paths of some prefix
        // Output will be dumped in the resulting relative path.
        xt.getPathManager().setStripPrefixPath(prefix);

        // Manage the extraction of compound files -- archives, PST mailbox file, etc.
        // ... others?
        // An explicit export folder only applies when children are not embedded with the input.
        if (!saveChildrenWithInput && saveChildrenTo != null) {
            xt.getPathManager().setExtractedChildrenCache(saveChildrenTo);
        }

        try {
            if (!embed) {
                if (output == null) {
                    // No output given: fall back to a default folder and create it.
                    output = "output";
                    xt.enableSaving(true); // Will save to output dir.
                    FileUtility.makeDirectory(output);
                    xt.getPathManager().setConversionCache(output);
                    System.out.println("Default output folder is $PWD/" + output);
                } else {
                    xt.enableSaving(true);
                    // Notice this main program requires an output path.
                    xt.getPathManager().setConversionCache(output);
                }
            }
            // Set itself to listen, as this is the main program.
            xt.setConversionListener(new MainProgramListener());

            xt.setup();
            xt.extractText(input);
        } catch (IOException ioerr) {
            XText.usage();
            ioerr.printStackTrace();
        } catch (ConfigException cfgerr) {
            XText.usage();
            cfgerr.printStackTrace();
        }
    }
}