BasicGeoTemporalProcessing.java example

Explorer

Xponents-master
- Basics
  - src
    - main
      - java
        org
        opensextant
        ConfigException.java
        data
        Country.java
        DocInput.java
        GeoBase.java
        Geocoding.java
        Language.java
        LatLon.java
        Place.java
        Taxon.java
        TextInput.java
        extraction
        ExtractionException.java
        ExtractionMetrics.java
        ExtractionResult.java
        Extractor.java
        MatchFilter.java
        NormalizationException.java
        TextEntity.java
        TextMatch.java
        processing
        Parameters.java
        ProcessingException.java
        util
        AnyFilenameFilter.java
        FileUtility.java
        GeodeticUtility.java
        GeonamesUtility.java
        TextUtils.java
    - test
      - java
        MetricsTest.java
        TestGeoUtils.java
        TestGeonamesLanguages.java
        TestGeonamesMeta.java
        TestTextUtils.java
- Examples
  - src
    - main
      - java
        org
        opensextant
        examples
        BasicGeoTemporalProcessing.java
        TaxonomicTagger.java
        WebCrawl.java
        twitter
        MicroMessage.java
        Tweet.java
        TweetGeocoder.java
- Extraction
  - src
    - main
      - java
        org
        opensextant
        extraction
        SolrMatcherSupport.java
        SolrTaggerRequest.java
        extractors
        geo
        BoundaryObserver.java
        CountryCount.java
        CountryObserver.java
        GazetteerMatcher.java
        GazetteerUpdateProcessorFactory.java
        LocationObserver.java
        PlaceCandidate.java
        PlaceCount.java
        PlaceEvidence.java
        PlaceGeocoder.java
        ScoredPlace.java
        SolrGazetteer.java
        TagFilter.java
        rules
        ContextualOrganizationRule.java
        CoordinateAssociationRule.java
        CountryRule.java
        GeocodeRule.java
        LocationChooserRule.java
        MajorPlaceRule.java
        NameCodeRule.java
        NameRule.java
        NonsenseFilter.java
        PersonNameFilter.java
        ProvinceAssociationRule.java
        xtax
        TaxonMatch.java
        TaxonMatcher.java
        output
        AbstractFormatter.java
        CSVFormatter.java
        FormatterFactory.java
        GDBFormatter.java
        GISDataFormatter.java
        GISDataModel.java
        GeoCSVFormatter.java
        KMLFormatter.java
        OpenSextantSchema.java
        ResultsFormatter.java
        ShapefileFormatter.java
        WKTFormatter.java
        processing
        ResultsUtility.java
        XtractorGroup.java
        progress
        ProgressListener.java
        ProgressMonitor.java
        ProgressMonitorBase.java
        util
        SolrProxy.java
        SolrUtil.java
    - test
      - java
        org
        opensextant
        extractors
        test
        TestExtraction.java
        TestGazFactory.java
        TestGazMatcher.java
        TestGazetteer.java
        TestGazetteerConflationKey.java
        TestPersonFilter.java
        TestPlaceGeocoder.java
        TestPlaceGeocoderLanguages.java
        TestStopFilters.java
        TestUtils.java
        TestXTax.java
- MapReduce
  - src
    - main
      - java
        org
        opensextant
        mapreduce
        AbstractMapper.java
        GeoTaggerMapper.java
        KeywordTaggerMapper.java
        Log4JUtils.java
        LoggingUtilities.java
        XponentsTaggerDemo.java
    - test
      - java
        org
        apache
        solr
        core
        CoreContainer.java
- Patterns
  - src
    - main
      - java
        org
        opensextant
        extractors
        flexpat
        AbstractFlexPat.java
        PatternTestCase.java
        RegexPattern.java
        RegexPatternManager.java
        TextMatchResult.java
        poli
        PatternsOfLife.java
        PoliMatch.java
        PoliPatternManager.java
        TestCase.java
        data
        MACAddress.java
        Money.java
        TelephoneNumber.java
        xcoord
        DMSFilter.java
        DMSOrdinate.java
        GeocoordMatch.java
        GeocoordMatchFilter.java
        GeocoordNormalization.java
        GeocoordPattern.java
        GeocoordPrecision.java
        GeocoordTestCase.java
        Hemisphere.java
        MGRSFilter.java
        MGRSParser.java
        PatternManager.java
        PrecisionScales.java
        UTMParser.java
        XConstants.java
        XCoord.java
        xtemporal
        DateMatch.java
        DateNormalization.java
        DateTimePattern.java
        PatternManager.java
        TestCase.java
        XTConstants.java
        XTemporal.java
    - test
      - java
        org
        opensextant
        extractors
        test
        DateNormalizationTest.java
        PrecisionScalesTest.java
        TestPoLi.java
        TestPoLiReporter.java
        TestXCoord.java
        TestXCoordReporter.java
        TestXTemporal.java
        TestXTemporalReporter.java
- XText
  - examples
  - src
    - main
      - java
        org
        opensextant
        xtext
        Content.java
        ConversionListener.java
        ConvertedDocument.java
        Converter.java
        ExclusionFilter.java
        PathManager.java
        XText.java
        collectors
        ArchiveNavigator.java
        CollectionListener.java
        Collector.java
        mailbox
        DefaultMailCrawl.java
        MailClient.java
        MailConfig.java
        NTLMAuth.java
        OutlookPSTCrawler.java
        sharepoint
        DefaultSharepointCrawl.java
        SPLink.java
        SharepointClient.java
        web
        CrawlFilter.java
        DefaultWebCrawl.java
        HyperLink.java
        WebClient.java
        converters
        ConverterAdapter.java
        DefaultConverter.java
        EmbeddedContentConverter.java
        ImageMetadataConverter.java
        MessageConverter.java
        TextTranscodingConverter.java
        TikaHTMLConverter.java
        WebArchiveConverter.java
    - test
      - java
        org
        opensextant
        xtext
        converters
        test
        MessageConverterTest.java
        test
        Decomposer.java
        ImageGroper.java
        MailClientTest.java
        SharepointClientTest.java
        SharepointCrawlTest.java
        TestPST.java
        TestSPLinks.java
        TestTikaPST.java
        Tests.java
        TextTranscodingTest.java
        WebLinkTest.java
- Xlayer
  - src
    - main
      - java
        org
        opensextant
        xlayer
        Transforms.java
        XlayerClient.java
        server
        RequestParameters.java
        TaggerResource.java
        XlayerApp.java
        xgeo
        XlayerRestlet.java
        XlayerServer.java
        XponentsGeotagger.java
    - test
      - java
        XlayerClientTest.java

/**
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 */
package org.opensextant.examples;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.opensextant.ConfigException;
import org.opensextant.extraction.ExtractionMetrics;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.extractors.xtemporal.XTemporal;
import org.opensextant.output.AbstractFormatter;
import org.opensextant.output.FormatterFactory;
import org.opensextant.output.ResultsFormatter;
import org.opensextant.processing.Parameters;
import org.opensextant.processing.ProcessingException;
import org.opensextant.processing.XtractorGroup;
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConversionListener;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.XText;
import org.slf4j.LoggerFactory;

/**
 * <pre>
 * A default illustration of using Xponent xtractors for geo and temporal
 * extraction.  This demo shows how to:
 *
 *  setup some extractors
 *  crawl data
 *  process data
 *  output in particular formats.
 *
 * All showing the most basic aspects of the OpenSextant and Xponents APIs
 *
 * NOTE: this is a variation on OpenSextant v1.4 "Runner" app.
 *
 *</pre>
 *
 * @author ubaldino
 */
public class BasicGeoTemporalProcessing extends XtractorGroup implements ConversionListener {

    /* Job parameters; handed to the geocoder and to every output formatter in setup(). */
    private Parameters params = new Parameters();
    /* Document-to-text converter; created and configured in setup(), driven by run(). */
    protected XText converter;
    /* # of documents */
    private int total_docs = 0;
    /* # of raw bytes: running sum of ConvertedDocument.filesize */
    private long total_rawbytes = 0;
    /* # of plain-text characters: running sum of ConvertedDocument.buffer lengths */
    private long total_size = 0;
    /* Elapsed-time metrics for the conversion and processing phases; fed by handleConversion(). */
    private ExtractionMetrics conversionMetric = new ExtractionMetrics("doc-conversion");
    private ExtractionMetrics processingMetric = new ExtractionMetrics("doc-processing");
    /* Copied onto each formatter's overwrite flag in setup(). */
    private boolean overwriteOutput = true;

    /**
     * Creates the processor and assigns {@code log} a logger named after this class.
     * No extractors, converter or formatters are created here; call
     * {@link #setup(String, List, String, String)} before {@link #run()}.
     *
     * NOTE(review): {@code log} is not declared in this class -- presumably it is
     * inherited from XtractorGroup; confirm.
     */
    public BasicGeoTemporalProcessing() {
        log = LoggerFactory.getLogger(BasicGeoTemporalProcessing.class);
    }

    /**
     * End-of-job cleanup.  Invokes {@code cleanupAll()} first, then calls
     * {@code finish()} on each registered output formatter in turn.
     */
    public void shutdown() {
        this.cleanupAll();

        for (final ResultsFormatter fmt : this.formatters) {
            fmt.finish();
        }
    }

    /**
     * Configures, for a single job, the extractors (PlaceGeocoder and XTemporal),
     * the XText document converter, and one output formatter per requested format.
     *
     * Ideally you should separate your one-time initialization steps, configuring your extractors
     * apart from the repetitive steps of setting up Jobs and Inputs.   Outputs you might setup once
     * for the entire JVM session, or it may be something you do periodically.  In summary:
     *
     * configure separately:
     *   a) extractors, converters
     *   b) job inputs and parameters
     *   c) output formatters
     *   d) other resources, e.g., filters
     *
     * @param inFile     input file or folder; checked by validateParameters()
     * @param outFormats output format names; may be null, in which case no formatter is created
     * @param outFile    output file or folder; checked by validateParameters()
     * @param tempDir    folder used as the XText conversion cache; when null, saving of
     *                   converted documents is disabled instead
     * @throws ConfigException     if the document converter fails to start (wraps the IOException
     *                             from converter.setup()); NOTE(review): the extractor configure()
     *                             calls presumably raise it as well -- confirm
     * @throws ProcessingException if validateParameters() rejects the arguments, or as raised by
     *                             createFormatter()
     * @throws IOException         as declared by createFormatter()
     */
    public void setup(String inFile, List<String> outFormats, String outFile, String tempDir)
            throws ConfigException, ProcessingException, IOException {

        // createFormatter() refuses Parameters that are still flagged as default.
        params.isdefault = false;

        // On failure the reason has been collected in runnerMessage.
        if (!validateParameters(inFile, outFormats, outFile, tempDir, params)) {
            throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString());
        }

        // If you are dead-sure you want only coordinates from text, then just use XCoord.
        // Otherwise PlaceGeocoder (below) is used for both coords + names.
        // 
        //XCoord xcoord = new XCoord();
        //xcoord.configure();
        //this.addExtractor(xcoord);

        // Testing only
        params.tag_places = true;
        params.tag_coordinates = true;
        params.output_countries = false;

        // Parameters are handed to the geocoder before it is configured.
        PlaceGeocoder geocoder = new PlaceGeocoder();
        geocoder.enablePersonNameMatching(true);
        geocoder.setParameters(params);
        geocoder.configure();
        this.addExtractor(geocoder);

        XTemporal xtemp = new XTemporal();
        xtemp.configure();
        this.addExtractor(xtemp);

        converter = new XText();

        // Saving is enabled here but switched off again below when no tempDir is given.
        converter.enableHTMLScrubber(false);
        converter.enableSaving(true);
        converter.enableOverwrite(false);
        // Each converted document is reported back to handleConversion() on this object.
        converter.setConversionListener(this);

        // Complications:  Where do we save converted items?
        // Developer should change this based on actual environment, paths, perms, etc.
        // Using a "temp" folder as XText cache or no cache at all... 
        // This is for illustration purposes only.
        // 
        if (tempDir != null) {
            converter.getPathManager().setConversionCache(tempDir);
        } else {
            converter.enableSaving(false);
        }

        try {
            converter.setup();
        } catch (IOException ioerr) {
            throw new ConfigException("Document converter could not start", ioerr);
        }

        this.params.inputFile = inFile.trim();
        this.params.outputFile = outFile.trim();

        // One formatter per requested format; each is registered and then started under the job name.
        if (outFormats != null) {
            for (String fmt : outFormats) {
                params.addOutputFormat(fmt);
                AbstractFormatter formatter = createFormatter(fmt, params);
                formatter.overwrite = overwriteOutput;
                this.addFormatter(formatter);

                //if (formatter instanceof CSVFormatter) {
                //    formatter.addField(OpenSextantSchema.FILEPATH.getName());
                //    formatter.addField(OpenSextantSchema.MATCH_TEXT.getName());
                // }
                formatter.start(params.getJobName());
            }
        }

    }

    /**
     * Builds an output formatter for the named format and binds it to the job parameters.
     * The output file name is the job name followed by the formatter's own extension.
     *
     * @param outputFormat format name passed to FormatterFactory
     * @param plist        job parameters; must not be flagged as default
     * @return the formatter, with parameters and output file name set
     * @throws IOException         declared for callers; not raised directly by this method
     * @throws ProcessingException if {@code plist} is still default, or the factory
     *                             returns no formatter for {@code outputFormat}
     */
    public static AbstractFormatter createFormatter(String outputFormat, Parameters plist)
            throws IOException, ProcessingException {

        // Callers must have customized the parameters before asking for a formatter.
        if (plist.isdefault) {
            throw new ProcessingException("Caller is required to use non-default Parameters; "
                    + "\nat least set the output options, folder, jobname, etc.");
        }

        final Object produced = FormatterFactory.getInstance(outputFormat);
        if (produced == null) {
            throw new ProcessingException("Wrong formatter?");
        }

        final AbstractFormatter result = (AbstractFormatter) produced;
        result.setParameters(plist);

        final String filename = plist.getJobName() + result.outputExtension;
        result.setOutputFilename(filename);

        return result;
    }

    /**
     * =============================================== Pipeline mechanics: track
     * # of docs, raw bytes, plain/text chars.
     * ===============================================
     */
    /**
     * Statusing metrics: # of documents processed so far.
     *
     * @return count of converted documents that handleConversion() has received
     *         with a non-null document
     */
    public int getCurrentDocCount() {
        return total_docs;
    }

    /**
     * Statusing metrics: # of raw bytes processed so far.
     *
     * @return running sum of the {@code filesize} of every document seen by handleConversion()
     */
    public long getCurrentByteCount() {
        return total_rawbytes;
    }

    /**
     * Statusing metrics: # of plain text characters processed so far.
     *
     * @return running sum of the text buffer length of every document seen by handleConversion()
     */
    public long getCurrentTextCharCount() {
        return total_size;
    }

    /**
     * Executes the job prepared by setup(): logs the request, starts the timers,
     * and hands the configured input path to the converter.  Each converted
     * document is delivered back to {@code handleConversion()}, where extraction
     * and formatting take place.  Memory usage is logged once at the end.
     *
     * @throws ProcessingException declared for callers
     * @throws IOException         declared for callers
     * @throws ConfigException     declared for callers
     */
    public void run() throws ProcessingException, IOException, ConfigException {

        printRequest();
        log.info("Starting document ingest");

        // Both clocks start from the same instant.
        final long began = System.currentTimeMillis();
        startTime = began;
        prevTime = began;

        // All input and processing happens inside this one call.
        converter.extractText(this.params.inputFile);

        reportMemory();
        log.info("Finished all processing");
    }

    /* Clock reading (ms) taken when run() began. */
    long startTime = 0;
    /* Clock reading (ms) of the latest timing checkpoint; advanced in handleConversion(). */
    long prevTime = 0;

    /**
     * XText conversion callback: invoked with each document the converter produces.
     *
     * A null document is logged as a conversion failure and skipped.  Otherwise the
     * running totals (raw bytes, document count, text characters) are updated, the time
     * since the previous checkpoint is charged to the conversion metric, the document is
     * passed to {@code processAndFormat()}, and the time that took is charged to the
     * processing metric.  Memory usage is logged after every 100th document.
     *
     * @param txtdoc the converted document, or null if conversion failed
     * @param fpath  path of the source file; used only in the failure log message
     */
    public void handleConversion(ConvertedDocument txtdoc, String fpath) {
        if (txtdoc == null) {
            log.error("NOTE: Document could not be converted FILE={}", fpath);
            return;
        }

        // Running totals.
        total_rawbytes += txtdoc.filesize;
        total_docs++;
        total_size += txtdoc.buffer.length();

        // Everything since the last checkpoint counts as conversion time.
        final long convertedAt = System.currentTimeMillis();
        conversionMetric.addTime(convertedAt - prevTime);
        prevTime = convertedAt;

        this.processAndFormat(txtdoc);

        // Everything since conversion finished counts as processing time.
        final long processedAt = System.currentTimeMillis();
        processingMetric.addTime(processedAt - prevTime);
        prevTime = processedAt;

        final boolean atCheckpoint = (total_docs % 100) == 0;
        if (atCheckpoint) {
            reportMemory();
        }
    }

    /**
     * Logs the JVM memory currently in use (total minus free), in kilobytes.
     */
    public void reportMemory() {
        final Runtime jvm = Runtime.getRuntime();
        final long usedBytes = jvm.totalMemory() - jvm.freeMemory();
        final int usedKb = (int) (usedBytes / 1024);
        log.info("CURRENT MEM USAGE(K)=" + usedKb);
    }

    /**
     * Logs the conversion and processing timing metrics, each under its own banner line.
     */
    public void reportMetrics() {
        log.info("===============\nDOCUMENT CONVERSION");
        log.info(new StringBuilder("\t").append(conversionMetric.toString()).toString());

        log.info("===============\nDOCUMENT PROCESSING");
        log.info(new StringBuilder("\t").append(processingMetric.toString()).toString());
    }

    /* Command-line state: written by parseCommandLine(); all but _outFormat are read by main(). */
    /* -i : input file or folder */
    private static String _inFile = null;
    /* -o : output file or folder */
    private static String _outFile = null;
    /* -f : raw option value, before splitting */
    private static String _outFormat = null;
    /* -f : option value split on "," */
    private static List<String> _outFormats = null;
    /* -t : temporary storage folder */
    private static String _tempDir = null;

    /**
     * Reads the command-line options into the static {@code _inFile}, {@code _outFormat},
     * {@code _outFormats}, {@code _outFile} and {@code _tempDir} fields.
     * {@code -h}, or any option not listed below, prints the usage text and exits the JVM
     * with status -1.
     *
     * @param args the raw command-line arguments
     */
    private static void parseCommandLine(String[] args) {
        final gnu.getopt.Getopt parser = new gnu.getopt.Getopt("BasicGeoTemp", args, "hi:f:o:t:");

        for (int opt = parser.getopt(); opt != -1; opt = parser.getopt()) {
            if (opt == 'i') {
                // -i inputFile = path to file or directory of files to be processed
                _inFile = parser.getOptarg();
            } else if (opt == 'f') {
                // -f outputFormat = the desired output format(s), comma-separated
                _outFormat = parser.getOptarg();
                _outFormats = TextUtils.string2list(_outFormat.trim(), ",");
            } else if (opt == 'o') {
                // -o outputFile = the path to output file
                _outFile = parser.getOptarg();
            } else if (opt == 't') {
                // -t tempDir = the path to temp directory
                _tempDir = parser.getOptarg();
            } else {
                // -h and anything unrecognized
                printHelp();
                System.exit(-1);
            }
        }
    }

    /**
     * Logs a short summary of the job: input path, output formats and output folder,
     * as currently held in the job parameters.
     */
    protected void printRequest() {
        log.info("----------------- REQUEST -----------------");
        log.info(new StringBuilder("Input file: ").append(params.inputFile).toString());
        log.info(new StringBuilder("Output format: ").append(params.getOutputFormats()).toString());
        log.info(new StringBuilder("Output location: ").append(params.outputDir).toString());
    }

    /**
     * Writes the command-line usage summary to standard output, one line per option.
     */
    protected static void printHelp() {

        final String[] usage = {
            "Options:",
            "\t-i inputFile = path to file or directory of files to be processed",
            "\t-f outputFormat = the desired output format",
            "\t-o outputFile = the path to output file",
            "\t-t tempDir = the path to the temporary storage directory"
        };

        for (int i = 0; i < usage.length; i++) {
            System.out.println(usage[i]);
        }
    }

    /* Reason for the latest validation failure; reset by validateParameters(), read by setup(). */
    private StringBuilder runnerMessage = new StringBuilder();

    /**
     * Check that the input parameters are valid and complete.
     *
     * On failure the reason is left in {@code runnerMessage}.  On success the job
     * name and {@code outputDir} of {@code plist} have been set: when
     * {@code outPath} is an existing directory a timestamped default job name is
     * used, otherwise the job name is the base name of {@code outPath} and the
     * output folder is its parent (or the current working directory if it has none).
     *
     * @param inPath     input file or folder; must name something that exists
     * @param outFormats requested output formats; may be null (no format check)
     * @param outPath    output file or folder; its folder must already exist
     * @param tempDir    temporary folder; required only when {@code inPath} is an archive
     * @param plist      job parameters that receive the job name and output folder
     * @return true if parameters and defaults suffice; false otherwise.
     */
    public boolean validateParameters(String inPath, List<String> outFormats, String outPath,
            String tempDir, Parameters plist) {

        runnerMessage = new StringBuilder();

        // Both paths are mandatory.  A missing input path used to fall through to
        // inPath.trim() and fail with a NullPointerException instead of a message.
        if (inPath == null || inPath.trim().isEmpty()) {
            runnerMessage.append("Please specify an Input file or folder");
            return false;
        }
        if (outPath == null || outPath.trim().isEmpty()) {
            runnerMessage.append("Please specify an Output file or folder");
            return false;
        }

        inPath = inPath.trim();
        outPath = outPath.trim();

        // Unexpanded Ant-style placeholders.  Checked before touching the file system so
        // that this message is not masked by the "does not exist" one below.
        if (inPath.startsWith("$") || outPath.startsWith("$")) {
            runnerMessage.append("Invalid input/output -- Ant style arguments are null");
            return false;
        }

        // Make sure input file exists
        if (!new File(inPath).exists()) {
            runnerMessage.append("Input file " + inPath + " does not exist");
            return false;
        }

        // Check output format
        if (outFormats != null) {
            for (String outFormat : outFormats) {
                if (!FormatterFactory.isSupported(outFormat)) {
                    runnerMessage.append("Unrecognized output format: " + outFormat);
                    return false;
                }
            }
        }

        // Verify user has specified a directory for unpacking an archive
        if (FileUtility.isArchiveFile(inPath) && tempDir == null) {
            runnerMessage
                    .append("A directory for temporary storage must be provided for unpacking Zip and other archive files");
            return false;
        }

        // Split the output path into a destination folder and a job name.
        File container = new File(outPath);
        File destDir;
        // This is the OUTPUT location; the message previously mislabeled it as INPUT.
        log.info("Working off OUTPUT=" + container.getAbsolutePath());

        if (container.isDirectory()) {
            destDir = container;
            try {
                // DEFAULT file name.
                plist.setJobName("OpenSextant_Output_" + Parameters.getJobTimestamp());
            } catch (Exception fmterr) {
                runnerMessage
                        .append("Failed to invoke the requested format to create a default output file");
                return false;
            }
        } else {
            destDir = container.getParentFile();
            if (destDir == null) {
                // A bare file name has no parent: write next to the working directory.
                destDir = new File(".");
                log.info("Saving output to current working directory");
            }
            plist.setJobName(FilenameUtils.getBaseName(container.getName()));
        }

        if (!destDir.exists()) {
            runnerMessage.append("Destination folder must exist, DIR=" + destDir.getAbsolutePath());
            return false;
        }

        plist.outputDir = destDir.getAbsolutePath();

        return true;
    }

    /**
     * Runs Xponent Example from the command line. Command line options are:
     * <ul>
     * <li>
     * <code>-i </code><i>inputFile</i> Path to file or directory of files to be
     * processed
     * </li><li>
     * <code>-f </code><i>outputFormat</i> The desired output format; several
     * formats may be given separated by commas
     * </li><li>
     * <code>-o </code><i>outputFile</i> Path to output file
     * </li><li>
     * <code>-t </code><i>tempDir</i> Path to the temporary storage directory,
     * if one is required
     * </li><li>
     * <code>-h </code> Print usage and exit
     * </li>
     * </ul><p>
     * The process exits with status 0 on success and 1 if setup, processing or
     * shutdown failed.  {@code shutdown()} is attempted in either case so that
     * formatters that were started get a chance to finish their output.
     *
     * @param args command-line arguments, see above
     */
    public static void main(String[] args) {

        System.out.println("Parsing Commandline");
        parseCommandLine(args);

        int exitCode = 0;
        BasicGeoTemporalProcessing runner = null;
        try {
            runner = new BasicGeoTemporalProcessing();

            runner.setup(_inFile, _outFormats, _outFile, _tempDir);
            runner.run();
            // Success.
        } catch (Exception err) {
            err.printStackTrace();
            // Report the failure to the caller instead of exiting with 0.
            exitCode = 1;
        } finally {
            // Always release resources, including after a failed setup() or run().
            if (runner != null) {
                try {
                    runner.shutdown();
                } catch (RuntimeException shutdownErr) {
                    // Cleanup of a partially initialized job must not hide the original error.
                    shutdownErr.printStackTrace();
                    exitCode = 1;
                }
            }
        }
        System.exit(exitCode);
    }
}