/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.document.DateTools;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;

/**
 * Creates the basic FieldWritable objects. The basic fields are the main
 * fields used in indexing segments. Many other field jobs rely on the urls
 * being present in the basic fields output to create their own fields for
 * indexing.
 *
 * Basic fields are extracted from segments. Only urls that were successfully
 * fetched and parsed are converted. This job also implements a portion of the
 * redirect logic. If a url has an orig (redirect) url as well, both urls are
 * measured against their link analysis scores and the higher scoring one is
 * used for display in the index.
 * This ensures that we index content under the best, most popular url, which
 * is most often the one users expect.
 *
 * The BasicFields tool can accept one or more segments to convert to fields.
 * If multiple segments have overlapping content, only the most recently
 * fetched content is converted.
 */
public class BasicFields
  extends Configured
  implements Tool {

  public static final Log LOG = LogFactory.getLog(BasicFields.class);

  /**
   * Runs the Extractor job. Extracts basic fields from segments.
   *
   * @param nodeDb The node database.
   * @param segment A single segment to process.
   * @param outputDir The extractor output.
   *
   * @throws IOException If an error occurs while processing the segment.
   */
  private void runExtractor(Path nodeDb, Path segment, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting extractor");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    LOG.info("BasicFields: extractor adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment,
      CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    FileInputFormat.addInputPath(job, nodeDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished extractor");
    }
  }

  /**
   * Runs the Flipper job. Flipper is the first of a two part job to implement
   * redirect logic.
   *
   * @param basicFields The basic fields temporary output.
   * @param nodeDb The node database.
   * @param outputDir The flipper output.
   *
   * @throws IOException If an error occurs while processing.
   */
  private void runFlipper(Path basicFields, Path nodeDb, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting flipper");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, nodeDb);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Flipper.class);
    job.setReducerClass(Flipper.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished flipper");
    }
  }
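
  // These run* jobs are chained per segment by createFields() below:
  // extractor -> flipper -> scorer, followed by a single merger pass across
  // the per-segment outputs.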

  /**
   * Runs the Scorer job. Scorer is the second of a two part job to implement
   * redirect logic.
   *
   * @param basicFields The basic fields temporary output.
   * @param links The temporary output holding urls and any redirects.
   * @param outputDir The scorer output.
   *
   * @throws IOException If an error occurs while processing.
   */
  private void runScorer(Path basicFields, Path links, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting scorer");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Scorer.class);
    job.setReducerClass(Scorer.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished scorer");
    }
  }

  /**
   * Runs the Merger job. Merger ensures that the most recent set of fields
   * for any given url is collected.
   *
   * @param basicFields The basic fields final output.
   * @param outputDir The merger output.
   *
   * @throws IOException If an error occurs while processing.
   */
  private void runMerger(Path[] basicFields, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting merger");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    for (Path basic : basicFields) {
      FileInputFormat.addInputPath(job, basic);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setReducerClass(Merger.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished merger");
    }
  }

  /**
   * Extracts basic fields from a single segment.
   */
  private static class Extractor
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldsWritable> {

    private int MAX_TITLE_LENGTH;
    private Configuration conf;

    /**
     * Default constructor.
     */
    public Extractor() {
    }

    /**
     * Configurable constructor.
     */
    public Extractor(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
    }

    public void close() {
    }

    /**
     * Wraps values in ObjectWritable.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }
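
    // For each url the reduce below sees the values co-grouped from the
    // inputs wired up in runExtractor: CrawlDatum entries from crawl_fetch,
    // the ParseData and ParseText, and the Node record from the node database.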

    /**
     * Creates basic fields from a single segment.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      Node nodeDb = null;
      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
      ParseData parseData = null;
      ParseText parseText = null;
      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();

      // assign values, url must be successfully fetched and parsed
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object value = objWrite.get();
        if (value instanceof CrawlDatum) {
          CrawlDatum datum = (CrawlDatum)value;
          if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            fetchDatums.add(datum);
          }
        }
        else if (value instanceof Node) {
          nodeDb = (Node)value;
        }
        else if (value instanceof ParseData
          && ((ParseData)value).getStatus().isSuccess()) {
          parseData = (ParseData)value;
        }
        else if (value instanceof ParseText) {
          parseText = (ParseText)value;
        }
      }

      // if not successfully fetched and parsed then stop processing
      int numDatums = fetchDatums.size();
      if (numDatums == 0 || nodeDb == null || parseText == null
        || parseData == null) {
        return;
      }

      // get the most recent fetch time; duplicates inside a single segment
      // are usually due to redirects
      CrawlDatum fetchDatum = null;
      long mostRecent = 0L;
      for (CrawlDatum cur : fetchDatums) {
        long fetchTime = cur.getFetchTime();
        if (fetchDatum == null || fetchTime > mostRecent) {
          fetchDatum = cur;
          mostRecent = fetchTime;
        }
      }

      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);

      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
      String url = key.toString();
      String fieldUrl = (reprUrl != null) ? reprUrl : url;
      String host = URLUtil.getHost(fieldUrl);
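
      // A note on the FieldWritable constructors below: judging from the
      // inline comments, the three boolean arguments appear to be
      // (indexed, stored, tokenized) in that order; confirm against the
      // FieldWritable constructor before relying on this.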

      // add segment, used to map from merged index back to segment files
      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(segField);

      // add digest, used by dedup
      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(digestField);

      // url is both stored and indexed, so it's both searchable and returned
      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
        true, true, true));
      fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
        false, true, false));

      if (reprUrl != null) {
        // also store the original url as both stored and indexed
        fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
          FieldType.CONTENT, true, true, true));
      }

      if (host != null) {
        // add host as un-stored, indexed and tokenized
        FieldWritable hostField = new FieldWritable(Fields.HOST, host,
          FieldType.CONTENT, true, false, true);
        fieldsList.add(hostField);

        // add site as un-stored, indexed and un-tokenized
        FieldWritable siteField = new FieldWritable(Fields.SITE, host,
          FieldType.CONTENT, true, false, false);
        fieldsList.add(siteField);
      }

      // content is indexed, so that it's searchable, but not stored in index
      fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
        FieldType.CONTENT, true, false, true));

      // title
      String title = parse.getData().getTitle();
      if (title.length() > MAX_TITLE_LENGTH) {
        // truncate title if needed
        title = title.substring(0, MAX_TITLE_LENGTH);
      }

      // add title indexed and stored so that it can be displayed
      fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
        true, true, true));

      // add cached content/summary display policy, if available
      String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
      if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
        fieldsList.add(new FieldWritable(Fields.CACHE, caching,
          FieldType.CONTENT, false, true, false));
      }

      // add timestamp when fetched, for deduplication
      fieldsList.add(new FieldWritable(Fields.TSTAMP, DateTools.timeToString(
        fetchDatum.getFetchTime(), DateTools.Resolution.MILLISECOND),
        FieldType.CONTENT, false, true, false));

      FieldsWritable fields = new FieldsWritable();
      fields.setFieldsList(fieldsList);
      output.collect(key, fields);
    }
  }

  /**
   * Runs the first part of redirect logic. Breaks out fields if a page
   * contains a redirect.
   */
  public static class Flipper
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, LinkDatum> {

    private JobConf conf;

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }
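
    // A sketch of the flip, using hypothetical urls: suppose the fields keyed
    // by http://old.example.com/ hold URL=http://new.example.com/ (the
    // representative url) and ORIG_URL=http://old.example.com/. The map below
    // re-emits the fields key under both urls so that each candidate url can
    // be joined with its Node score in the reduce.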

    /**
     * Breaks out the collection of fields for the url and its redirect if one
     * exists.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objUrl = new ObjectWritable();
      objUrl.set(key);

      if (value instanceof FieldsWritable) {

        // collect the fields for the url
        FieldsWritable fields = (FieldsWritable)value;
        FieldWritable url = fields.getField(Fields.URL);
        FieldWritable orig = fields.getField(Fields.ORIG_URL);
        output.collect(new Text(url.getValue()), objUrl);

        // collect for the orig / redirect url if one exists
        if (orig != null) {
          output.collect(new Text(orig.getValue()), objUrl);
        }
      }
      else {
        // anything else passes through
        ObjectWritable objWrite = new ObjectWritable();
        objWrite.set(value);
        output.collect(key, objWrite);
      }
    }

    /**
     * Collects redirect and original links for a given url key. This will be
     * used in the Scorer to handle redirects.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      Node node = null;
      List<String> urls = new ArrayList<String>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof Node) {
          node = (Node)obj;
        }
        else if (obj instanceof Text) {
          urls.add(obj.toString());
        }
      }

      if (urls.size() > 0) {
        float score = (node != null) ? node.getInlinkScore() : 0.0f;
        for (String url : urls) {
          LinkDatum datum = new LinkDatum(key.toString());
          datum.setScore(score);
          output.collect(new Text(url), datum);
        }
      }
    }
  }

  /**
   * The Scorer job sets the boost field from the NodeDb score.
   *
   * It also runs the second part of redirect logic, determining the highest
   * scoring url for pages that contain redirects.
   */
  public static class Scorer
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldsWritable> {

    private JobConf conf;

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    /**
     * Wraps values in ObjectWritable.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }
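
    // A sketch of the decision in the reduce, with made-up scores: if the URL
    // candidate scores 0.4 and the ORIG_URL candidate scores 0.7, the field
    // names are swapped so the orig url becomes the display url, HOST and
    // SITE are rebuilt from it, 0.7 becomes the linkrank boost, and the
    // output is keyed by that url.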

    /**
     * Sets a document boost field from the NodeDb and determines the best
     * scoring url for pages that have redirects. Uses the highest scoring url
     * as the display url in the index.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      FieldsWritable fields = null;
      List<LinkDatum> datums = new ArrayList<LinkDatum>();

      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof FieldsWritable) {
          fields = (FieldsWritable)obj;
        }
        else if (obj instanceof LinkDatum) {
          datums.add((LinkDatum)obj);
        }
      }

      int numDatums = datums.size();
      if (fields != null && numDatums > 0) {

        // if no redirect for the page just assign the linkrank boost
        List<FieldWritable> fieldsList = fields.getFieldsList();
        if (numDatums == 1) {
          float linkRank = datums.get(0).getScore();
          fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
            FieldType.BOOST, linkRank));
          output.collect(new Text(key), fields);
        }
        else {

          // get both the url and any redirect url stored
          FieldWritable url = fields.getField(Fields.URL);
          FieldWritable orig = fields.getField(Fields.ORIG_URL);
          float urlScore = 0.0f;
          float origScore = 0.0f;

          // get the scores for each
          for (LinkDatum datum : datums) {
            String curUrl = datum.getUrl();
            if (curUrl.equals(url.getValue())) {
              urlScore = datum.getScore();
            }
            else if (curUrl.equals(orig.getValue())) {
              origScore = datum.getScore();
            }
          }

          // if the highest scoring url is not the one currently displayed in
          // the index under the current basic fields, then switch it
          String urlKey = url.getValue();
          float linkRank = urlScore;
          if (origScore > urlScore) {
            url.setName(Fields.ORIG_URL);
            orig.setName(Fields.URL);

            // We also need to fix the host because we are changing urls
            String host = URLUtil.getHost(orig.getValue());
            if (host != null) {
              fieldsList.remove(fields.getField(Fields.SITE));
              fieldsList.remove(fields.getField(Fields.HOST));
              fieldsList.add(new FieldWritable(Fields.HOST, host,
                FieldType.CONTENT, true, false, true));
              fieldsList.add(new FieldWritable(Fields.SITE, host,
                FieldType.CONTENT, true, false, false));
            }
            linkRank = origScore;
            urlKey = orig.getValue();
          }

          // create the final document boost field
          fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
            FieldType.BOOST, linkRank));
          output.collect(new Text(urlKey), fields);
        }
      }
    }
  }

  /**
   * Merges the output of all segment fields, collecting only the most recent
   * set of fields for any given url.
   */
  public static class Merger
    extends Configured
    implements Reducer<Text, FieldsWritable, Text, FieldsWritable> {

    private JobConf conf;

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }
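
    // The reduce below compares TSTAMP values numerically. The Extractor
    // writes TSTAMP with DateTools at millisecond resolution, a digits-only
    // date string (yyyyMMddHHmmssSSS), so parsing it as a long and keeping
    // the maximum picks the most recently fetched set of fields.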

    /**
     * Collects the most recent set of fields for any url.
     */
    public void reduce(Text key, Iterator<FieldsWritable> values,
      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      List<FieldsWritable> fields = new ArrayList<FieldsWritable>();

      // collects the various sets of fields
      while (values.hasNext()) {
        fields.add((FieldsWritable)WritableUtils.clone(values.next(), conf));
      }

      // if there is only one set of fields for a given url, pass it through
      FieldsWritable outFields = null;
      int numFields = fields.size();
      if (numFields == 1) {
        outFields = fields.get(0);
      }
      else if (numFields > 1) {

        // more than one set of fields means the url has been fetched more
        // than once; collect only the most recent set of fields
        FieldsWritable mostRecent = null;
        long recentTime = 0L;
        for (int i = 0; i < numFields; i++) {
          FieldsWritable cur = fields.get(i);
          String tStampStr = cur.getField(Fields.TSTAMP).getValue();
          long timestamp = Long.parseLong(tStampStr);
          if (mostRecent == null || recentTime < timestamp) {
            recentTime = timestamp;
            mostRecent = cur;
          }
        }
        outFields = mostRecent;
      }

      output.collect(key, outFields);
    }
  }

  /**
   * Runs the BasicFields jobs for every segment and aggregates and filters
   * the output to create a final database of FieldWritable objects.
   *
   * @param nodeDb The node database.
   * @param segments The array of segments to process.
   * @param output The BasicFields output.
   *
   * @throws IOException If an error occurs while processing the segments.
   */
  public void createFields(Path nodeDb, Path[] segments, Path output)
    throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path tempOutput = new Path(output.toString() + "-temp");
    fs.mkdirs(tempOutput);

    int numSegments = segments.length;
    Path[] basicFields = new Path[numSegments];

    // one pass per segment to extract and create the basic fields
    for (int i = 0; i < numSegments; i++) {
      Path segment = segments[i];
      Path segOutput = new Path(tempOutput, String.valueOf(i));
      Path tempBasic = new Path(tempOutput, "basic-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
      Path tempFlip = new Path(tempOutput, "flip-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
      runExtractor(nodeDb, segment, tempBasic);
      runFlipper(tempBasic, nodeDb, tempFlip);
      runScorer(tempBasic, tempFlip, segOutput);
      fs.delete(tempBasic, true);
      fs.delete(tempFlip, true);
      basicFields[i] = segOutput;
    }

    // merge all of the segments and delete any temporary output
    runMerger(basicFields, output);
    fs.delete(tempOutput, true);
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new BasicFields(),
      args);
    System.exit(res);
  }
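
  // Example invocation, e.g. via bin/nutch with the full class name (the
  // paths here are illustrative and depend on your crawl layout):
  //
  //   bin/nutch org.apache.nutch.indexer.field.BasicFields \
  //     -webgraphdb crawl/webgraphdb \
  //     -output crawl/fields/basic \
  //     -segment crawl/segments/20090214123000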

  /**
   * Runs the BasicFields tool.
   */
  public int run(String[] args) throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
      "show this help message").create("help");
    Option outputOpts = OptionBuilder.withArgName("output").hasArg()
      .withDescription("the output index directory").create("output");
    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
      .withDescription("the webgraphdb to use").create("webgraphdb");
    Option segOpts = OptionBuilder.withArgName("segment").hasArgs()
      .withDescription("the segment(s) to use").create("segment");
    options.addOption(helpOpts);
    options.addOption(webGraphOpts);
    options.addOption(segOpts);
    options.addOption(outputOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
        || !line.hasOption("output") || !line.hasOption("segment")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("BasicFields", options);
        return -1;
      }

      // get the command line options and all of the segments
      String webGraphDb = line.getOptionValue("webgraphdb");
      String output = line.getOptionValue("output");
      String[] segments = line.getOptionValues("segment");
      Path[] segPaths = new Path[segments.length];
      for (int i = 0; i < segments.length; i++) {
        segPaths[i] = new Path(segments[i]);
      }

      createFields(new Path(webGraphDb, WebGraph.NODE_DIR), segPaths,
        new Path(output));
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("BasicFields: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}