ExampleDocumentSource.java example

Explorer

PersonalityExtraction-master
- lair
  - carrot2-java-api-3.5.0-dev
    - examples
      - org
        carrot2
        examples
        ConsoleFormatter.java
        CreateLuceneIndex.java
        SampleDocumentData.java
        clustering
        ClusteringDataFromDocumentSources.java
        ClusteringDataFromLucene.java
        ClusteringDataFromLuceneWithCustomFields.java
        ClusteringDataFromPubMed.java
        ClusteringDocumentList.java
        ClusteringNonEnglishContent.java
        MoreConfigurationsOfOneAlgorithmInCachingController.java
        UsingAttributes.java
        UsingCachingController.java
        UsingComponentSuites.java
        UsingCustomLanguageModel.java
        UsingCustomLexicalResources.java
        core
        LoadingAttributeValuesFromXml.java
        SavingAttributeValuesToXml.java
        SavingResultsToJson.java
        SavingResultsToXml.java
        research
        ClusteringQualityBenchmark.java
        source
        ExampleDocumentSource.java
  - freebase-java-1.0.0
    - src
      - main
        java
        com
        freebase
        api
        Freebase.java
        FreebaseException.java
        JSONTransport.java
        json
        JSON.java
      - test
        java
        com
        freebase
        api
        tests
        Tests.java
        json
        tests
        Tests.java
  - httpcomponents-client-4.0.3
    - examples
      - org
        apache
        http
        examples
        client
        ClientAbortMethod.java
        ClientAuthentication.java
        ClientChunkEncodedPost.java
        ClientConnectionRelease.java
        ClientCustomContext.java
        ClientCustomSSL.java
        ClientEvictExpiredConnections.java
        ClientExecuteDirect.java
        ClientExecuteProxy.java
        ClientFormLogin.java
        ClientGZipContentCompression.java
        ClientInteractiveAuthentication.java
        ClientMultiThreadedExecution.java
        ClientPreemptiveBasicAuthentication.java
        ClientPreemptiveDigestAuthentication.java
        ClientProxyAuthentication.java
        ClientWithResponseHandler.java
        conn
        ManagerConnectDirect.java
        ManagerConnectProxy.java
        OperatorConnectDirect.java
        OperatorConnectProxy.java
        entity
        mime
        ClientMultipartFormPost.java
  - httpcomponents-core-4.0.1
    - examples
      - org
        apache
        http
        examples
        ElementalHttpGet.java
        ElementalHttpPost.java
        ElementalHttpServer.java
        ElementalReverseProxy.java
        PrintVersionInfo.java
        nio
        ElementalEchoServer.java
        NHttpClient.java
        NHttpClientConnManagement.java
        NHttpFileServer.java
        NHttpReverseProxy.java
        NHttpSSLClient.java
        NHttpSSLServer.java
        NHttpServer.java
- src
  - clustering
    - CarrotClustering.java
  - com
    - personalityextractor
      - PGTest.java
      - Runner.java
      - commons
        FileRW.java
        ReadJSON.java
        data
        NounPhrase.java
        Tweet.java
      - data
        Lattice.java
        LatticeEdge.java
        LatticeNode.java
        source
        Facebook.java
        Twitter.java
        Wikiminer.java
      - entity
        Entity.java
        WikipediaEntity.java
        extractor
        BaselineExtractor.java
        CommonNounPhraseExtractor.java
        ConsecutiveWordsEntityExtractor.java
        EntityExtractFactory.java
        IEntityExtractor.java
        NounPhraseExtractor.java
        ProperNounPhraseExtractor.java
        SRLExtractor.java
        SennaNounPhraseExtractor.java
        frequencybased
        IFrequencyBasedExtractor.java
        TopNNPHashTagsExtractor.java
        TopNPExtractor.java
        graph
        Edge.java
        Graph.java
        Node.java
        ranking
        AbstractRanker.java
        IRanker.java
        WeightGraphRanker.java
        resolver
        BaseEntityResolver.java
        ExtractEntities.java
        IEntityResolver.java
        ViterbiResolver.java
        WikiMinerEntityResolver.java
      - evaluation
        EntityExtractionEvaluation.java
        EvalMetrics.java
        PerfMetrics.java
      - store
        LuceneStore.java
        MysqlStore.java
        WikiminerDB.java
      - url
        HTMLParser
        Readability
        Readability.java
        data
        URLContent.java
        URLEntityExtractor.java
  - cs224n
    - util
  - features
    - FreebaseFeatures.java
    - IFeatures.java
  - senna
  - tathya
    - Main.java
    - db
      - DBI.java
      - YahooBOSS.java
    - fs
      - SennaReader.java
    - semantics
      - Event.java
      - Triple.java
      - TripletExtractor.java
      - Word.java
      - datasource
        FreebaseWrapper.java
    - text
      - tokenizer
        ITokenizer.java
        SentenceTokenizer.java
        TwitterTokenizer.java
        WordTokenizer.java
  - twitter


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2010, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.examples.source;

import java.util.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.*;
import org.carrot2.core.attribute.*;
import org.carrot2.examples.ConsoleFormatter;
import org.carrot2.examples.SampleDocumentData;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.attribute.constraint.IntRange;

/**
 * This example shows how to implement a simple Carrot2 {@link IDocumentSource}.
 */
@Bindable
public class ExampleDocumentSource extends ProcessingComponentBase implements
    IDocumentSource
{
    @Processing
    @Input
    @Attribute(key = CommonAttributesDescriptor.Keys.QUERY)
    public String query;

    @Processing
    @Input
    @Attribute(key = CommonAttributesDescriptor.Keys.RESULTS)
    @IntRange(min = 1, max = 1000)
    public int results = 20;

    /**
     * Documents produced by this document source. The documents are returned in an output
     * attribute with key equal to {@link CommonAttributesDescriptor.Keys#DOCUMENTS},
     */
    @Processing
    @Output
    @Attribute(key = CommonAttributesDescriptor.Keys.DOCUMENTS)
    @Internal
    public List<Document> documents;

    /**
     * Modulo to fetch the documents with. This dummy input attribute is just to show how
     * custom input attributes can be implemented.
     */
    @Processing
    @Input
    @Attribute
    public int modulo = 1;

    /**
     * Another dummy attribute. This one shows that if the attribute is not a primitive
     * type for the implementation), {@link ImplementingClasses} constraint must be added to specify
     * which assignable types are allowed as values for the attribute. To allow all
     * assignable values, specify empty {@link ImplementingClasses#classes()} and
     * {@link ImplementingClasses#strict()} equal to <code>false</code>.
     */
    @Processing
    @Input
    @Attribute
    @ImplementingClasses(classes = {}, strict = false)
    public Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

    @Override
    public void process() throws ProcessingException
    {
        // The input attributes will have already been bound at this point

        // Create a place holder for the results
        this.documents = new ArrayList<Document>();

        // Fetch results.
        final List<Document> inputDocuments = 
            new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING);
        int resultsToPush = Math.min(inputDocuments.size(), this.results);
        for (int i = 0; i < resultsToPush; i++)
        {
            if (i % this.modulo == 0)
            {
                final Document originalDocument = inputDocuments.get(i);

                // For the sake of example we just copy the original document fields
                final Document document = new Document();
                document.setField(Document.TITLE, originalDocument
                    .getField(Document.TITLE));
                document.setField(Document.SUMMARY, "");
                document.setField(Document.CONTENT_URL, originalDocument
                    .getField(Document.CONTENT_URL));
                documents.add(document);
            }
        }

        // We've assigned and populated the documents field and we're done, Carrot2 core
        // will take care of the rest.
    }

    public static void main(String [] args)
    {
        final Controller controller = ControllerFactory.createSimple();
        final Map<String, Object> params = new HashMap<String, Object>();

        /*
         * This computes the attribute key dynamically based on the class and field name. 
         */
        params.put(
            AttributeUtils.getKey(ExampleDocumentSource.class, "modulo"), 
            2);

        params.put(
            AttributeUtils.getKey(ExampleDocumentSource.class, "analyzer"), 
            new WhitespaceAnalyzer());
        
        /*
         * An alternative is to generate additional descriptor classes for bindables.
         * These classes provide type-safe attribute builders. Unfortunately due to
         * limitations of java compiler preprocessors, the generated class cannot be used
         * in the same compilation round as the code it is generated from (you can try
         * to split the compilation into more than one phase, however).
         * 
         * ExampleDocumentSourceDescriptor.attributes()
         *   .modulo(2)
         *   .analyzer(new WhitespaceAnalyzer())
         *   .build();
         */

        final ProcessingResult result = controller.process(params,
            ExampleDocumentSource.class, LingoClusteringAlgorithm.class);

        ConsoleFormatter.displayResults(result);
    }
}