StopWordAnnotator.java example

Explorer

DoSeR-master
- doser-core
  - src
    - main
      - java
        doser
        categorysuggestion
        algorithm
        StandardDbPediaCategorySuggestion.java
        dpo
        CatSugRequest.java
        CatSugResponse.java
        entity
        disambiguation
        feedback
        Feedback.java
        entitydisambiguation
        algorithms
        AbstractDisambiguationAlgorithm.java
        Candidate.java
        DisambiguationHandler.java
        DocumentCentricAlgorithmDefault.java
        EntityCentricAlgorithmCSTable.java
        EntityCentricAlgorithmDefault.java
        EntityCentricAlgorithmTableDefault.java
        IllegalDisambiguationAlgorithmInputException.java
        SurfaceForm.java
        collective
        AbstractWord2VecPageRank.java
        CandidatePruning.java
        CandidateReduction.java
        Edge.java
        Vertex.java
        dbpedia
        AdditionalCandidateQuery.java
        CandidateReductionDBpediaW2V.java
        CollectiveAndContextDriver.java
        CollectiveDisambiguationDBpediaEntities.java
        FinalEntityDisambiguation.java
        LocationDisambiguation.java
        TableColumnFilter.java
        TimeNumberDisambiguation.java
        Word2VecDisambiguator.java
        general
        CandidateReductionGeneralW2V.java
        CollectiveContextDriverGeneral.java
        CollectiveDisambiguationGeneralEntities.java
        FinalEntityDisambiguatorGeneral.java
        Word2VecDisambiguatorGeneral.java
        rules
        AbstractRule.java
        CheckGeneralEntities.java
        ContextRule.java
        NoCandidatesCheckPlural.java
        NoCandidatesExpansionRules.java
        PatternRule.java
        RuleAdapation.java
        UnambiguousToAmbiguousRule.java
        backend
        AbstractDisambiguationTask.java
        DisambiguationMainService.java
        DisambiguationTaskCollective.java
        DisambiguationTaskSingle.java
        dpo
        BoundingBox.java
        DisambiguatedEntity.java
        DisambiguationRequest.java
        DisambiguationResponse.java
        EntityDisambiguationDPO.java
        Response.java
        Time.java
        package-info.java
        feedback
        dpo
        BoundingBox.java
        FeedbackItem.java
        FeedbackRequest.java
        FeedbackResponse.java
        RequestFeedbackProxy.java
        knowledgebases
        AbstractEntityCentricKBGeneral.java
        AbstractKnowledgeBase.java
        DocumentCentricKnowledgeBaseDefault.java
        EnCenKBCStable.java
        EntityCentricKBBiomed.java
        EntityCentricKBDBpedia.java
        EntityCentricKnowledgeBase.java
        KnowledgeBaseIdentifiers.java
        modknowledgebase
        AbstractKnowledgebaseOperator.java
        AddNewDocumentsOperator.java
        KBModifications.java
        KnowledgeBaseEntryCreation.java
        KnowledgebaseModification.java
        ModifyKnowledgeBaseException.java
        NewDocumentOrUpdateOperator.java
        UpdateKnowledgeBaseEntryOperator.java
        dpo
        DocumentToProcess.java
        EntryToProcess.java
        KBEnrichmentRequest.java
        KBEnrichmentResponse.java
        properties
        Properties.java
        table
        celldisambiguation
        CellDisAlgorithm_CSDomain.java
        CellDisAlgorithm_Standard.java
        CellDisambiguationInterface.java
        columndisambiguation
        AbstractTypeDisFeatures.java
        ColumnDisAlgorithm.java
        ColumnHeaderFeature.java
        HillClimbingColumnDisambiguation.java
        IncreaseOfEntitiesFeature.java
        InverseDocumentFrequencyFeature.java
        LayerVarianceFeature.java
        LearntoRankOutputObject.java
        NumberOfEntitiesFeature.java
        TypePathLengthFeature.java
        TypeRankHillClimbingFactory.java
        TypeRankHillClimbingGoalTest.java
        TypeRankHillClimbingHeuristicFunction.java
        dpo
        CellResponse.java
        ColumnResponseItem.java
        TableCell.java
        TableColumn.java
        TableDisambiguationRequest.java
        TableDisambiguationResponse.java
        logic
        DisambiguateTable.java
        LearnToRankTableDisambiguationOutput.java
        Table.java
        TableCell.java
        TableColumn.java
        TableDisambiguationMainService.java
        TableDisambiguationTask.java
        Type.java
        language
        Languages.java
        summarization
        algorithm
        SummarizationSOLRIndex.java
        SummaryInfos.java
        dpo
        RDFSRequest.java
        RDFSResponse.java
        Summary.java
        tools
        Inflector.java
        LuceneTest.java
        NTToDbPediaUrlEncoding.java
        RDFGraphOperations.java
        ServiceQueries.java
        webclassify
        algorithm
        EntityRelevanceAlgorithm.java
        EntitySignificanceAlgorithmHITSRelations.java
        EntitySignificanceAlgorithmPR_W2V.java
        EntitySignificanceAlgorithm_Doc2Vec.java
        PageSimilarity.java
        SessionBreakDetection.java
        annotation
        AnnotateCategories.java
        AnnotateEntities.java
        AnnotateTime.java
        dpo
        DBpediaResourceNotIncluded.java
        Document.java
        DocumentStatistic.java
        Paragraph.java
        SimpleMainTopicInput.java
        SimpleMainTopicOutput.java
        WebClassificationRequest.java
        WebClassificationResponse.java
        WebSite.java
        WebTypeRequest_Deprecated.java
        WebTypeResponse_Deprecated.java
        word2vec
        Data.java
        Doc2VecJsonFormat.java
        Word2VecJsonFormat.java
        Word2VecModel.java
    - test
      - java
        doser
        test
        breakdetection
        BreakDetection.java
- doser-disambiguationserver
  - src
    - main
      - java
        doser
        server
        actions
        FrameworkInitialization.java
        categorysuggestion
        CategorySuggestionService.java
        disambiguation
        DisambiguationService.java
        FeedbackService.java
        documentannotation
        DocumentClassification.java
        WebSessionClassification.java
        WebSessionGetTypes_Deprecation.java
        kbenrichment
        CheckRequestsForKBModification.java
        DisplayEntityCandidatesServlet.java
        KBEnrichmentService.java
        ShowIndexEntityServlet.java
        StoreIndexModificationServlet.java
        StoreNewIndexEntryServlet.java
        package-info.java
        rdfsummarization
        RDFSummarizationService.java
        tabledisambiguation
        TableDisambiguationServiceProxy.java
- doser-experiments
  - src
    - main
      - java
        ACE_MSNBC_AQUAINT_Evaluation
        MainEvaluation.java
        AidaDatasetEvaluation
        AidaDataSetEvaluation.java
        Type.java
        CalbCDataSetCollectiveEvaluation
        Evaluation.java
        DisambiguationApproachDPO
        BoundingBox.java
        Category.java
        DisambiguatedEntity.java
        DisambiguationRequest.java
        DisambiguationResponse.java
        EntityDisambiguationDPO.java
        Response.java
        doser
        sequencedetection
        graph
        AbstractGraph.java
        Dijkstra.java
        DijkstraData.java
        Edge.java
        GraphContent.java
        NoRouteFoundException.java
        Node.java
        NodeTypes.java
        TemporalGraph.java
        UndirectedWeightedShotGraph.java
        word2vec
        dbpediaGraphThinning
        DbpediaGraphModification.java
        EvaluatePureDbpediaCategories.java
        FullyEvaluateCategories.java
        TestNegativeCosine.java
        semanticCategories
        ComputeSimilarities.java
        EntityPair.java
        Sampling.java
        experiments
        collective
        entdoccentric
        CollectiveTestApproach.java
        CompleteCalbCSGeneration.java
        LTR
        ConjunctionScorer.java
        ConjunctionTermScorer.java
        DisjunctionScorer.java
        DisjunctionSumScorer.java
        LTRBooleanQuery.java
        LearnToRankClause.java
        LearnToRankFuzzyQuery.java
        LearnToRankQuery.java
        LearnToRankScorer.java
        LearnToRankTermQuery.java
        LearnToRankTermScorer.java
        ReqOptSumScorer.java
        LearntoRankOutputObject.java
        ParameterHandler.java
        PriorLoader.java
        QueryDataGeneration.java
        ResultProcessing.java
        StandardInitialize.java
        StandardQueryDataObject.java
        StartEvaluation.java
        StartupInformationLoader.java
        TestClass.java
        TrecEvalResultObject.java
        TrecEvalResultProcessing.java
        calbc
        Author.java
        CalbCPubMedID.java
        Concept.java
        Entity.java
        Metadata.java
        dpo
        BoundingBox.java
        EntityToDisambiguate.java
        Position.java
        filter
        Filter.java
        query
        CalbCAnalyzer.java
        CalbCTokenizer.java
        LearnToRankFeatureSetup.java
        LearnToRankFeatureSetupDocumentCentric.java
        LearnToRankFeatureSetupEntityBased.java
        LearnToRankInitialize.java
        PositionalPorterStopAnalyzer.java
        PositionalStopFilter.java
        PriorQuery.java
        QueryGenerator.java
        QuerySettings.java
        SensePriorQuery.java
        evaluation
        CorrectEntry.java
        FilePreProcessing.java
        LineParsingException.java
        M_Accuracy.java
        M_Accuracy2.java
        M_F1.java
        M_MAP.java
        M_Precision.java
        M_Recall.java
        M_ReciprocalRank.java
        M_StringVariance.java
        Output.java
        Query.java
        ResultEntry.java
        StartEvaluation.java
        StatisticalMeasure.java
        UnicodeBOMInputStream.java
        WorkingChain.java
        table
        imdbAndMusicBrainz
        StartEvaluationTableEntities.java
        limaye
        DisServiceAnswer.java
        DisServiceAnswerResult.java
        DisServiceAnswerResultEntities.java
        EvaluationPoster.java
        LimayeAnnotationParserWebTables.java
        LimayeGroundtruthAnnotationParser.java
        StartEvaluationTableEntities.java
        Table.java
        Type.java
        WikiPediaUriConverter.java
        corrected
        LimayeAnnotationParserWebTables.java
        StartEvaluationTableEntities.java
        Table.java
        webclassify
        firstexperiments
        AnnotateSinglePages.java
        ExtractWikipediaText.java
        ldatest
        CreateLDAOutput.java
        table
        imdb
        IMDBTableConverter.java
        MusicBrainzConverter.java
        test
        test.java
- doser-extensions
  - src
    - main
      - java
        doser
        algorithms
        MajorityVoteAlgorithm.java
        general
        HelpfulMethods.java
        Test.java
        UnicodeBOMInputStream.java
        lucene
        analysis
        DoserIDAnalyzer.java
        DoserIDFilter.java
        DoserIDTokenizer.java
        DoserStandardAnalyzer.java
        DoserStandardTokenizer.java
        features
        DocCenExtFeatures.java
        IEntityCentricExtFeatures.java
        LuceneFeatures.java
        query
        AbstractDisjunctionScorer.java
        ConjunctionScorer.java
        DisjunctionSumScorer.java
        FuzzyLabelSimilarity.java
        LTRBooleanQuery.java
        LearnToRankClause.java
        LearnToRankFeatureDefaultValueManager.java
        LearnToRankFuzzyQuery.java
        LearnToRankQuery.java
        LearnToRankScorer.java
        LearnToRankTermQuery.java
        LearnToRankTermScorer.java
        PriorQuery.java
        ReqOptSumScorer.java
        SensePriorQuery.java
        TermQuery.java
        TermScorer.java
        nlp
        NLPTools.java
        StopWordAnnotator.java
        word2vec
        Word2VecModel.java
- doser-externtools
  - src
    - main
      - java
        DBpediaCategoryCorrection
        DBpediaCategoryCorrection.java
        ExtractRelevantDBpediaCategories.java
        doc2vec
        corpuscreation
        CreateD2VCorpus_Wikipedia.java
        CreateD2VCorpus_Wikipedia_WikiSFContext.java
        ExtractContextOfWikipediaPages.java
        doser
        tools
        indexcreation
        AddFactsToIndex.java
        AddPattyFactsToIndex.java
        CountYago2sTypes.java
        CreateBiomedicalDomainIndex.java
        CreateDBPediaIndex.java
        CreateDBpediaIndexV2.java
        CreateEntityList.java
        CreateWikipediaDocumentCentricKB.java
        MergeEntityLists.java
        Test.java
        Test1.java
        WikiPediaUriConverter.java
        evidencemining
        parse
        wikipedia
        S1HtmlToPlainTextWithEntities.java
        S2PlainTextWithEntitiesToAnnotationList.java
        word2vec
        corpuscreation
        CreateBiomedicalEntityCorpus.java
        CreateDBpediaEdgeList.java
        CreateEntityCorpus.java
        CreateEntityWordCorpus.java
        CreateRandomDBpediaModel.java
        CreateWikipediaAndWebEntityCorpus.java
        evidencecomputation
        EvidenceThread.java
        W2VEvidenceMain.java
        tools
        Word2VecModel.java
- doser-gerbilrest
  - src
    - main
      - java
        doser
        gerbilwrapper
        AidaWrapper.java
        BoundingBox.java
        DisambiguatedEntity.java
        DisambiguationRequest.java
        DisambiguationResponse.java
        DoserResource.java
        EntityDisambiguationDPO.java
        IllinoisWrapper.java
        Response.java
        Type.java
        WrapperApplication.java
- doser-hadoop
  - src
    - main
      - java
        examples
        mapred
        temperature
        MaxTemperatureDriver.java
        MaxTemperatureMapper.java
        MaxTemperatureReducer.java
        hadoop
        convertFilesToSequenceFile
        ConvertWebTablesToSequenceFile.java
        SequenceFileReader.java
        extensions
        JobBuilder.java
        WebTableInputFormat.java
        WebTableRecordReader.java
        WholeFileInputFormat.java
        WholeFileRecordReader.java
        hdfs
        examples
        ReadFileAndWriteData.java
        ReadHadoopUrl.java
        webtables
        wordcount
        WordcountJob.java
        WordcountMapper.java
        WordcountReducer.java
        wikievidence
        ldadataconstruction
        WikipediaLDADataGeneratorDriver.java
        WikipediaLDADataGeneratorMapper.java
        WikipediaLDADataGeneratorReducer.java
        hbase
        operations
        HBaseOperations.java
        TestHbaseConnection.java
        lda
        categoryclass
        dataconstruction
        S1CategoryToEntities.java
        properties
        LDAProperties.java
        wikievidence
        dataconstruction
        S3ConstructHBaseContext.java
        S3ConstructHBaseEntries.java
        S4CreateCircles.java
        modelcreation
        ConfigCreation.java
        LDAClient.java
        LDAClientExtractProbabilities.java
        LDAExecutor.java
        MineEvidences.java
        WikipediaLDAThreadExtractEvidenceTerms.java

package doser.nlp;

import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.Set;

//import edu.stanford.nlp.ling.CoreAnnotation;
//import edu.stanford.nlp.pipeline.Annotator;
//
//import org.apache.lucene.analysis.core.StopAnalyzer;
//import org.apache.lucene.analysis.util.CharArraySet;
//import org.apache.lucene.util.Version;
//
//import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
//import edu.stanford.nlp.ling.CoreLabel;
//import edu.stanford.nlp.pipeline.Annotation;
//import edu.stanford.nlp.util.Pair;

/**
 * User: jconwell. CoreNLP Annotator that checks whether an incoming token is a stopword.
 */
//public class StopWordAnnotator implements Annotator,
//		CoreAnnotation<Pair<Boolean, Boolean>> {
//
//	public static final String customStopWordList = "a, about, above, across, after, again, against, all, almost, alone, along, already, also, although, always, am, among, an, and, another, any, anybody, anyone, anything, anywhere, are, area, areas, aren't, around, as, ask, asked, asking, asks, at, away, b, back, backed, backing, backs, be, became, because, become, becomes, been, before, began, behind, being, beings, below, best, better, between, big, both, but, by, c, came, can, cannot, can't, case, cases, certain, certainly, clear, clearly, come, could, couldn't, d, did, didn't, differ, different, differently, do, does, doesn't, doing, done, don't, down, downed, downing, downs, during, e, each, early, either, end, ended, ending, ends, enough, even, evenly, ever, every, everybody, everyone, everything, everywhere, f, face, faces, fact, facts, far, felt, few, find, finds, first, for, four, from, full, fully, further, furthered, furthering, furthers, g, gave, general, generally, get, gets, give, given, gives, go, going, good, goods, got, great, greater, greatest, group, grouped, grouping, groups, h, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, her, here, here's, hers, herself, he's, high, higher, highest, him, himself, his, how, however, how's, i, i'd, if, i'll, i'm, important, in, interest, interested, interesting, interests, into, is, isn't, it, its, it's, itself, i've, j, just, k, keep, keeps, kind, knew, know, known, knows, l, large, largely, last, later, latest, least, less, let, lets, let's, like, likely, long, longer, longest, m, made, make, making, man, many, may, me, member, members, men, might, more, most, mostly, mr, mrs, much, must, mustn't, my, myself, n, necessary, need, needed, needing, needs, never, new, newer, newest, next, no, nobody, non, noone, nor, not, nothing, now, nowhere, number, numbers, o, of, off, often, old, older, oldest, on, once, one, only, open, opened, opening, opens, or, order, ordered, ordering, orders, other, others, ought, our, ours, ourselves, out, over, own, p, part, parted, parting, parts, per, perhaps, place, places, point, pointed, pointing, points, possible, present, presented, presenting, presents, problem, problems, put, puts, q, quite, r, rather, really, right, room, rooms, s, said, same, saw, say, says, second, seconds, see, seem, seemed, seeming, seems, sees, several, shall, shan't, she, she'd, she'll, she's, should, shouldn't, show, showed, showing, shows, side, sides, since, small, smaller, smallest, so, some, somebody, someone, something, somewhere, state, states, still, such, sure, t, take, taken, than, that, that's, the, their, theirs, them, themselves, then, there, therefore, there's, these, they, they'd, they'll, they're, they've, thing, things, think, thinks, this, those, though, thought, thoughts, three, through, thus, to, today, together, too, took, toward, turn, turned, turning, turns, two, u, under, until, up, upon, us, use, used, uses, v, very, w, want, wanted, wanting, wants, was, wasn't, way, ways, we, we'd, well, we'll, wells, went, were, we're, weren't, we've, what, what's, when, when's, where, where's, whether, which, while, who, whole, whom, who's, whose, why, why's, will, with, within, without, won't, work, worked, working, works, would, wouldn't, x, y, year, years, yes, yet, you, you'd, you'll, young, younger, youngest, your, you're, yours, yourself, yourselves, you've, z";
//
//	/**
//	 * stopword annotator class name used in annotators property
//	 */
//	public static final String ANNOTATOR_CLASS = "stopword";
//
//	public static final String STANFORD_STOPWORD = ANNOTATOR_CLASS;
//	public static final Requirement STOPWORD_REQUIREMENT = new Requirement(
//			STANFORD_STOPWORD);
//
//	/**
//	 * Property key to specify the comma delimited list of custom stopwords
//	 */
//	public static final String STOPWORDS_LIST = "stopword-list";
//
//	/**
//	 * Property key to specify if stopword list is case insensitive
//	 */
//	public static final String IGNORE_STOPWORD_CASE = "ignore-stopword-case";
//
//	/**
//	 * Property key to specify if StopwordAnnotator should also check the word
//	 * lemma as a stopword
//	 */
//	public static final String CHECK_LEMMA = "check-lemma";
//
//	private static Class<? extends Pair> boolPair = Pair.makePair(true, true)
//			.getClass();
//
//	private Properties props;
//	private CharArraySet stopwords;
//	private boolean checkLemma;
//
//	public StopWordAnnotator(String annotatorClass, Properties props) {
//		this.props = props;
//
//		this.checkLemma = Boolean.parseBoolean(props.getProperty(CHECK_LEMMA,
//				"false"));
//
//		if (this.props.containsKey(STOPWORDS_LIST)) {
//			String stopwordList = props.getProperty(STOPWORDS_LIST);
//			boolean ignoreCase = Boolean.parseBoolean(props.getProperty(
//					IGNORE_STOPWORD_CASE, "false"));
//			this.stopwords = getStopWordList(Version.LATEST, stopwordList,
//					ignoreCase);
//		} else {
//			System.out.println("Ich hole mir die normalen Lucene StopWords");
//			this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
//			System.out.println(this.stopwords.toString());
//		}
//	}
//
//	@Override
//	public void annotate(Annotation annotation) {
//		if (stopwords != null && stopwords.size() > 0
//				&& annotation.containsKey(TokensAnnotation.class)) {
//			List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
//			for (CoreLabel token : tokens) {
//				boolean isWordStopword = stopwords.contains(token.word()
//						.toLowerCase());
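//				// NOTE(review): this re-tests token.word(); when checkLemma is set,
//				// token.lemma() was presumably intended - confirm before re-enabling.
//				boolean isLemmaStopword = checkLemma ? stopwords.contains(token
//						.word().toLowerCase()) : false;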
//				Pair<Boolean, Boolean> pair = Pair.makePair(isWordStopword,
//						isLemmaStopword);
//				token.set(StopWordAnnotator.class, pair);
//			}
//		}
//	}
//
//	@Override
//	public Set<Requirement> requirementsSatisfied() {
//		return Collections.singleton(STOPWORD_REQUIREMENT);//
//	}
//
//	@Override
//	public Set<Requirement> requires() {
//		if (checkLemma) {
//			return TOKENIZE_SSPLIT_POS_LEMMA;
//		} else {
//			return TOKENIZE_AND_SSPLIT;
//		}
//	}
//
//	@Override
//	@SuppressWarnings("unchecked")
//	public Class<Pair<Boolean, Boolean>> getType() {
//		return (Class<Pair<Boolean, Boolean>>) boolPair;
//	}
//
//	public static CharArraySet getStopWordList(Version luceneVersion,
//			String stopwordList, boolean ignoreCase) {
//		String[] terms = stopwordList.split(",");
//		CharArraySet stopwordSet = new CharArraySet(luceneVersion,
//				terms.length, ignoreCase);
//		for (String term : terms) {
//			stopwordSet.add(term.trim());
//		}
//		return CharArraySet.unmodifiableSet(stopwordSet);
//	}
//}