ApacheExtractor.java example

Explorer

CLAVIN-master
- src
  - main
    - java
      - com
        bericotech
        clavin
        ClavinException.java
        GeoParser.java
        GeoParserFactory.java
        WorkflowDemo.java
        extractor
        ApacheExtractor.java
        LocationExtractor.java
        LocationOccurrence.java
        gazetteer
        BasicGeoName.java
        CountryCode.java
        FeatureClass.java
        FeatureCode.java
        FeatureCodeBuilder.java
        GeoName.java
        LazyAncestryGeoName.java
        query
        AncestryMode.java
        FuzzyMode.java
        Gazetteer.java
        GazetteerQuery.java
        LuceneGazetteer.java
        QueryBuilder.java
        index
        BinarySimilarity.java
        IndexDirectoryBuilder.java
        IndexField.java
        WhitespaceLowerCaseAnalyzer.java
        WhitespaceLowerCaseTokenizer.java
        resolver
        ClavinLocationResolver.java
        LocationResolver.java
        LuceneLocationResolver.java
        ResolvedLocation.java
        multipart
        DefaultScorer.java
        MatchedLocation.java
        MultipartLocationName.java
        MultipartLocationResolver.java
        ResolvedMultipartLocation.java
        Scorer.java
        SearchLevel.java
        SearchResult.java
        util
        DamerauLevenshtein.java
        ListUtils.java
        TextUtils.java
  - test
    - java
      - com
        bericotech
        clavin
        AllTestsSuite.java
        GeoParserFactoryTest.java
        GeoParserTest.java
        extractor
        ApacheExtractorTest.java
        LocationOccurrenceTest.java
        gazetteer
        BasicGeoNameTest.java
        LazyAncestryGeoNameTest.java
        query
        LuceneGazetteerTest.java
        index
        BinarySimilarityTest.java
        resolver
        ClavinLocationResolverHeuristicsTest.java
        ClavinLocationResolverTest.java
        ResolvedLocationTest.java
        multipart
        MultiLevelMultipartLocationResolverTest.java
        MultipartLocationResolverTest.java
        util
        DamerauLevenshteinTest.java
        ListUtilsTest.java
        TextUtilsTest.java

package com.bericotech.clavin.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

/*#####################################################################
 * 
 * CLAVIN (Cartographic Location And Vicinity INdexer)
 * ---------------------------------------------------
 * 
 * Copyright (C) 2012-2013 Berico Technologies
 * http://clavin.bericotechnologies.com
 * 
 * ====================================================================
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 * 
 * ====================================================================
 * 
 * ApacheExtractor.java
 * 
 *###################################################################*/

/**
 * Extracts location names from unstructured text documents using a
 * named entity recognizer (Apache OpenNLP Name Finder).
 *
 * <p>Each instance owns one sentence detector, one tokenizer and one
 * name finder, all loaded from classpath resources at construction
 * time. The name finder's adaptive data is cleared at the end of every
 * call to {@link #extractLocationNames(String)}.
 *
 * <p>NOTE(review): the OpenNLP components held here are presumably not
 * thread-safe, so an instance should not be shared between threads
 * without external synchronization -- confirm against the OpenNLP docs.
 */
public class ApacheExtractor implements LocationExtractor {

    // the actual named entity recognizer (NER) object
    private final NameFinderME nameFinder;

    // used to tokenize plain text into the OpenNLP format
    private final TokenizerME tokenizer;

    // used to split the input into sentences before finding names
    private final SentenceDetectorME sentenceDetector;

    // resource files used by Apache OpenNLP Name Finder
    private static final String pathToNERModel = "/en-ner-location.bin";
    private static final String pathToTokenizerModel = "/en-token.bin";
    private static final String pathToSentenceDetectorModel = "/en-sent.bin";


    /**
     * Builds an {@link ApacheExtractor} by instantiating the OpenNLP
     * Name Finder, Tokenizer and Sentence Detector from their model
     * files on the classpath.
     *
     * <p>Every model stream is closed once its model has been read,
     * whether or not loading succeeded.
     *
     * @throws IOException if a model resource cannot be found on the
     *                     classpath or cannot be read
     */
    public ApacheExtractor() throws IOException {
        InputStream nerModelStream = openModelResource(pathToNERModel);
        try {
            nameFinder = new NameFinderME(new TokenNameFinderModel(nerModelStream));
        } finally {
            nerModelStream.close();
        }

        InputStream tokenizerModelStream = openModelResource(pathToTokenizerModel);
        try {
            tokenizer = new TokenizerME(new TokenizerModel(tokenizerModelStream));
        } finally {
            tokenizerModelStream.close();
        }

        InputStream sentenceModelStream = openModelResource(pathToSentenceDetectorModel);
        try {
            sentenceDetector = new SentenceDetectorME(new SentenceModel(sentenceModelStream));
        } finally {
            sentenceModelStream.close();
        }
    }

    /**
     * Opens a model file bundled as a classpath resource.
     *
     * @param resourcePath  classpath location of the model file
     * @return an open stream over the resource; the caller must close it
     * @throws IOException if no resource exists at the given path
     */
    private static InputStream openModelResource(String resourcePath) throws IOException {
        InputStream stream = ApacheExtractor.class.getResourceAsStream(resourcePath);
        if (stream == null) {
            // fail with a clear message instead of handing null to OpenNLP
            throw new IOException("OpenNLP model resource not found on classpath: " + resourcePath);
        }
        return stream;
    }

    /**
     * Extracts location names from unstructured text using the named
     * entity recognizer (NER) feature provided by the Apache OpenNLP
     * Name Finder.
     *
     * <p>The text is split into sentences, each sentence is tokenized
     * and run through the name finder, and every hit is mapped back to
     * its character offset in the original document.
     *
     * @param plainText     Contents of text document
     * @return List of location names and positions (character offsets
     *         into {@code plainText}); empty if nothing was found
     * @throws IllegalArgumentException if {@code plainText} is null
     */
    public List<LocationOccurrence> extractLocationNames(String plainText) {
        if (plainText == null) {
            throw new IllegalArgumentException("plaintext input to extractLocationNames should not be null");
        }

        List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>();

        try {
            // The values used in these Spans are string character offsets
            Span[] sentenceSpans = sentenceDetector.sentPosDetect(plainText);

            // Each sentence gets processed on its own
            for (Span sentenceSpan : sentenceSpans) {

                // find the start and end position of this sentence in the document
                int sentenceStartInDoc = sentenceSpan.getStart();
                String sentence = plainText.substring(sentenceStartInDoc, sentenceSpan.getEnd());

                // tokenize once: these Spans hold the character offsets of each
                // token measured from the beginning of the sentence
                Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence);

                // derive the token strings from the spans rather than running
                // the tokenizer over the same sentence a second time
                String[] tokens = Span.spansToStrings(tokenPositionsWithinSentence, sentence);

                // find the location names in the tokenized text
                // the values used in these Spans are NOT string character offsets,
                // they are indices into the 'tokens' array
                Span[] names = nameFinder.find(tokens);

                // for each name that got found, create our corresponding occurrence
                for (Span name : names) {

                    // find offsets relative to the start of the sentence
                    int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart();
                    // -1 because the high end of a Span is noninclusive
                    int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1].getEnd();

                    // to get offsets relative to the document as a whole,
                    // just add the offset for the sentence itself
                    int startOffsetInDoc = sentenceStartInDoc + beginningOfFirstWord;
                    int endOffsetInDoc = sentenceStartInDoc + endOfLastWord;

                    // look back into the original input string to recover the matched text
                    String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc);

                    // add to List of results to return
                    nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc));
                }
            }
        } finally {
            // this is necessary to maintain consistent results across
            // multiple runs on the same data, which is what we want;
            // done in 'finally' so a failure part-way through a document
            // cannot leak adaptive state into the next call
            nameFinder.clearAdaptiveData();
        }

        return nerResults;
    }

}