package com.bericotech.clavin.extractor; import java.io.IOException; import java.util.ArrayList; import java.util.List; import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; /*##################################################################### * * CLAVIN (Cartographic Location And Vicinity INdexer) * --------------------------------------------------- * * Copyright (C) 2012-2013 Berico Technologies * http://clavin.bericotechnologies.com * * ==================================================================== * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * ==================================================================== * * ApacheExtractor.java * *###################################################################*/ /** * Extracts location names from unstructured text documents using a * named entity recognizer (Apache OpenNLP Name Finder). * */ public class ApacheExtractor implements LocationExtractor { // the actual named entity recognizer (NER) object private NameFinderME nameFinder; // used to tokenize plain text into the OpenNLP format private TokenizerME tokenizer; // used to split the input into sentences before finding names private SentenceDetectorME sentenceDetector; // resource files used by Apache OpenNLP Name Finder private static final String pathToNERModel = "/en-ner-location.bin"; private static final String pathToTokenizerModel = "/en-token.bin"; private static final String pathToSentenceDetectorModel = "/en-sent.bin"; /** * Builds an {@link ApacheExtractor} by instantiating the OpenNLP * Name Finder and Tokenizer. * * @throws IOException */ public ApacheExtractor() throws IOException { nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel))); tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel))); sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel))); } /** * Extracts location names from unstructured text using the named * entity recognizer (NER) feature provided by the Apache OpenNLP * Name Finder. * * @param plainText Contents of text document * @return List of location names and positions */ public List<LocationOccurrence> extractLocationNames(String plainText) { if(plainText == null) { throw new IllegalArgumentException("plaintext input to extractLocationNames should not be null"); } List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>(); // The values used in these Spans are string character offsets Span sentenceSpans[] = sentenceDetector.sentPosDetect(plainText); // Each sentence gets processed on its own for (Span sentenceSpan : sentenceSpans) { // find the start and end position of this sentence in the document String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd()); // tokenize the text into the required OpenNLP format String[] tokens = tokenizer.tokenize(sentence); //the values used in these Spans are string character offsets of each token from the sentence beginning Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence); // find the location names in the tokenized text // the values used in these Spans are NOT string character offsets, they are indices into the 'tokens' array Span names[] = nameFinder.find(tokens); //for each name that got found, create our corresponding occurrence for (Span name : names) { //find offsets relative to the start of the sentence int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart(); // -1 because the high end of a Span is noninclusive int endOfLastWord = tokenPositionsWithinSentence[name.getEnd() - 1].getEnd(); //to get offsets relative to the document as a whole, just add the offset for the sentence itself int startOffsetInDoc = sentenceSpan.getStart() + beginningOfFirstWord; int endOffsetInDoc = sentenceSpan.getStart() + endOfLastWord; //look back into the original input string to figure out what the text is that I got a hit on String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc); // add to List of results to return nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc)); } } // this is necessary to maintain consistent results across // multiple runs on the same data, which is what we want nameFinder.clearAdaptiveData(); return nerResults; } }