/* Copyright 2009-2013 The MITRE Corporation. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. * ************************************************************************** * NOTICE * This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** **/ package org.opensextant.examples; import java.io.File; import java.io.IOException; import java.util.List; import org.apache.commons.io.FileUtils; import org.opensextant.matching.MatcherFactory; import org.opensextant.matching.PlacenameMatcher; import org.opensextant.placedata.Place; import org.opensextant.placedata.PlaceCandidate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Simple example of using the PlaceNameMatcher which uses the Solr gazetteer to * find candidate place names in text. */ public class MatcherTest { /** Log object. */ private static final Logger LOGGER = LoggerFactory.getLogger(MatcherTest.class); /** * Instantiates a new matcher test. */ private MatcherTest() { } /** * The main method. * * @param args * the arguments */ public static void main(String[] args) { // the file with some text to be processed File testText = new File(args[0]); // location of solr containg the gazetteer // could be a directory, URL or missing String solrHome = ""; if (args.length == 2) { LOGGER.info("Using supplied arg for location of solr gazetteer"); solrHome = args[1]; } else { LOGGER.info("No arg supplied for location of solr gazetteer. Using environment variable"); } // configure and start the Matcher Factory MatcherFactory.config(solrHome); MatcherFactory.start(); // get a matcher PlacenameMatcher m = MatcherFactory.getMatcher(); // check to see if its there if (null == m) { LOGGER.error("Got a null Matcher from Factory."); return; } // get some sample text String sampleText; try { sampleText = FileUtils.readFileToString(testText, "UTF-8"); } catch (IOException e) { LOGGER.error("Exception reading text from file" + testText.getName(), e); return; } // choose to NOT tag questionable abbreviations m.tagAbbreviations(false); // send the sample text to be tagged List<PlaceCandidate> cands = m.matchText(sampleText, "test document"); // see what got tagged LOGGER.info("Without questionable abbreviations,found " + cands.size() + " place candidates"); for (PlaceCandidate pc : cands) { String placeName = pc.getPlaceName(); // if you want all possible places List<Place> allPlaces = pc.getPlaces(); // if you want only the apriori most likely Place samplePlace = pc.getBestPlace(); LOGGER.info("\t" + placeName + " could be " + allPlaces.size() + " places, like " + samplePlace); } // Now turn on tagging of questionable abbreviations and tag again to // see differences m.tagAbbreviations(true); List<PlaceCandidate> cands2 = m.matchText(sampleText, "test document"); // see what got tagged this time LOGGER.info("Tagging questionable abbreviations,found " + cands2.size() + " place candidates"); for (PlaceCandidate pc : cands2) { String placeName = pc.getPlaceName(); // if you want all possible places List<Place> allPlaces = pc.getPlaces(); // if you want only the apriori most likely Place samplePlace = pc.getBestPlace(); LOGGER.info("\t" + placeName + " could be " + allPlaces.size() + " places, like " + samplePlace); } // cleanup the matcher m.cleanup(); } }