package org.opensextant.matching;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.opensextant.placedata.Place;
import org.opensextant.placedata.PlaceCandidate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Finds place-name mentions in a text buffer by submitting the buffer to Solr
 * through a {@link SolrTaggerRequest} and converting the returned tags into
 * {@link PlaceCandidate}s.
 *
 * <p>{@link #matchText(String, String)} mutates instance fields (the tag
 * request and the id-to-place map) without synchronization, so a single
 * instance must not be used for concurrent calls.
 */
public class PlacenameMatcher {

    /** Log object. */
    private static final Logger LOGGER = LoggerFactory.getLogger(PlacenameMatcher.class);

    /** Name of the rule recorded on a candidate when a positive name bias was seen. */
    private static final String APRIORI_NAME_RULE = "AprioriNameBias";

    /** Solr client the tag requests are processed against. */
    private final SolrClient solrServer;

    /** Private copy of the request parameters handed to the constructor. */
    private final ModifiableSolrParams matchParams;

    /** The most recently issued tag request. */
    private SolrTaggerRequest tagRequest;

    /** Places of the response currently being processed, keyed by the Solr "id" field. */
    private Map<Integer, Place> placeIDMap = new HashMap<>(100);

    /** When false, abbreviation places are not tagged on all-lower-case text. */
    private boolean tagAbbrev;

    /**
     * Creates a matcher bound to the given Solr client.
     *
     * @param svr  client used to process tag requests
     * @param prms request parameters; copied, so later changes by the caller
     *             are not seen by this matcher
     */
    protected PlacenameMatcher(SolrClient svr, ModifiableSolrParams prms) {
        this.solrServer = svr;
        this.matchParams = new ModifiableSolrParams(prms);
    }

    /**
     * Controls whether abbreviations are tagged when the matched text is all
     * lower case.
     *
     * @param b {@code true} to tag such matches, {@code false} to drop them
     */
    public void tagAbbreviations(boolean b) {
        tagAbbrev = b;
    }

    /**
     * Tags the given text and returns one candidate per tag that resolved to
     * at least one place.
     *
     * @param buffer  the text to tag; tag offsets are applied to this string
     * @param docName name used only in the error log message
     * @return the candidates found, in the order the tags were returned; empty
     *         (never {@code null}) when the request fails or the response
     *         carries no tags
     */
    public List<PlaceCandidate> matchText(String buffer, String docName) {
        List<PlaceCandidate> candidates = new ArrayList<>();

        // Setup request to tag
        tagRequest = new SolrTaggerRequest(matchParams, SolrRequest.METHOD.POST);
        tagRequest.setInput(buffer);

        QueryResponse response;
        try {
            response = tagRequest.process(solrServer);
        } catch (SolrServerException | IOException e) {
            // Best effort: a failed request yields no candidates rather than an exception.
            LOGGER.error("Got exception when attempting to match {}", docName, e);
            return candidates;
        }

        // Process Solr Response
        // TODO convert this section to use a StreamingResponseCallback
        loadPlaces(response.getResults());

        @SuppressWarnings("unchecked")
        List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");

        // A response without a "tags" section simply produces no candidates.
        if (tags != null) {
            Set<String> seenPlaces = new HashSet<>();
            for (NamedList<?> tag : tags) {
                // the de-duplication set is per tag
                seenPlaces.clear();
                PlaceCandidate pc = buildCandidate(buffer, tag, seenPlaces);
                if (pc != null) {
                    candidates.add(pc);
                }
            }
        }

        // clear out the place id map
        placeIDMap.clear();
        return candidates;
    }

    /**
     * Replaces the contents of the id-to-place map with the places created
     * from the given Solr documents.
     *
     * @param docList documents of the current response; may be {@code null},
     *                in which case the map is just emptied
     */
    private void loadPlaces(SolrDocumentList docList) {
        placeIDMap.clear();
        if (docList == null) {
            return;
        }
        for (SolrDocument solrDoc : docList) {
            Integer id = (Integer) solrDoc.getFirstValue("id");
            Place place = MatcherFactory.createPlace(solrDoc);
            placeIDMap.put(id, place);
        }
    }

    /**
     * Builds the candidate for a single tag.
     *
     * @param buffer     the tagged text, used to extract the matched substring
     * @param tag        one entry of the response's "tags" list
     * @param seenPlaces scratch set, already cleared by the caller, used to
     *                   avoid adding the same place id twice
     * @return the populated candidate, or {@code null} when the tag was
     *         rejected as a lower-case abbreviation or resolved to no place
     */
    private PlaceCandidate buildCandidate(String buffer, NamedList<?> tag, Set<String> seenPlaces) {
        // get the start, end and list of matching place IDs
        int start = (Integer) tag.get("startOffset");
        int end = (Integer) tag.get("endOffset");
        @SuppressWarnings("unchecked")
        List<Integer> placeIDList = (List<Integer>) tag.get("ids");
        if (placeIDList == null) {
            return null;
        }

        String matchText = buffer.substring(start, end);

        // create and populate the PlaceCandidate
        PlaceCandidate pc = new PlaceCandidate();
        pc.setStart(start);
        pc.setEnd(end);
        pc.setPlaceName(matchText);

        boolean isLower = StringUtils.isAllLowerCase(matchText);
        double nameBias = 0.0;

        for (Integer placeID : placeIDList) {
            // get the Place that corresponds to this ID
            Place place = placeIDMap.get(placeID);
            if (place == null) {
                // The tag references an id with no document in this response;
                // skip it instead of failing the whole text with an NPE.
                LOGGER.warn("No place document returned for id {}; skipping", placeID);
                continue;
            }

            // don't tag if place name is an abbrev and matchtext is all lower case
            if (!tagAbbrev && place.isAbbreviation() && isLower) {
                LOGGER.debug("Not tagging abbreviation:{}", matchText);
                return null;
            }

            // don't add places already on candidate; Set.add reports whether the id was new
            if (seenPlaces.add(place.getPlaceID())) {
                pc.addPlaceWithScore(place, place.getIdBias());
                // track the max name bias
                nameBias = Math.max(nameBias, place.getNameBias());
            }
        }

        if (!pc.hasPlaces()) {
            return null;
        }

        // if the max name bias seen >0; add apriori evidence
        if (nameBias > 0.0) {
            pc.addRuleAndConfidence(APRIORI_NAME_RULE, nameBias);
        }
        return pc;
    }

    /** Hands this matcher to {@code MatcherFactory.shutdown}. */
    public void cleanup() {
        MatcherFactory.shutdown(this);
    }
}