/**
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*
* Continue contributions:
* Copyright 2013-2015 The MITRE Corporation.
*/
package org.opensextant.extractors.xtax;
///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
//_____ ____ __ __
///\ __`\ /\ _`\ /\ \__ /\ \__
//\ \ \/\ \ _____ __ ___ \ \,\L\_\ __ __ _\ \ ,_\ __ ___ \ \ ,_\
//\ \ \ \ \ /\ '__`\ /'__`\ /' _ `\ \/_\__ \ /'__`\/\ \/'\\ \ \/ /'__`\ /' _ `\\ \ \/
//\ \ \_\ \\ \ \L\ \/\ __/ /\ \/\ \ /\ \L\ \ /\ __/\/> </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
//\ \_____\\ \ ,__/\ \____\\ \_\ \_\ \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
//\/_____/ \ \ \/ \/____/ \/_/\/_/ \/_____/ \/____/\//\/_/ \/__/ \/__/\/_/ \/_/\/_/ \/__/
// \ \_\
// \/_/
//
//OpenSextant TaxonMatcher
//* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.opensextant.ConfigException;
import org.opensextant.data.Taxon;
import org.opensextant.data.TextInput;
import org.opensextant.extraction.ExtractionException;
import org.opensextant.extraction.Extractor;
import org.opensextant.extraction.SolrMatcherSupport;
import org.opensextant.extraction.TextMatch;
import org.opensextant.util.SolrProxy;
import org.opensextant.util.TextUtils;
/**
* TaxonMatcher uses SolrTextTagger to tag mentions of phrases in documents. The
* phrases can be from simple word lists or they can connect to a taxonomy of
* sorts -- the "taxcat" solr core (see Xponents/solr/taxcat and Xponents/XTax
* for implementation)
*
* JVM arg to use is "opensextant.solr" to point to the local path. Less tested:
* "solr.solr.home" might conflict with a Solr document server instead of this
* tagger. "solr.url" is good for RESTful integration, but is not recommended.
*
* @author Marc Ubaldino - ubaldino@mitre.org
*/
public class TaxonMatcher extends SolrMatcherSupport implements Extractor {
private static ModifiableSolrParams params;
static {
params = new ModifiableSolrParams();
// params.set(CommonParams.QT, requestHandler);
params.set(CommonParams.FL, "id,catalog,taxnode,phrase,tag,name_type");
params.set("tagsLimit", 100000);
params.set("subTags", false);
params.set("matchText", false);
params.set(CommonParams.FQ, "valid:true");
/*
* Possible overlaps: ALL, NO_SUB, LONGEST_DOMINANT_RIGHT See Solr Text
* Tagger documentation for details.
*/
params.set("overlaps", "NO_SUB");
}
private boolean tagAll = true;
private boolean filterNonAcronyms = true;
// private ProgressMonitor progressMonitor;
/**
*
* @throws IOException
* @throws ConfigException
*/
public TaxonMatcher() throws IOException, ConfigException {
configure();
}
/**
* Extractor interface.
*/
@Override
public void cleanup() {
this.shutdown();
}
/**
* Be explicit about the solr core to use for tagging
*/
@Override
public String getCoreName() {
return "taxcat";
}
/**
* Return the Solr Parameters for the tagger op.
*
* @return solr params
*/
@Override
public SolrParams getMatcherParameters() {
return params;
}
/**
* Create a Taxon tag, which is filtered based on established catalog
* filters.
*
* Caller must implement their domain objects, POJOs... this callback
* handler only hashes them.
*
* @param refData
* solr doc
* @return tag data
*/
@Override
public Object createTag(SolrDocument refData) {
String _cat = SolrProxy.getString(refData, "catalog");
// Filter out unused matching records.
if (!tagAll && !this.catalogs.contains(_cat)) {
return null;
}
return createTaxon(refData);
}
/**
* Parse the taxon reference data from a solr doc and return Taxon obj.
*
* @param refData
* solr doc
* @return taxon obj
*/
public static Taxon createTaxon(SolrDocument refData) {
Taxon label = new Taxon();
label.name = SolrProxy.getString(refData, "taxnode");
label.isAcronym = "A".equals(SolrProxy.getString(refData, "name_type"));
label.catalog = SolrProxy.getString(refData, "catalog");
label.addTerm(SolrProxy.getString(refData, "phrase"));
label.addTags(refData.getFieldValues("tag"));
return label;
}
/**
* Extractor interface: getName
*
* @return
*/
@Override
public String getName() {
return "XTax";
}
@Override
public void configure() throws ConfigException {
try {
initialize();
extract(new TextInput("__initialization___", "trivial priming of the solr pump"));
} catch (Exception err) {
throw new ConfigException("Failed to configure TaxMatcher", err);
}
}
/**
* Configure an Extractor using a config file named by a path
*
* @param patfile
* configuration file path
*/
@Override
public void configure(String patfile) throws ConfigException {
throw new ConfigException("Not a valid configuration routine");
}
/**
* Configure an Extractor using a config file named by a URL
*
* @param patfile
* configuration URL
*/
@Override
public void configure(java.net.URL patfile) throws ConfigException {
throw new ConfigException("Not a valid configuration routine");
}
/**
* Catalogs is a list of catalogs caller wants to tag for. If set, only
* taxon matches with the catalog ID in this list will be returned by
* tagText()
*/
public Set<String> catalogs = new HashSet<String>();
public void addCatalogFilters(String[] cats) {
catalogs.addAll(Arrays.asList(cats));
tagAll = false;
// reset();
}
public void addCatalogFilter(String cat) {
catalogs.add(cat);
tagAll = false;
}
public void removeFilters() {
catalogs.clear();
tagAll = true;
}
/**
* Light-weight usage: text in, matches out. Behaviors: ACRONYMS matching
* lower case terms will automatically be omitted from results.
*
*/
@Override
public List<TextMatch> extract(String input_buf) throws ExtractionException {
return extractorImpl(null, input_buf);
}
/**
* Implementation details -- use with or without the formal ID/buffer
* pairing.
*
* @param id
* doc id
* @param buf
* input text
* @return list of matches
* @throws ExtractionException
*/
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
List<TextMatch> matches = new ArrayList<TextMatch>();
String docid = (id != null ? id : NO_DOC_ID);
Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);
@SuppressWarnings("unchecked")
List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
log.debug("TAGS SIZE = {}", tags.size());
/*
* Retrieve all offsets into a long list.
*/
TaxonMatch m = null;
// int x1 = -1, x2 = -1;
int tag_count = 0;
String id_prefix = docid + "#";
for (NamedList<?> tag : tags) {
m = new TaxonMatch();
m.start = ((Integer) tag.get("startOffset")).intValue();
m.end = ((Integer) tag.get("endOffset")).intValue();// +1 char after
// last matched
// m.pattern_id = "taxtag";
++tag_count;
m.match_id = id_prefix + tag_count;
// m.setText((String) tag.get("matchText")); // Not reliable.
// matchText can be null.
m.setText(buf.substring(m.start, m.end));
if (TextUtils.countFormattingSpace(m.getText()) > 1) {
// Phrases with words broken across more than one line are not
// valid matches.
// Phrase with a single TAB is okay
continue;
}
@SuppressWarnings("unchecked")
List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
for (Integer solrId : taxonIDs) {
Object refData = beanMap.get(solrId);
if (refData == null) {
continue;
}
/*
* Filter out non-Acronyms. e.g., 'who' is not a match for 'WHO'
*/
Taxon tx = (Taxon) refData;
if (this.filterNonAcronyms) {
if (tx.isAcronym && !m.isUpper()) {
continue;
}
}
m.addTaxon(tx);
}
// If the match has valid taxons add the match to the
// accumulator for this document.
//
if (m.hasTaxons()) {
matches.add(m);
}
}
log.debug("FOUND LABELS count={}", matches.size());
return matches;
}
/**
* "tags" are instances of the matching text spans from your input buffer
* "matchingDocs" are records from the taxonomy catalog. They have all the
* metadata.
*
* tags' ids array are pointers into matchingDocs, by Solr record ID.
*
* // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
* "startOffset":38}, // { "ids":[750308, 2769912, 2770041, 10413973,
* 10417546], "endOffset":49, // "startOffset":41}, // ... //
* "matchingDocs":{"numFound":75, "start":0, "docs":[ // {records matching}]
*
*/
@Override
public List<TextMatch> extract(TextInput input) throws ExtractionException {
return extractorImpl(input.id, input.buffer);
}
public static List<Taxon> search(SolrServer index, String query) throws SolrServerException {
ModifiableSolrParams qp = new ModifiableSolrParams();
qp.set(CommonParams.FL, "id,catalog,taxnode,phrase,tag,name_type");
qp.set(CommonParams.Q, query);
return search(index, qp);
}
public static List<Taxon> search(SolrServer index, SolrParams qparams) throws SolrServerException {
QueryResponse response = index.query(qparams, SolrRequest.METHOD.GET);
List<Taxon> taxons = new ArrayList<>();
SolrDocumentList docList = response.getResults();
for (SolrDocument solrDoc : docList) {
taxons.add(createTaxon(solrDoc));
}
return taxons;
}
/**
* search the current taxonomic catalog.
*
* @param query
* Solr "q" parameter only
* @return list of taxons
* @throws SolrServerException
* on err
*/
public List<Taxon> search(String query) throws SolrServerException {
return search(this.solr.getInternalSolrServer(), query);
}
/**
* search the current taxonomic catalog.
*
* @param qparams
* Solr parameters in full.
* @return list of taxons
* @throws SolrServerException
* on err
*/
public List<Taxon> search(SolrParams qparams) throws SolrServerException {
return search(this.solr.getInternalSolrServer(), qparams);
}
}