//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.structural;

import java.lang.reflect.Constructor;
import java.util.Collections;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ConfigUtils;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;

/**
 * Annotates the cells of selected table columns as entities, where the columns
 * are selected by matching a regular expression against their name.
 *
 * <p>
 * The user-supplied regular expression is run over the table headers (or row 1
 * if there are no headers) to pick out columns. Every TableCell in a matching
 * column is then annotated with a user-specified type, which must inherit from
 * the Entity class. A confidence value to assign to the created annotations can
 * also be configured.
 * </p>
 *
 * @baleen.javadoc
 */
public class TableEntity extends BaleenAnnotator {

	/**
	 * Is the regular expression case sensitive?
	 *
	 * @baleen.config false
	 */
	public static final String PARAM_CASE_SENSITIVE = "caseSensitive";
	@ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue = "false")
	private boolean caseSensitive = false;

	/**
	 * The regular expression to search for
	 *
	 * @baleen.config
	 */
	public static final String PARAM_PATTERN = "pattern";
	@ConfigurationParameter(name = PARAM_PATTERN, defaultValue = "")
	private String pattern;

	/**
	 * The entity type to use for matched entities
	 *
	 * @baleen.config uk.gov.dstl.baleen.types.semantic.Entity
	 */
	public static final String PARAM_TYPE = "type";
	@ConfigurationParameter(name = PARAM_TYPE, defaultValue = "uk.gov.dstl.baleen.types.semantic.Entity")
	private String type;

	/**
	 * The entity subType to use for matched entities
	 *
	 * @baleen.config
	 */
	public static final String PARAM_SUB_TYPE = "subType";
	@ConfigurationParameter(name = PARAM_SUB_TYPE, defaultValue = "")
	private String subType;

	/**
	 * The confidence to assign to matched entities
	 *
	 * @baleen.config 1.0
	 */
	public static final String PARAM_CONFIDENCE = "confidence";
	@ConfigurationParameter(name = PARAM_CONFIDENCE, defaultValue = "1.0")
	private String confidenceString;

	// The confidence is configured as a string and parsed into this field to
	// avoid issues with parameter types
	private Float confidence;

	// Compiled form of the configured regular expression, built in doInitialize
	private Pattern p = null;

	// Single-argument (JCas) constructor of the configured entity class
	private Constructor<? extends Entity> constructor;

	/**
	 * Prepares the annotator from its configuration: parses the confidence,
	 * compiles the column-name pattern and resolves the constructor of the
	 * configured entity type.
	 *
	 * @param aContext
	 *            the UIMA context (not read directly by this method)
	 * @throws ResourceInitializationException
	 *             if the entity class cannot be resolved or has no accessible
	 *             constructor taking a {@link JCas}
	 */
	@Override
	public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
		// 1.0f is handed to the parser alongside the raw string; presumably it is
		// the fallback for unparseable input - confirm against ConfigUtils
		confidence = ConfigUtils.stringToFloat(confidenceString, 1.0f);

		// Case-insensitive matching is requested through an embedded flag, which
		// is stored back into the configured pattern before compiling
		pattern = caseSensitive ? pattern : "(?i)" + pattern;
		p = Pattern.compile(pattern);
		getMonitor().debug("The regular expression is \"{}\"", p.pattern());

		try {
			// A JCas built from the shared type system description is passed to
			// the entity class lookup
			final JCas typeLookupJCas = JCasFactory
					.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance());
			final Class<? extends Entity> entityClass = TypeUtils.getEntityClass(type, typeLookupJCas);
			constructor = entityClass.getConstructor(JCas.class);
		} catch (UIMAException | BaleenException | NoSuchMethodException | SecurityException e) {
			throw new ResourceInitializationException(e);
		}
	}

	/**
	 * Creates an entity over every non-blank cell returned by the table filter
	 * for the configured column pattern.
	 *
	 * @param jCas
	 *            the document being processed
	 * @throws AnalysisEngineProcessException
	 *             declared by the overridden method; failures while creating an
	 *             entity surface as a {@link RuntimeException} instead
	 */
	@Override
	protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
		final Stream<TableCell> matchedCells = new Tables(jCas).withColumn(p).getFilteredCells();

		matchedCells.forEach(cell -> {
			final String cellText = cell.getCoveredText();
			if (StringUtils.isNotBlank(cellText)) {
				addEntityForCell(jCas, cell, cellText);
			}
		});
	}

	/**
	 * Instantiates the configured entity type over the span of a single cell and
	 * adds it to the JCas index.
	 *
	 * @param jCas
	 *            the document being processed
	 * @param cell
	 *            the table cell whose begin/end offsets the entity takes
	 * @param cellText
	 *            the covered text of the cell, used as the entity value
	 * @throws RuntimeException
	 *             wrapping any exception raised while constructing, populating
	 *             or indexing the entity
	 */
	private void addEntityForCell(JCas jCas, TableCell cell, String cellText) {
		try {
			final Entity entity = constructor.newInstance(jCas);
			entity.setBegin(cell.getBegin());
			entity.setEnd(cell.getEnd());
			entity.setValue(cellText);
			entity.setConfidence(confidence);

			// The sub type is optional; leave it unset when none was configured
			if (!Strings.isNullOrEmpty(subType)) {
				entity.setSubType(subType);
			}

			addToJCasIndex(entity);
		} catch (Exception e) {
			throw new RuntimeException("Can not create entity type " + type, e);
		}
	}

	/**
	 * Describes this annotator to the pipeline orderer.
	 *
	 * @return an action built from an empty set and the class of the configured
	 *         entity type (presumably the inputs and outputs respectively -
	 *         confirm against AnalysisEngineAction)
	 */
	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(
				Collections.emptySet(),
				ImmutableSet.of(constructor.getDeclaringClass()));
	}
}