/** Copyright 2009-2013 The MITRE Corporation. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. * ************************************************************************** * NOTICE * This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** **/ package org.opensextant.toolbox; import java.util.HashMap; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import gate.Annotation; import gate.AnnotationSet; import gate.FeatureMap; import gate.ProcessingResource; import gate.Resource; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; /** * * This GATE Processing Resource determines if a document appears to be in * all/mostly lower case, all/mostly upper case or proper mixed case. Its * primary purpose is to allow downstream rule sets to be applied only where * their case assumptions are valid. * */ @CreoleResource(name = "OpenSextant Case Detector ", comment = "Determines if the document is in proper case," + " all upper case or all lower case") public class CaseDetectorPR extends AbstractLanguageAnalyser implements ProcessingResource { /** * */ private static final long serialVersionUID = -8638479048484746444L; // The parameters passed in by the user String inputASName; // The name of the input AnnotationSet String tokenAnnoName; // the annotation to examine,usually "Token" String sentenceAnnoName; // the annotation to examine,usually "Sentence" String stringFeatureName; // the existing feature on tokenAnnoType // containing the word, usually "string" String caseFeatureName; // the feature on the token which is the case // category usually "orth" String caseDecisionName; // the feature to create on the document that // contains the result of the analysis // Log object private static final Logger LOGGER = LoggerFactory.getLogger(CaseDetectorPR.class); // the thresholds thst define the majority case level // TODO: make these runtime parameters double lowerCaseThresh = 0.90; double upperCaseThresh = 0.90; double sentenceRatioThresh = 1.0; /** * @return * @throws ResourceInstantiationException */ @Override public Resource init() throws ResourceInstantiationException { return this; } /** * @throws ResourceInstantiationException */ @Override public void reInit() throws ResourceInstantiationException { init(); } /** * @throws ExecutionException */ @Override public void execute() throws ExecutionException { // If no Annotation set was given, use the default AnnotationSet annotSet = (inputASName == null || "".equals(inputASName)) ? document.getAnnotations() : document.getAnnotations(inputASName); // the histogram of case types counts seen in the document Map<String, Integer> caseCounts = new HashMap<String, Integer>(); // find out how many sentences there are in the document AnnotationSet sentenceSet = annotSet.get(sentenceAnnoName); int sentenceCount = sentenceSet.size(); // get the tokens AnnotationSet tokenSet = annotSet.get(tokenAnnoName); // see if there any tokens to work with if (tokenSet == null || tokenSet.isEmpty()) { LOGGER.error("No tokens found in " + document.getName()); return; } // accumulate the case stats for (Annotation an : tokenSet) { FeatureMap fm = an.getFeatures(); // get the case for this token String kase = (String) fm.get(caseFeatureName); if (!caseCounts.containsKey(kase)) { caseCounts.put(kase, 0); } // increment case count caseCounts.put(kase, caseCounts.get(kase) + 1); } // make decision String decision = null; // sum total relevant counts // TODO: the case names should be parameterized Integer lowercaseCount = caseCounts.get("lowercase"); Integer uppercaseCount = caseCounts.get("allCaps"); Integer initialcaseCount = caseCounts.get("upperInitial"); // set nulls to 0 if (lowercaseCount == null) { lowercaseCount = 0; } if (uppercaseCount == null) { uppercaseCount = 0; } if (initialcaseCount == null) { initialcaseCount = 0; } // calculate percentage of total for each case category double total = 1.0 * lowercaseCount + uppercaseCount + initialcaseCount; double lowerPercent = lowercaseCount / total; double upperPercent = uppercaseCount / total; double sentenceInitialRatio = initialcaseCount / (1.0 * sentenceCount); // if mostly lower case and (significantly) fewer initials than // sentences if (lowerPercent > lowerCaseThresh && sentenceInitialRatio < sentenceRatioThresh) { decision = "LOWERCASE"; } // if mostly upper case if (upperPercent > upperCaseThresh) { decision = "UPPERCASE"; } // if neither of the above, must be proper if (decision == null) { decision = "PROPERCASE"; } // attach decision to document document.getFeatures().put(caseDecisionName, decision); // cleanup caseCounts.clear(); } // end execute /** * @return */ public String getInputASName() { return inputASName; } /** * @param inputASName */ @Optional @RunTime @CreoleParameter public void setInputASName(String inputASName) { this.inputASName = inputASName; } /** * @return */ public String getTokenAnnoName() { return tokenAnnoName; } /** * @param tokenAnnoName */ @Optional @RunTime @CreoleParameter(defaultValue = "Token") public void setTokenAnnoName(String tokenAnnoName) { this.tokenAnnoName = tokenAnnoName; } /** * @return */ public String getSentenceAnnoName() { return sentenceAnnoName; } /** * @param sentenceAnnoName */ @Optional @RunTime @CreoleParameter(defaultValue = "Sentence") public void setSentenceAnnoName(String sentenceAnnoName) { this.sentenceAnnoName = sentenceAnnoName; } /** * @return */ public String getStringFeatureName() { return stringFeatureName; } /** * @param stringFeatureName */ @Optional @RunTime @CreoleParameter(defaultValue = "string") public void setStringFeatureName(String stringFeatureName) { this.stringFeatureName = stringFeatureName; } /** * @return */ public String getCaseFeatureName() { return caseFeatureName; } /** * @param caseFeatureName */ @Optional @RunTime @CreoleParameter(defaultValue = "orth") public void setCaseFeatureName(String caseFeatureName) { this.caseFeatureName = caseFeatureName; } /** * @return */ public String getCaseDecisionName() { return caseDecisionName; } /** * @param caseDecisionName */ @Optional @RunTime @CreoleParameter(defaultValue = "CaseDecision") public void setCaseDecisionName(String caseDecisionName) { this.caseDecisionName = caseDecisionName; } }