/**
*
* Copyright 2012-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* **************************************************************************
* NOTICE
* This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
// _____ ____ __ __
///\ __`\ /\ _`\ /\ \__ /\ \__
//\ \ \/\ \ _____ __ ___ \ \,\L\_\ __ __ _\ \ ,_\ __ ___ \ \ ,_\
// \ \ \ \ \ /\ '__`\ /'__`\ /' _ `\ \/_\__ \ /'__`\/\ \/'\\ \ \/ /'__`\ /' _ `\\ \ \/
// \ \ \_\ \\ \ \L\ \/\ __/ /\ \/\ \ /\ \L\ \ /\ __/\/> </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
// \ \_____\\ \ ,__/\ \____\\ \_\ \_\ \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
// \/_____/ \ \ \/ \/____/ \/_/\/_/ \/_____/ \/____/\//\/_/ \/__/ \/__/\/_/ \/_/\/_/ \/__/
// \ \_\
// \/_/
//
// OpenSextant Commons
// * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
// */
package org.opensextant.extraction;
import org.opensextant.util.TextUtils;
/**
* A very simple struct to hold data useful for post-processing entities once found.
*
* @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
*/
public class TextEntity {

    /**
     * The matched text of this entity; null until set.
     */
    protected String text = null;

    /**
     * char offset of entity; location in document where entity starts.
     * A negative value means the span has not been set.
     */
    public int start = -1;

    /**
     * char offset of entity; location in document where entity ends.
     */
    public int end = -1;

    /** Context as one buffer: either set directly or assembled from the before/after windows. */
    private String context = null;

    /** Text before the match; only set by {@link #setContext(String, String)} or {@link #copy(TextEntity)}. */
    private String prematch = null;

    /** Text after the match; only set by {@link #setContext(String, String)} or {@link #copy(TextEntity)}. */
    private String postmatch = null;

    /** Identifier for this match; not interpreted by this class. */
    public String match_id = null;

    /** If this entity is contained completely within some other */
    public boolean is_submatch = false;

    /** If this entity overlaps with some other */
    public boolean is_overlap = false;

    /** If this entity is a duplicate of some other */
    public boolean is_duplicate = false;

    // Flags derived from the text by setText(); carried over by copy().
    private boolean isLower = false;
    private boolean isUpper = false;
    private boolean isASCII = false;

    /**
     * Creates an empty entity: no text, offsets at -1, all flags false.
     */
    public TextEntity() {
    }

    /**
     * Sets the value of the TextEntity and recomputes the case and ASCII flags.
     * A null value clears the flags so they never describe a previous value.
     *
     * @param t
     *            text, may be null
     */
    public void setText(String t) {
        text = t;
        if (text == null) {
            isLower = false;
            isUpper = false;
            isASCII = false;
            return;
        }

        isLower = TextUtils.isLower(text);
        isUpper = TextUtils.isUpper(text);

        // Worth tracking if matched text is ASCII only. If name or entity has diacritics then
        // you may look at it differently.
        try {
            isASCII = TextUtils.isASCII(TextUtils.removePunctuation(text));
        } catch (Exception err) {
            // Best effort: any failure while testing the text is reported as "not ASCII".
            isASCII = false;
        }
    }

    /**
     * Set just the value, without incurring the cost of other
     * metrics or flags about the text that likely are unchanged.
     * The case and ASCII flags are NOT recomputed here; they keep whatever
     * values they had before this call.
     *
     * @param t
     *            text, may be null
     */
    public void setTextOnly(String t) {
        text = t;
    }

    /**
     * If non-punctuation content is purely ASCII vs. Latin1 vs. unicode.
     *
     * @return true if text value, as last seen by {@link #setText(String)}, is purely ASCII
     */
    public boolean isASCII() {
        return isASCII;
    }

    /**
     * test If text (that has a case sense) is ALL lower case
     *
     * @return true if all lower, as last computed by {@link #setText(String)}
     */
    public boolean isLower() {
        return isLower;
    }

    /**
     * test If text (that has a case sense) is ALL upper case
     *
     * @return true if all upper, as last computed by {@link #setText(String)}
     */
    public boolean isUpper() {
        return isUpper;
    }

    /**
     * @return text, value of a TextEntity; may be null
     */
    public String getText() {
        return text;
    }

    /**
     * get the length of the matched span, computed from the offsets rather than the text.
     *
     * @return 0 if the start offset is negative (match not initialized),
     *         otherwise {@code end - start}
     */
    public int getLength() {
        if (start < 0) {
            // Match not initialized
            return 0;
        }
        return end - start;
    }

    // Convenience methods for carrying the context through the output processing

    /**
     * Set the context with before and after windows. Both windows are stored as given,
     * and the single context buffer is rebuilt as {@code before + " " + text + " " + after}.
     * A null window or null text contributes an empty string to that buffer
     * instead of the literal characters "null".
     *
     * @param before
     *            text before match
     * @param after
     *            text after match
     */
    public void setContext(String before, String after) {
        this.prematch = before;
        this.postmatch = after;

        StringBuilder buf = new StringBuilder();
        buf.append(nullToEmpty(before));
        buf.append(" ");
        buf.append(nullToEmpty(this.text));
        buf.append(" ");
        buf.append(nullToEmpty(after));
        this.context = buf.toString();
    }

    /** Maps null to the empty string so it is not rendered as "null" when appended. */
    private static String nullToEmpty(String s) {
        return (s == null) ? "" : s;
    }

    /**
     * Set the context buffer from a single window.
     * The separate before/after windows are left untouched.
     *
     * @param window
     *            textual window
     */
    public void setContext(String window) {
        this.context = window;
    }

    /**
     * @return context buffer regardless if it is singular context or separate pre/post match
     */
    public String getContext() {
        return this.context;
    }

    /**
     * @return text before match; null if never set
     */
    public String getContextBefore() {
        return this.prematch;
    }

    /**
     * @return text after match; null if never set
     */
    public String getContextAfter() {
        return this.postmatch;
    }

    /**
     * @return string representation of entity, in the form {@code text @(start:end)}
     */
    @Override
    public String toString() {
        return text + " @(" + start + ":" + end + ")";
    }

    /**
     * Copies the state of another entity into this one: text, offsets, match flags,
     * context windows, match id, and the case/ASCII flags derived from the text.
     *
     * @param m
     *            match/entity object to copy; must not be null
     */
    public void copy(TextEntity m) {
        // TextMatch generic stuff:
        this.text = m.text;
        this.start = m.start;
        this.end = m.end;

        this.is_duplicate = m.is_duplicate;
        this.is_overlap = m.is_overlap;
        this.is_submatch = m.is_submatch;

        // Derived text flags travel with the text; otherwise the copy would
        // report false for all of them even though its text is set.
        this.isLower = m.isLower;
        this.isUpper = m.isUpper;
        this.isASCII = m.isASCII;

        this.postmatch = m.postmatch;
        this.prematch = m.prematch;
        this.context = m.context;
        this.match_id = m.match_id;
    }

    /**
     * Tests whether this span lies inside the span of {@code t}; bounds are inclusive,
     * so an identical span also counts as within.
     *
     * @param t
     *            other entity
     * @return true if {@code start >= t.start} and {@code end <= t.end}
     */
    public boolean isWithin(TextEntity t) {
        return (end <= t.end && start >= t.start);
    }

    /**
     * Tests whether both entities cover exactly the same span.
     *
     * @param t
     *            other entity
     * @return true if start and end offsets are both equal
     */
    public boolean isSameMatch(TextEntity t) {
        return (start == t.start && end == t.end);
    }

    /**
     * Tests whether both entities begin at the same offset.
     * NOTE(review): the name suggests the right edge, but the comparison is on
     * {@code start}; behavior is kept as-is for existing callers.
     *
     * @param t
     *            other entity
     * @return true if the start offsets are equal
     */
    public boolean isRightMatch(TextEntity t) {
        return (start == t.start);
    }

    /**
     * Tests whether both entities finish at the same offset.
     * NOTE(review): the name suggests the left edge, but the comparison is on
     * {@code end}; behavior is kept as-is for existing callers.
     *
     * @param t
     *            other entity
     * @return true if the end offsets are equal
     */
    public boolean isLeftMatch(TextEntity t) {
        return (end == t.end);
    }

    /**
     * Tests for a strict partial overlap with {@code t}. Spans that share a start
     * or end offset, that merely touch, or where one fully contains the other
     * are not reported as overlapping.
     *
     * @param t
     *            other entity
     * @return true if exactly one edge of this span falls strictly inside {@code t}
     */
    public boolean isOverlap(TextEntity t) {
        // t overlaps with self on the left side:
        // self starts strictly inside t and runs past the end of t.
        boolean overlapsOnLeft = end > t.end && start > t.start && start < t.end;

        // OR t overlaps with self on right side:
        // self starts before t and ends strictly inside t.
        boolean overlapsOnRight = end < t.end && start < t.start && end > t.start;

        return overlapsOnLeft || overlapsOnRight;
    }
}