/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cogroo.entities; import java.io.Serializable; import java.util.Arrays; import java.util.Objects; import org.cogroo.entities.impl.ChunkTag; import org.cogroo.entities.impl.MorphologicalTag; import org.cogroo.entities.impl.SyntacticTag; import org.cogroo.util.ToStringHelper; import opennlp.tools.util.Span; /** * A token is the smallest annotated unit of the text. Examples: "home" "," "." * "12.55" * * @author William Colen * */ public abstract class Token implements Serializable { private static final long serialVersionUID = 5748072170017854287L; /** * The string of the token as it is written in the text. */ protected String lexeme; /** * A lexeme from which derives the lexeme of this token. Example: lexeme = * meninas, primitive = menino */ protected String[] primitive; /** * States the morphological function of this token. */ protected MorphologicalTag morphologicalTag; /** * States if the token starts a phrase, is part of a phrase, or none of them, * in the sentence. */ protected ChunkTag chunkTag; /** * States the chunk that the token is part of. */ protected Chunk chunk; /** * States the type of the lexeme. Example: a word, a punctuation mark, a * number, etc. */ protected LexemeTypes lexemeType; /** * The indexes, counted by chars, that represents the position of the token in * the sentence. The first char of the token is the start index and the last * char of the token + 1 is the end index. */ protected Span span; private SyntacticChunk syntacticChunk; /** * Constructs a token without a lexeme and with a default span (0, 0). * */ public Token() { this.span = new Span(0, 0); } /** * Constructs a token with the given span. * * @param span */ public Token(Span span) { this.span = span; } /** * Constructs a token without a lexeme and with the given span indexes. * * @param start * the start index of the span * @param end * the end index of the span */ public Token(int start, int end) { this.span = new Span(start, end); } public String getLexeme() { return this.lexeme; } public abstract void setLexeme(String lexeme); public String[] getPrimitive() { return this.primitive; } public void setPrimitive(String[] primitive) { this.primitive = primitive; } public MorphologicalTag getMorphologicalTag() { return this.morphologicalTag; } public void setMorphologicalTag(MorphologicalTag tag) { this.morphologicalTag = tag; } public ChunkTag getChunkTag() { return this.chunkTag; } public void setChunkTag(ChunkTag ct) { this.chunkTag = ct; } public void setSpan(Span span) { this.span = span; } public SyntacticTag getSyntacticTag() { if (this.getSyntacticChunk() == null) { return null; } return this.getSyntacticChunk().getSyntacticTag(); } public Chunk getChunk() { return this.chunk; } public void setChunk(Chunk chunk) { this.chunk = chunk; } public LexemeTypes getLexemeType() { return this.lexemeType; } public Span getSpan() { return this.span; } @Override public String toString() { return ToStringHelper.toStringHelper(this).add("lxm", lexeme).add("pr", Arrays.toString(primitive)) .add("mp", morphologicalTag).add("ch", chunkTag) // .add("lexemeType", lexemeType) // .add("span", span) .toString(); } @Override public boolean equals(Object obj) { if (obj instanceof Token) { Token that = (Token) obj; return Objects.equals(this.lexeme, that.lexeme) && Objects.equals(this.primitive, that.primitive) && Objects.equals(this.morphologicalTag, that.morphologicalTag) && Objects.equals(this.chunkTag, that.chunkTag) // && Objects.equals(this.syntacticChunk, that.syntacticChunk) // && Objects.equals(this.chunk, that.chunk) && Objects.equals(this.lexemeType, that.lexemeType) && Objects.equals(this.span, that.span); } return false; } /* * (non-Javadoc) * @see java.lang.Object#hashCode() */ @Override public int hashCode() { return Objects.hash(this.lexeme, this.primitive, this.morphologicalTag, this.chunkTag, this.lexemeType, this.span); } public void setSyntacticChunk(SyntacticChunk syntacticChunk) { this.syntacticChunk = syntacticChunk; } public SyntacticChunk getSyntacticChunk() { return this.syntacticChunk; } }