/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.languagetool.tools.StringTools;
/**
 * An array of {@link AnalyzedToken}s used to store multiple POS tags and lemmas
 * for a given single token.
 *
 * <p>Besides the readings themselves, this class caches a number of flags
 * (whitespace, linebreak, sentence start/end, paragraph end) that are derived
 * from the readings when the object is created and when a reading is added.
 *
 * @author Marcin Milkowski
 */
public class AnalyzedTokenReadings {

  /** All readings of this token; the constructors require at least one element. */
  protected AnalyzedToken[] anTokReadings;

  private int startPos;
  private String token;

  private boolean isWhitespace;
  private boolean isLinebreak;
  private boolean isSentEnd;
  private boolean isSentStart;
  private boolean isParaEnd;
  private boolean isWhitespaceBefore;

  /**
   * If true, then the token is marked up as immune against tests:
   * it should never be matched by any rule. Used to have generalized
   * mechanism for exceptions in rules.
   */
  private boolean isImmunized;

  /**
   * Used to hold the string representation of the disambiguator actions on a token.
   */
  private String historicalAnnotations = "";

  /**
   * Creates the readings from an array, which is copied.
   *
   * @param token readings of the token; must contain at least one element,
   *        as the first reading supplies the token text
   * @param startPos start position of the token
   */
  public AnalyzedTokenReadings(final AnalyzedToken[] token, final int startPos) {
    anTokReadings = token.clone();
    this.startPos = startPos;
    init();
  }

  /**
   * Creates the readings from a list, which is copied into an array.
   *
   * @param tokens readings of the token; must contain at least one element,
   *        as the first reading supplies the token text
   * @param startPos start position of the token
   */
  public AnalyzedTokenReadings(final List<AnalyzedToken> tokens, final int startPos) {
    anTokReadings = tokens.toArray(new AnalyzedToken[tokens.size()]);
    this.startPos = startPos;
    init();
  }

  /**
   * Creates an object with a single reading.
   *
   * @param token the only reading of the token
   * @param startPos start position of the token
   */
  public AnalyzedTokenReadings(final AnalyzedToken token, final int startPos) {
    this(token);
    this.startPos = startPos;
  }

  /**
   * Creates an object with a single reading and a start position of 0.
   *
   * @param token the only reading of the token
   */
  AnalyzedTokenReadings(final AnalyzedToken token) {
    anTokReadings = new AnalyzedToken[1];
    anTokReadings[0] = token;
    init();
  }

  /**
   * Derives the token text and all cached flags from the current readings.
   * The whitespace-before flag is taken from the first reading so that it is
   * set consistently no matter which constructor was used.
   */
  private void init() {
    final AnalyzedToken firstReading = anTokReadings[0];
    token = firstReading.getToken();
    isWhitespaceBefore = firstReading.isWhitespaceBefore();
    isWhitespace = StringTools.isWhitespace(token);
    isLinebreak = "\n".equals(token) || "\r\n".equals(token)
        || "\r".equals(token) || "\n\r".equals(token);
    // only the first reading is consulted for the sentence start tag
    isSentStart = JLanguageTool.SENTENCE_START_TAGNAME.equals(firstReading.getPOSTag());
    isParaEnd = hasPosTag(JLanguageTool.PARAGRAPH_END_TAGNAME);
    isSentEnd = hasPosTag(JLanguageTool.SENTENCE_END_TAGNAME);
    setNoRealPOStag();
  }

  /**
   * @return the readings as a fixed-size list backed by the internal array
   */
  public final List<AnalyzedToken> getReadings() {
    return Arrays.asList(anTokReadings);
  }

  /**
   * Get a token reading
   * @param idx index of the reading
   * @see #getReadingsLength() getReadingsLength() for how many token readings there are
   */
  public final AnalyzedToken getAnalyzedToken(final int idx) {
    return anTokReadings[idx];
  }

  /**
   * Checks if the token has a particular POS tag.
   *
   * @param posTag POS tag to look for
   * @return true if at least one reading has exactly this POS tag
   */
  public final boolean hasPosTag(final String posTag) {
    for (final AnalyzedToken reading : anTokReadings) {
      final String readingTag = reading.getPOSTag();
      if (readingTag != null && posTag.equals(readingTag)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks if one of the token's readings has a particular lemma.
   *
   * @param lemma lemma to look for
   * @return true if at least one reading has exactly this lemma
   */
  public final boolean hasLemma(final String lemma) {
    for (final AnalyzedToken reading : anTokReadings) {
      final String readingLemma = reading.getLemma();
      if (readingLemma != null && lemma.equals(readingLemma)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks if the token has a particular POS tag, whereas a part of the given POS tag needs to match.
   *
   * @param posTag POS tag substring to look for
   * @return true if the POS tag of at least one reading contains the given string
   * @since 1.8
   */
  public final boolean hasPartialPosTag(final String posTag) {
    for (final AnalyzedToken reading : anTokReadings) {
      final String readingTag = reading.getPOSTag();
      if (readingTag != null && readingTag.contains(posTag)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Add a new reading. If the current last reading has no POS tag, it is
   * replaced by the new reading instead of being kept. The whitespace-before
   * flag of the new reading is overwritten with the one of this object.
   *
   * @param token - new reading, given as {@link AnalyzedToken}
   */
  public final void addReading(final AnalyzedToken token) {
    final int lastIdx = anTokReadings.length - 1;
    final List<AnalyzedToken> readings = new ArrayList<AnalyzedToken>(anTokReadings.length + 1);
    for (int i = 0; i < lastIdx; i++) {
      readings.add(anTokReadings[i]);
    }
    // a last reading without POS tag is dropped in favor of the new reading
    if (anTokReadings[lastIdx].getPOSTag() != null) {
      readings.add(anTokReadings[lastIdx]);
    }
    token.setWhitespaceBefore(isWhitespaceBefore);
    readings.add(token);
    anTokReadings = readings.toArray(new AnalyzedToken[readings.size()]);
    if (token.getToken().length() > this.token.length()) { //in case a longer token is added
      this.token = token.getToken();
    }
    isParaEnd = hasPosTag(JLanguageTool.PARAGRAPH_END_TAGNAME);
    isSentEnd = hasPosTag(JLanguageTool.SENTENCE_END_TAGNAME);
    setNoRealPOStag();
  }

  /**
   * Removes a reading from the list of readings. Note: if the token
   * has only one reading, then a new reading with an empty POS tag
   * and an empty lemma is created.
   * @param token - reading to be removed.
   */
  public final void removeReading(final AnalyzedToken token) {
    filterReadings(token, false);
  }

  /**
   * Removes all the readings but the one that match the token given.
   * If no reading matches, a new reading with an empty POS tag and an
   * empty lemma is created.
   * @since 1.5
   * @param token Token to be matched
   */
  public final void leaveReading(final AnalyzedToken token) {
    filterReadings(token, true);
  }

  /**
   * Keeps either the readings that match the given token or those that
   * do not, and makes sure at least one reading is left afterwards.
   *
   * @param token token the readings are matched against
   * @param keepMatching if true, matching readings are kept; otherwise
   *        the non-matching readings are kept
   */
  private void filterReadings(final AnalyzedToken token, final boolean keepMatching) {
    // match against a fresh copy that carries only token, POS tag and lemma
    final AnalyzedToken pattern = new AnalyzedToken(token.getToken(),
        token.getPOSTag(), token.getLemma());
    pattern.setWhitespaceBefore(isWhitespaceBefore);
    final List<AnalyzedToken> kept = new ArrayList<AnalyzedToken>();
    for (final AnalyzedToken reading : anTokReadings) {
      if (reading.matches(pattern) == keepMatching) {
        kept.add(reading);
      }
    }
    if (kept.isEmpty()) {
      // never leave the token without readings; the placeholder gets the same
      // whitespace-before flag as every other reading of this token
      final AnalyzedToken placeholder = new AnalyzedToken(this.token, null, null);
      placeholder.setWhitespaceBefore(isWhitespaceBefore);
      kept.add(placeholder);
    }
    anTokReadings = kept.toArray(new AnalyzedToken[kept.size()]);
    setNoRealPOStag();
  }

  /**
   * @return the number of readings
   */
  public final int getReadingsLength() {
    return anTokReadings.length;
  }

  /**
   * @return true if the token of the first reading was whitespace when
   *         this object was created
   */
  public final boolean isWhitespace() {
    return isWhitespace;
  }

  /**
   * Returns true if the token equals \n, \r, \r\n or \n\r.
   */
  public final boolean isLinebreak() {
    return isLinebreak;
  }

  /**
   * @return true if the first reading had the sentence start tag when
   *         this object was created
   */
  public final boolean isSentStart() {
    return isSentStart;
  }

  /**
   * @return true when the token is a last token in a paragraph.
   */
  public final boolean isParaEnd() {
    return isParaEnd;
  }

  /**
   * Add PARA_END tag. Does nothing if the token is already marked
   * as the end of a paragraph.
   */
  public void setParaEnd() {
    if (!isParaEnd()) {
      final AnalyzedToken paragraphEnd = new AnalyzedToken(getToken(),
          JLanguageTool.PARAGRAPH_END_TAGNAME, getAnalyzedToken(0).getLemma());
      addReading(paragraphEnd);
    }
  }

  /**
   * @return true when the token is a last token in a sentence.
   */
  public final boolean isSentEnd() {
    return isSentEnd;
  }

  /**
   * @since 0.9.9
   * @return true if the token is OpenOffice field code.
   */
  public final boolean isFieldCode() {
    return "\u0001".equals(token) || "\u0002".equals(token);
  }

  /**
   * Add a SENT_END tag. Does nothing if the token is already marked
   * as the end of a sentence.
   */
  public final void setSentEnd() {
    if (!isSentEnd()) {
      final AnalyzedToken sentenceEnd = new AnalyzedToken(getToken(),
          JLanguageTool.SENTENCE_END_TAGNAME, getAnalyzedToken(0).getLemma());
      addReading(sentenceEnd);
    }
  }

  public final int getStartPos() {
    return startPos;
  }

  public final void setStartPos(final int position) {
    startPos = position;
  }

  public final String getToken() {
    return token;
  }

  /**
   * Sets the whitespace-before flag on this object and on all its readings.
   *
   * @param isWhiteSpaceBefore new value of the flag
   */
  public final void setWhitespaceBefore(final boolean isWhiteSpaceBefore) {
    isWhitespaceBefore = isWhiteSpaceBefore;
    for (final AnalyzedToken aTok : anTokReadings) {
      aTok.setWhitespaceBefore(isWhiteSpaceBefore);
    }
  }

  public final boolean isWhitespaceBefore() {
    return isWhitespaceBefore;
  }

  /**
   * Marks the token as immune against tests; this cannot be undone.
   */
  public final void immunize() {
    isImmunized = true;
  }

  public final boolean isImmunized() {
    return isImmunized;
  }

  /**
   * Sets the flag on AnalyzedTokens to make matching
   * on "UNKNOWN" POS tag correct in the Element class.
   * The flag is set to true on all readings if the token is not a linebreak
   * and no reading has a POS tag other than the paragraph end and
   * sentence end tags.
   */
  private void setNoRealPOStag() {
    boolean hasNoPOStag = !isLinebreak();
    for (final AnalyzedToken reading : anTokReadings) {
      final String posTag = reading.getPOSTag();
      final boolean isEndTag = JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posTag)
          || JLanguageTool.SENTENCE_END_TAGNAME.equals(posTag);
      // the end tags are only markers and do not count as real POS tags
      if (!isEndTag && posTag != null) {
        hasNoPOStag = false;
      }
    }
    for (final AnalyzedToken reading : anTokReadings) {
      reading.setNoPOSTag(hasNoPOStag);
    }
  }

  /**
   * @return the token followed by its comma-separated readings in square
   *         brackets; a reading without whitespace before it is marked with "*"
   */
  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder();
    sb.append(token);
    sb.append("[");
    String separator = "";
    for (final AnalyzedToken element : anTokReadings) {
      sb.append(separator);
      sb.append(element);
      if (!element.isWhitespaceBefore()) {
        sb.append("*");
      }
      separator = ",";
    }
    sb.append("]");
    return sb.toString();
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + Arrays.hashCode(anTokReadings);
    result = prime * result + (isLinebreak ? 1231 : 1237);
    result = prime * result + (isParaEnd ? 1231 : 1237);
    result = prime * result + (isSentEnd ? 1231 : 1237);
    result = prime * result + (isSentStart ? 1231 : 1237);
    result = prime * result + (isWhitespace ? 1231 : 1237);
    result = prime * result + (isWhitespaceBefore ? 1231 : 1237);
    // equals() compares this flag as well, so it is part of the hash
    result = prime * result + (isImmunized ? 1231 : 1237);
    result = prime * result + startPos;
    result = prime * result + ((token == null) ? 0 : token.hashCode());
    return result;
  }

  /**
   * Two objects are equal if they are of the same class and have equal
   * readings, flags, start position and token. The historical annotations
   * are not compared.
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final AnalyzedTokenReadings other = (AnalyzedTokenReadings) obj;
    if (!Arrays.equals(anTokReadings, other.anTokReadings)) {
      return false;
    }
    if (isLinebreak != other.isLinebreak
        || isParaEnd != other.isParaEnd
        || isSentEnd != other.isSentEnd
        || isSentStart != other.isSentStart
        || isWhitespace != other.isWhitespace
        || isWhitespaceBefore != other.isWhitespaceBefore
        || isImmunized != other.isImmunized
        || startPos != other.startPos) {
      return false;
    }
    if (token == null) {
      return other.token == null;
    }
    return token.equals(other.token);
  }

  /**
   * Used to track disambiguator actions.
   * @return the historicalAnnotations
   */
  public String getHistoricalAnnotations() {
    return historicalAnnotations;
  }

  /**
   * Used to track disambiguator actions.
   * @param historicalAnnotations the historicalAnnotations to set
   */
  public void setHistoricalAnnotations(String historicalAnnotations) {
    this.historicalAnnotations = historicalAnnotations;
  }
}