package hu.ppke.itk.nlpg.purepos.common.lemma;
import hu.ppke.itk.nlpg.purepos.common.Util;
import hu.ppke.itk.nlpg.purepos.common.lemma.GeneralizedLemmaTransformation.Transformation;
import java.io.Serializable;
import org.apache.commons.lang3.tuple.Pair;
public class GeneralizedLemmaTransformation extends
AbstractLemmaTransformation<Transformation> {
/**
 * Creates a lemma transformation for the given word/lemma pair.
 *
 * All three arguments are passed unchanged to the
 * {@link AbstractLemmaTransformation} constructor.
 *
 * @param word
 *            the surface form
 * @param lemma
 *            the lemma belonging to the surface form
 * @param tag
 *            the tag identifier stored with the transformation
 */
public GeneralizedLemmaTransformation(String word, String lemma, Integer tag) {
super(word, lemma, tag);
}
// Serialization version identifier of the outer transformation class.
private static final long serialVersionUID = -2376160223585239419L;
/**
 * Immutable description of how a word form is rewritten into its lemma:
 * how many characters are cut from each end, what is attached instead,
 * which tag the result carries and whether the lemma stays lower-cased.
 *
 * The string form and the hash code are computed once in the constructor.
 * Equality covers every field that takes part in the string form, so equal
 * instances always share the same hash code.
 */
public class Transformation implements Serializable {
    private static final long serialVersionUID = 8291301251800430106L;
    /** Number of characters removed from the beginning. */
    public final int removeStart;
    /** Number of characters removed from the end. */
    public final int removeEnd;
    /** Text attached in front of what remains. */
    public final String addStart;
    /** Text attached after what remains. */
    public final String addEnd;
    /** Tag identifier stored with the transformation. */
    public final Integer tag;
    /** Whether the produced lemma is kept lower-cased. */
    public final boolean toLower;
    private final String strRep;
    private final int hashCode;

    /**
     * @param removeStart
     *            number of characters to cut from the beginning
     * @param removeEnd
     *            number of characters to cut from the end
     * @param addStart
     *            text to attach at the beginning
     * @param addEnd
     *            text to attach at the end
     * @param tag
     *            tag identifier of the result
     * @param toLower
     *            whether the result stays lower-cased
     */
    public Transformation(int removeStart, int removeEnd, String addStart,
            String addEnd, Integer tag, boolean toLower) {
        this.removeStart = removeStart;
        this.removeEnd = removeEnd;
        this.addStart = addStart;
        this.addEnd = addEnd;
        this.tag = tag;
        this.toLower = toLower;
        String l = toLower ? "_" : "-";
        this.strRep = "(" + l + ",< -" + removeStart + "+'" + addStart
                + "', >-" + removeEnd + "+'" + addEnd + "' -" + tag + ")";
        this.hashCode = this.strRep.hashCode();
    }

    /** Returns the string form built in the constructor. */
    @Override
    public String toString() {
        return strRep;
    }

    /** Returns the cached hash of the string form. */
    @Override
    public int hashCode() {
        return this.hashCode;
    }

    /**
     * Two transformations are equal when all six fields match.
     * {@code toLower} is part of the comparison because it is also part of
     * the string form the hash code is derived from; leaving it out would
     * make equal objects report different hash codes.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof Transformation)) {
            return false;
        }
        Transformation o = (Transformation) other;
        return this.removeEnd == o.removeEnd
                && this.removeStart == o.removeStart
                && this.toLower == o.toLower
                && sameValue(this.addStart, o.addStart)
                && sameValue(this.addEnd, o.addEnd)
                && sameValue(this.tag, o.tag);
    }

    // Null-tolerant equality so that equals() never throws on null fields.
    private boolean sameValue(Object a, Object b) {
        return a == null ? b == null : a.equals(b);
    }
}
/**
 * Finds the longest common substring of two strings, ignoring case.
 *
 * See http://karussell.wordpress.com/2011/04/14/longest-common-substring-
 * algorithm-in-java/
 *
 * Both inputs are lower-cased with {@link String#toLowerCase()} before the
 * comparison, so the reported offset refers to the lower-cased form of
 * {@code str1}. When several common substrings share the maximal length,
 * the one that ends earliest in {@code str1} is reported.
 *
 * @param str1
 *            first string; the returned start position indexes into it
 * @param str2
 *            second string
 * @return pair of (start position in {@code str1}, length); {@code (0, 0)}
 *         if either argument is null or empty, or nothing is shared
 */
public static Pair<Integer, Integer> longestSubstring(String str1,
        String str2) {
    if (str1 == null || str1.isEmpty() || str2 == null || str2.isEmpty()) {
        return Pair.of(0, 0);
    }
    // ignore case
    str1 = str1.toLowerCase();
    str2 = str2.toLowerCase();
    // num[i][j] is the length of the common run ending at str1[i] and
    // str2[j]; java initializes the cells with 0
    int[][] num = new int[str1.length()][str2.length()];
    int maxlen = 0;
    int subsBegin = 0;
    for (int i = 0; i < str1.length(); i++) {
        for (int j = 0; j < str2.length(); j++) {
            if (str1.charAt(i) != str2.charAt(j)) {
                continue;
            }
            num[i][j] = (i == 0 || j == 0) ? 1 : 1 + num[i - 1][j - 1];
            // strict comparison keeps the first maximal run that was found
            if (num[i][j] > maxlen) {
                maxlen = num[i][j];
                // the run ends at i, so it starts maxlen - 1 earlier
                subsBegin = i - maxlen + 1;
            }
        }
    }
    return Pair.of(subsBegin, maxlen);
}
/**
 * Reports how many characters this transformation removes from the end of
 * a word.
 *
 * @return the {@code removeEnd} value of the stored representation
 */
@Override
public int minimalCutLength() {
    final int charsCutFromEnd = this.representation.removeEnd;
    return charsCutFromEnd;
}
/**
 * Tells whether the word starts with a character that is unchanged by
 * upper-casing while the lemma starts with a character that is unchanged
 * by lower-casing.
 *
 * @param word
 *            the surface form
 * @param lemma
 *            the lemma
 * @return {@code true} if both conditions hold; {@code false} otherwise or
 *         when either string is empty
 */
protected boolean lowerTransformed(String word, String lemma) {
    if (word.isEmpty() || lemma.isEmpty()) {
        return false;
    }
    final String wordHead = word.substring(0, 1);
    if (!wordHead.equals(wordHead.toUpperCase())) {
        return false;
    }
    final String lemmaHead = lemma.substring(0, 1);
    return lemmaHead.equals(lemmaHead.toLowerCase());
}
/**
 * Derives the transformation that turns {@code word} into {@code lemma}.
 *
 * The longest common substring of the two (case-insensitive) is kept;
 * everything of the word before and after it is removed and replaced by
 * the corresponding parts of the lemma. If fewer than two characters are
 * shared, the whole word is replaced by the lemma.
 *
 * The shared substring is searched once, on the word side, and then
 * located in the lemma. Searching both sides independently could select
 * two different substrings when several share the maximal length, which
 * yields a transformation that does not reproduce the lemma.
 *
 * @param word
 *            the surface form
 * @param lemma
 *            the lemma of the surface form
 * @param tag
 *            the tag identifier stored in the result
 * @return the transformation describing the word-to-lemma rewrite
 */
@Override
protected Transformation decode(String word, String lemma, Integer tag) {
    Pair<Integer, Integer> commonInWord = longestSubstring(word, lemma);
    boolean lowered = lowerTransformed(word, lemma);
    int wordStart = commonInWord.getLeft();
    int commonLength = commonInWord.getRight();
    if (commonLength < 2) {
        return new Transformation(0, word.length(), "", lemma, tag, lowered);
    }
    // longestSubstring compares lower-cased strings, so the shared text is
    // taken from, and searched in, the lower-cased forms as well
    String shared = word.toLowerCase().substring(wordStart,
            wordStart + commonLength);
    int lemmaStart = lemma.toLowerCase().indexOf(shared);
    int removeStart = wordStart;
    int removeEnd = word.length() - (wordStart + commonLength);
    String addStart = lemma.substring(0, lemmaStart);
    String addEnd = lemma.substring(lemmaStart + commonLength);
    return new Transformation(removeStart, removeEnd, addStart, addEnd,
            tag, lowered);
}
/**
 * Applies a transformation to a word and returns the resulting lemma
 * together with the transformation's tag.
 *
 * The end of the word is cut and {@code addEnd} attached first; then up to
 * {@code removeStart} characters are dropped from the front of that
 * intermediate string and {@code addStart} is prepended. The result is
 * lower-cased, and its first character is upper-cased again when
 * {@code Util.isUpper(word)} holds and the transformation is not marked
 * {@code toLower}.
 *
 * @param word
 *            the surface form to rewrite
 * @param representation
 *            the transformation to apply
 * @return pair of (lemma, tag)
 */
@Override
protected Pair<String, Integer> encode(String word,
        Transformation representation) {
    final boolean upperWord = Util.isUpper(word);
    // never keep a negative number of characters for short words
    final int keptLength = Math.max(0, word.length()
            - representation.removeEnd);
    final String suffixed = word.substring(0, keptLength)
            + representation.addEnd;
    final int frontCut = Math.min(representation.removeStart,
            suffixed.length());
    String result = (representation.addStart + suffixed.substring(frontCut))
            .toLowerCase();
    if (upperWord && !representation.toLower && result.length() > 0) {
        final String head = result.substring(0, 1).toUpperCase();
        result = head + result.substring(1);
    }
    return Pair.of(result, representation.tag);
}
}