package hu.u_szeged.nlp.pos;
import hu.u_szeged.nlp.pos.KRUtils.KRPOS;
import java.util.Collection;
import java.util.LinkedHashSet;
/**
 * Analysis of compound words.
 *
 * <p>A word is cut into parts for which the morphological analyser obtained
 * from {@code MagyarlancResourceHolder.getRFSA()} returns at least one
 * analysis. The analyses of the parts are then combined, and part-of-speech
 * combinations rejected by {@link #isCompatibleAnalyises(String, String)} are
 * dropped.
 *
 * @author zsjanos
 */
public class CompoundWord {

  /** Minimum number of characters on each side of a split position. */
  private static final int MIN_PART_LENGTH = 2;

  /**
   * Decides whether two analyses may be joined into a compound word, based on
   * the part-of-speech tag of each part.
   *
   * @param firstPartKR analysis of the first part of the compound
   * @param secondPartKR analysis of the second part of the compound
   * @return {@code false} if the pair matches one of the forbidden
   *         combinations listed below, {@code true} otherwise
   */
  public static boolean isCompatibleAnalyises(String firstPartKR, String secondPartKR) {
    // Each part-of-speech tag is derived from its own analysis. The first one
    // used to be (incorrectly) derived from secondPartKR, which made both tags
    // always equal and disabled every rule that compares two different tags.
    KRPOS firstPartPOS = KRUtils.getPOS(firstPartKR);
    KRPOS secondPartPOS = KRUtils.getPOS(secondPartKR);

    // UTT-INT cannot be the second part
    if (secondPartPOS.equals(KRPOS.UTT_INT)) {
      return false;
    }
    // ART cannot be the second part
    if (secondPartPOS.equals(KRPOS.ART)) {
      return false;
    }
    // only NUM may stand before NUM
    if (secondPartPOS.equals(KRPOS.NUM) && !firstPartPOS.equals(KRPOS.NUM)) {
      return false;
    }
    // PREV cannot be the second part
    if (secondPartPOS.equals(KRPOS.PREV)) {
      return false;
    }
    // NOUN + ADV is disallowed
    if (firstPartPOS.equals(KRPOS.NOUN) && secondPartPOS.equals(KRPOS.ADV)) {
      return false;
    }
    // VERB + ADV is disallowed
    if (firstPartPOS.equals(KRPOS.VERB) && secondPartPOS.equals(KRPOS.ADV)) {
      return false;
    }
    // PREV + NOUN is disallowed
    if (firstPartPOS.equals(KRPOS.PREV) && secondPartPOS.equals(KRPOS.NOUN)) {
      return false;
    }
    // ADJ + VERB is disallowed
    if (firstPartPOS.equals(KRPOS.ADJ) && secondPartPOS.equals(KRPOS.VERB)) {
      return false;
    }
    // VERB + NOUN is disallowed
    if (firstPartPOS.equals(KRPOS.VERB) && secondPartPOS.equals(KRPOS.NOUN)) {
      return false;
    }
    // NOUN + VERB is allowed only if the NOUN analysis contains "CAS".
    // The former extra rule "NOUN without CAS + VERB<PAST><DEF>" is a strict
    // special case of this one, so it needs no check of its own.
    if (firstPartPOS.equals(KRPOS.NOUN) && secondPartPOS.equals(KRPOS.VERB)
        && !firstPartKR.contains("CAS")) {
      return false;
    }
    return true;
  }

  /**
   * Tells whether the word can be cut into two parts that both have at least
   * one analysis.
   *
   * @param compoundWord the word to examine; must not be {@code null}
   * @return {@code true} if {@link #bisectIndex(String)} finds a split position
   */
  public static boolean isBisectable(String compoundWord) {
    // bisectIndex never returns a valid position below MIN_PART_LENGTH,
    // so 0 unambiguously means "no split found".
    return bisectIndex(compoundWord) > 0;
  }

  /**
   * Finds the first position at which the word can be cut into two parts that
   * both have at least one analysis. Each part is at least two characters long.
   *
   * @param compoundWord the word to examine; must not be {@code null}
   * @return the index of the first character of the second part, or {@code 0}
   *         if no such position exists
   */
  public static int bisectIndex(String compoundWord) {
    for (int i = MIN_PART_LENGTH; i <= compoundWord.length() - MIN_PART_LENGTH; ++i) {
      if (hasAnalysis(compoundWord.substring(0, i)) && hasAnalysis(compoundWord.substring(i))) {
        return i;
      }
    }
    return 0;
  }

  /**
   * Same as {@link #getCompatibleAnalises(String, String, boolean)} with
   * {@code hyphenic} set to {@code false}.
   *
   * @param firstPart surface form of the first part
   * @param secondPart surface form of the second part
   * @return the combined analyses, in insertion order; never {@code null}
   */
  public static LinkedHashSet<String> getCompatibleAnalises(String firstPart, String secondPart) {
    return getCompatibleAnalises(firstPart, secondPart, false);
  }

  /**
   * Analyses both parts and combines every compatible pair of analyses.
   *
   * <p>For each compatible pair, the result is the root form of the second
   * part's analysis in which every {@code "$"} is replaced by {@code "$"}
   * followed by {@code firstPart} (and a {@code "-"} when {@code hyphenic}).
   *
   * @param firstPart surface form of the first part
   * @param secondPart surface form of the second part
   * @param hyphenic whether a hyphen is inserted after {@code firstPart}
   * @return the combined analyses, in insertion order; empty if either part has
   *         no analysis or no pair is compatible; never {@code null}
   */
  public static LinkedHashSet<String> getCompatibleAnalises(String firstPart, String secondPart, boolean hyphenic) {
    LinkedHashSet<String> analises = new LinkedHashSet<String>();

    Collection<String> firstAnalises = MagyarlancResourceHolder.getRFSA().analyse(firstPart);
    Collection<String> secondAnalises = MagyarlancResourceHolder.getRFSA().analyse(secondPart);
    if (firstAnalises.isEmpty() || secondAnalises.isEmpty()) {
      return analises;
    }

    String replacement = hyphenic ? "$" + firstPart + "-" : "$" + firstPart;
    for (String f : firstAnalises) {
      // The root of the first analysis does not depend on the inner loop.
      String firstPartKR = KRUtils.getRoot(f);
      for (String s : secondAnalises) {
        String secondPartKR = KRUtils.getRoot(s);
        if (isCompatibleAnalyises(firstPartKR, secondPartKR)) {
          analises.add(secondPartKR.replace("$", replacement));
        }
      }
    }
    return analises;
  }

  /**
   * Analyses a compound word made of two or three parts.
   *
   * <p>If the word can be cut into two analysable parts, the compatible
   * analyses of that split are returned. Otherwise every prefix that has an
   * analysis is tried as the first part, and the remainder is analysed as a
   * two-part compound itself.
   *
   * @param compoundWord the word to analyse; must not be {@code null}
   * @return the analyses found, in insertion order; possibly empty, never
   *         {@code null}
   */
  public static LinkedHashSet<String> analyseCompoundWord(String compoundWord) {
    // The word can be cut into two parts that both have an analysis.
    int bisectIndex = bisectIndex(compoundWord);
    if (bisectIndex > 0) {
      return getCompatibleAnalises(compoundWord.substring(0, bisectIndex),
          compoundWord.substring(bisectIndex));
    }

    // The word cannot be cut into two parts: try "prefix + two-part compound".
    LinkedHashSet<String> analises = new LinkedHashSet<String>();
    for (int i = MIN_PART_LENGTH; i <= compoundWord.length() - MIN_PART_LENGTH; ++i) {
      String firstPart = compoundWord.substring(0, i);
      Collection<String> firstPartAnalises = MagyarlancResourceHolder.getRFSA().analyse(firstPart);
      if (firstPartAnalises.isEmpty()) {
        continue;
      }

      // The second section must itself be cuttable into two parts.
      String secondPart = compoundWord.substring(i);
      int secondBisectIndex = bisectIndex(secondPart);
      if (secondBisectIndex == 0) {
        continue;
      }

      LinkedHashSet<String> secondPartAnalises = getCompatibleAnalises(
          secondPart.substring(0, secondBisectIndex), secondPart.substring(secondBisectIndex));
      if (secondPartAnalises.isEmpty()) {
        continue;
      }

      for (String firstAnalyse : firstPartAnalises) {
        String firstRoot = KRUtils.getRoot(firstAnalyse);
        for (String secondAnalyse : secondPartAnalises) {
          String secondRoot = KRUtils.getRoot(secondAnalyse);
          if (isCompatibleAnalyises(firstRoot, secondRoot)) {
            analises.add(secondRoot.replace("$", "$" + firstPart));
          }
        }
      }
    }
    return analises;
  }

  /** Returns whether the morphological analyser yields any analysis for {@code part}. */
  private static boolean hasAnalysis(String part) {
    return !MagyarlancResourceHolder.getRFSA().analyse(part).isEmpty();
  }
}