package org.molgenis.data.mapper.algorithmgenerator.rules.impl;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
import org.molgenis.data.mapper.algorithmgenerator.bean.Category;
import org.molgenis.data.mapper.algorithmgenerator.rules.CategoryMatchQuality;
import org.molgenis.data.mapper.algorithmgenerator.rules.CategoryRule;
import org.molgenis.data.mapper.algorithmgenerator.rules.quality.Quality;
import org.molgenis.data.mapper.algorithmgenerator.rules.quality.impl.NumericQuality;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static java.util.Objects.requireNonNull;
/**
 * Base class for category rules that recognise a fixed vocabulary of words or phrases inside category
 * labels.
 * <p>
 * A vocabulary entry matches a label when every token of the entry also occurs as a token of the label
 * (see {@link #split(String)} for how tokens are produced). The rule applies to a pair of categories
 * only when both labels match some entry of the vocabulary.
 */
public abstract class InternalAbstractCategoryRule implements CategoryRule
{
	private static final Splitter TERM_SPLITTER = Splitter.onPattern("\\s+");
	private static final String ILLEGAL_CHARS_REGEX = "[^a-zA-Z0-9]";
	// Compiled once: removeIllegalChars runs for every token of every vocabulary entry on each lookup.
	private static final Pattern ILLEGAL_CHARS_PATTERN = Pattern.compile(ILLEGAL_CHARS_REGEX);

	/** Vocabulary of this rule, ordered from the longest entry to the shortest. */
	private final List<String> words;

	/**
	 * Creates a rule for the given vocabulary.
	 *
	 * @param words the words or phrases this rule recognises; copied, so later changes to the set are
	 *              not seen by this rule
	 * @throws NullPointerException if {@code words} is {@code null}
	 */
	public InternalAbstractCategoryRule(Set<String> words)
	{
		this.words = Lists.newArrayList(requireNonNull(words));
		sortBasedOnLength(this.words);
	}

	/**
	 * Matches the labels of both categories against the vocabulary and reports the outcome.
	 *
	 * @param targetCategory the target category whose label is inspected
	 * @param sourceCategory the source category whose label is inspected
	 * @return a match quality whose "rule applied" flag is {@code true} only when both labels matched an
	 * entry, and whose numeric quality is the product of the lengths of the two matched entries (0 when
	 * either label did not match)
	 */
	@Override
	public CategoryMatchQuality<Double> createCategoryMatchQuality(Category targetCategory, Category sourceCategory)
	{
		String matchedTermForTargetLabel = getMatchedTermFromTheRulelabelContainsWords(targetCategory.getLabel());
		String matchedTermForSourceLabel = getMatchedTermFromTheRulelabelContainsWords(sourceCategory.getLabel());

		boolean ruleApplied =
				StringUtils.isNotBlank(matchedTermForTargetLabel) && StringUtils.isNotBlank(matchedTermForSourceLabel);

		Quality<Double> quality = NumericQuality
				.create(createNumericQualityIndicator(matchedTermForTargetLabel, matchedTermForSourceLabel));

		return CategoryMatchQuality.create(ruleApplied, quality, targetCategory, sourceCategory);
	}

	/**
	 * Scores a match as the product of the lengths of the two matched entries, so longer matched
	 * entries yield a higher score and an unmatched side (empty string) yields 0.
	 */
	private double createNumericQualityIndicator(String matchedTermForTargetLabel, String matchedTermForSourceLabel)
	{
		return (double) matchedTermForTargetLabel.length() * matchedTermForSourceLabel.length();
	}

	/**
	 * Finds the vocabulary entry contained in the given label.
	 * <p>
	 * Entries are tried from longest to shortest, so when several entries match (for example both
	 * "not" and "not known") the longest one is returned.
	 *
	 * @param label the label to inspect; may be {@code null} or blank
	 * @return the first matching entry, or an empty string when the label is blank or nothing matches
	 */
	protected String getMatchedTermFromTheRulelabelContainsWords(String label)
	{
		if (StringUtils.isBlank(label))
		{
			return StringUtils.EMPTY;
		}

		Set<String> tokens = split(label);
		return words.stream().filter(word -> tokens.containsAll(split(word))).findFirst().orElse(StringUtils.EMPTY);
	}

	/**
	 * Breaks a label into its set of normalised tokens: the label is lower-cased, split on runs of
	 * whitespace, and every token is stripped of characters other than ASCII letters and digits.
	 * <p>
	 * A token that consists solely of stripped characters becomes the empty string and is kept in the
	 * result.
	 *
	 * @param label the text to tokenise; must not be {@code null}
	 * @return the distinct normalised tokens of the label
	 */
	protected Set<String> split(String label)
	{
		// Locale.ROOT keeps the lower-casing independent of the JVM default locale; e.g. under a
		// Turkish locale "I" would become a dotless i, which removeIllegalChars would then drop.
		String lowerCased = label.toLowerCase(Locale.ROOT);
		return Sets.newHashSet(TERM_SPLITTER.split(lowerCased)).stream().map(this::removeIllegalChars)
				.collect(Collectors.toSet());
	}

	/**
	 * Removes every character that is not an ASCII letter or digit.
	 *
	 * @param string the text to clean; must not be {@code null}
	 * @return the text without the removed characters
	 */
	protected String removeIllegalChars(String string)
	{
		return ILLEGAL_CHARS_PATTERN.matcher(string).replaceAll(StringUtils.EMPTY);
	}

	/**
	 * Sorts the entries in place from longest to shortest, so that a first-match lookup prefers the
	 * most specific entry. The sort is stable: entries of equal length keep their relative order.
	 */
	private void sortBasedOnLength(List<String> words)
	{
		Collections.sort(words, (first, second) -> Integer.compare(second.length(), first.length()));
	}
}