package org.molgenis.data.mapper.algorithmgenerator.categorymapper;

import org.apache.commons.lang3.StringUtils;
import org.molgenis.data.mapper.algorithmgenerator.bean.Category;
import org.molgenis.data.mapper.algorithmgenerator.rules.CategoryMatchQuality;
import org.molgenis.data.mapper.algorithmgenerator.rules.CategoryRule;
import org.molgenis.data.semanticsearch.string.NGramDistanceAlgorithm;

import java.util.List;
import java.util.Objects;
import java.util.Optional;

/**
 * {@link CategoryMapper} that pairs a source category with a target category by comparing their
 * labels: first by case-insensitive equality, then by n-gram similarity, and finally, when the
 * similarity is below {@link #DEFAULT_THRESHOLD}, by the configured {@link CategoryRule}s.
 */
public class LexicalCategoryMapper extends CategoryMapper {
    /** N-gram score below which the rule-based matching is consulted. */
    private static final double DEFAULT_THRESHOLD = 50.0;

    /**
     * @param rules the rules handed to the superclass and used by {@link #applyCustomRules}
     */
    public LexicalCategoryMapper(List<CategoryRule> rules) {
        super(rules);
    }

    /**
     * Picks the target category that best matches the given source category.
     *
     * <p>A target whose label equals the source label (ignoring case) is returned immediately.
     * Otherwise the target with the highest n-gram score is selected; if that score is below
     * {@link #DEFAULT_THRESHOLD}, the first rule-based match (in the natural order of
     * {@link CategoryMatchQuality}) replaces it when one exists.
     *
     * @param sourceCategory   the category to find a match for
     * @param targetCategories the candidate categories
     * @return the selected target category, or {@code null} when no candidate was selected
     */
    public Category findBestCategoryMatch(Category sourceCategory, List<Category> targetCategories) {
        String sourceLabel = sourceCategory.getLabel().toLowerCase();

        Category winner = null;
        // -1 marks "no candidate scored yet" so that the first candidate is always accepted.
        double winningScore = -1;
        for (Category candidate : targetCategories) {
            String candidateLabel = candidate.getLabel();
            if (StringUtils.equalsIgnoreCase(sourceLabel, candidateLabel)) {
                return candidate;
            }

            double score = NGramDistanceAlgorithm.stringMatching(sourceLabel, candidateLabel);
            if (winningScore == -1 || score > winningScore) {
                winningScore = score;
                winner = candidate;
            }
        }

        if (winningScore < DEFAULT_THRESHOLD) {
            return findRuleBasedMatch(sourceCategory, targetCategories, winner);
        }
        return winner;
    }

    /**
     * Applies the custom rules to every target category and returns the target category of the
     * first resulting match quality in natural order, or {@code fallback} when no rule applied
     * to any target.
     */
    private Category findRuleBasedMatch(Category sourceCategory, List<Category> targetCategories,
            Category fallback) {
        Optional<? extends CategoryMatchQuality<?>> topQuality = targetCategories.stream()
                .map(candidate -> applyCustomRules(sourceCategory, candidate))
                .filter(Objects::nonNull)
                .sorted()
                .findFirst();

        if (topQuality.isPresent()) {
            return topQuality.get().getTargetCategory();
        }
        return fallback;
    }

    /**
     * Runs every configured rule against the given pair of categories and returns the first
     * applied match quality in natural order.
     *
     * @param sourceCategory the source category
     * @param targetCategory the target category
     * @return the first match quality whose rule was applied, or {@code null} if none was applied
     */
    public CategoryMatchQuality<?> applyCustomRules(Category sourceCategory, Category targetCategory) {
        return rules.stream()
                .map(categoryRule -> categoryRule.createCategoryMatchQuality(targetCategory, sourceCategory))
                .filter(CategoryMatchQuality::isRuleApplied)
                .sorted()
                .findFirst()
                .orElse(null);
    }
}