package org.molgenis.data.mapper.algorithmgenerator.categorymapper; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.jscience.physics.amount.Amount; import org.molgenis.data.mapper.algorithmgenerator.bean.AmountWrapper; import org.molgenis.data.mapper.algorithmgenerator.categorymapper.convertor.AmountConvertor; import org.molgenis.data.mapper.algorithmgenerator.categorymapper.convertor.DailyAmountConvertor; import org.molgenis.data.mapper.algorithmgenerator.categorymapper.convertor.NumberAmountConvertor; import org.molgenis.data.mapper.algorithmgenerator.categorymapper.convertor.SeveralTimesConvertor; import javax.measure.converter.UnitConverter; import javax.measure.unit.NonSI; import javax.measure.unit.SI; import javax.measure.unit.Unit; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public class CategoryMapperUtil { private static final List<AmountConvertor> CONVERTORS = Lists .newArrayList(new DailyAmountConvertor(), new SeveralTimesConvertor(), new NumberAmountConvertor()); private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d+\\.?\\d*"); private static final String NON_LETTER_REGEX = "[^a-zA-Z0-9]"; private static final List<Unit<?>> DURATION_UNITS; static { DURATION_UNITS = Arrays .asList(SI.SECOND.inverse(), NonSI.MINUTE.inverse(), NonSI.HOUR.inverse(), NonSI.DAY.inverse(), NonSI.WEEK.inverse(), NonSI.MONTH.inverse(), NonSI.YEAR.inverse()); } private static final Set<String> POSITIVE_ADJECTIVES; static { POSITIVE_ADJECTIVES = new HashSet<String>(); POSITIVE_ADJECTIVES.add("almost"); } private static final Set<String> NEGATIVE_ADJECTIVES; static { NEGATIVE_ADJECTIVES = new HashSet<String>(); NEGATIVE_ADJECTIVES.add("never"); NEGATIVE_ADJECTIVES.add("less"); NEGATIVE_ADJECTIVES.add("fewer"); } private static final Map<String, Integer> WORD_TO_NUMBER_MAP; private static final double STANDARD_ERROR = 0.0000000001; static { WORD_TO_NUMBER_MAP = new HashMap<String, Integer>(); WORD_TO_NUMBER_MAP.put("one", 1); WORD_TO_NUMBER_MAP.put("two", 2); WORD_TO_NUMBER_MAP.put("three", 3); WORD_TO_NUMBER_MAP.put("four", 4); WORD_TO_NUMBER_MAP.put("five", 5); WORD_TO_NUMBER_MAP.put("six", 6); WORD_TO_NUMBER_MAP.put("seven", 7); WORD_TO_NUMBER_MAP.put("eight", 8); WORD_TO_NUMBER_MAP.put("nine", 9); WORD_TO_NUMBER_MAP.put("ten", 10); WORD_TO_NUMBER_MAP.put("once", 1); WORD_TO_NUMBER_MAP.put("twice", 2); } public static boolean containNegativeAdjectives(String description) { String lowerCase = description.toLowerCase(); return NEGATIVE_ADJECTIVES.stream().anyMatch(adj -> lowerCase.contains(adj)); } public static Unit<?> getMoreSpecificUnit(Unit<?> unit) { int indexOf = DURATION_UNITS.indexOf(unit); if (indexOf <= 0) { return unit; } else { return DURATION_UNITS.get(--indexOf); } } public static AmountWrapper convertDescriptionToAmount(String description) { String cleanedDescription = convertWordToNumber(description); for (AmountConvertor convertor : CONVERTORS) { if (convertor.matchCriteria(cleanedDescription)) { return convertor.getAmount(cleanedDescription); } } return null; } public static String convertWordToNumber(String description) { StringBuilder stringBuilder = new StringBuilder(); for (String token : description.toLowerCase().split(NON_LETTER_REGEX)) { if (stringBuilder.length() > 0) stringBuilder.append(' '); if (WORD_TO_NUMBER_MAP.containsKey(token)) { stringBuilder.append(WORD_TO_NUMBER_MAP.get(token)); } else { stringBuilder.append(token); } } return stringBuilder.toString(); } public static Unit<?> findDurationUnit(String description) { Set<String> tokens = Sets.newHashSet(description.toLowerCase().split(NON_LETTER_REGEX)); List<Unit<?>> candidateUnits = new ArrayList<Unit<?>>(); for (Unit<?> unit : DURATION_UNITS) { if (tokens.contains(unit.inverse().toString().toLowerCase())) { candidateUnits.add(unit); } } return getMostGeneralUnit(candidateUnits); } public static Unit<?> getMostGeneralUnit(List<Unit<?>> candidateUnits) { Collections.sort(candidateUnits, new Comparator<Unit<?>>() { public int compare(Unit<?> o1, Unit<?> o2) { UnitConverter converterTo = o1.inverse().getConverterTo(o2.inverse()); if (converterTo.convert(1) > 1) { return -1; } else { return 1; } } }); return candidateUnits.size() > 0 ? candidateUnits.get(0) : null; } public static List<Double> extractNumbers(String description) { List<Double> extractedNumbers = new ArrayList<Double>(); String lowerCasedDesc = description.toLowerCase(); Matcher mather = NUMBER_PATTERN.matcher(lowerCasedDesc); while (mather.find()) { extractedNumbers.add(Double.parseDouble(mather.group())); } return extractedNumbers; } public static boolean containsNegativeAdjectives(String description) { String lowerCasedDesc = description.toLowerCase(); return NEGATIVE_ADJECTIVES.stream().anyMatch(adj -> lowerCasedDesc.contains(adj)); } public static boolean isAmountRanged(Amount<?> amount) { return amount.getMaximumValue() - amount.getMinimumValue() > STANDARD_ERROR; } }