package net.filebot.similarity;

import static java.util.Arrays.*;
import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static java.util.stream.Collectors.*;
import static net.filebot.Logging.*;
import static net.filebot.MediaTypes.*;
import static net.filebot.media.MediaDetection.*;
import static net.filebot.media.XattrMetaInfo.*;
import static net.filebot.similarity.Normalization.*;
import static net.filebot.util.FileUtilities.*;
import static net.filebot.util.StringUtilities.*;

import java.io.File;
import java.time.Instant;
import java.time.LocalDate;
import java.time.temporal.ChronoUnit;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import com.ibm.icu.text.Transliterator;

import net.filebot.format.BindingException;
import net.filebot.format.MediaBindingBean;
import net.filebot.media.SmartSeasonEpisodeMatcher;
import net.filebot.similarity.SeasonEpisodeMatcher.SxE;
import net.filebot.vfs.FileInfo;
import net.filebot.web.Episode;
import net.filebot.web.EpisodeFormat;
import net.filebot.web.Movie;
import net.filebot.web.SeriesInfo;
import net.filebot.web.SimpleDate;

/**
 * Similarity metrics used to match {@link Episode} / {@link Movie} objects against files.
 * <p>
 * Each constant wraps an inner {@link SimilarityMetric} and delegates
 * {@link #getSimilarity(Object, Object)} to it. {@link #defaultSequence(boolean)} and
 * {@link #verificationMetric()} combine the constants into ready-to-use metrics.
 */
public enum EpisodeMetrics implements SimilarityMetric {

    // Match by season / episode numbers
    SeasonEpisode(new SeasonEpisodeMetric(new SmartSeasonEpisodeMatcher(null, false)) {

        private final Map<Object, Collection<SxE>> transformCache = synchronizedMap(new HashMap<Object, Collection<SxE>>(64, 4));

        @Override
        protected Collection<SxE> parse(Object object) {
            // SxE sets for Episode objects cannot be cached because the same Episode (by ID) may have different episode numbers depending on the order (e.g. Airdate VS DVD order)
            if (object instanceof Episode) {
                Episode episode = (Episode) object;
                return parse(episode);
            }

            if (object instanceof Movie) {
                return emptySet();
            }

            return transformCache.computeIfAbsent(object, super::parse);
        }

        private Set<SxE> parse(Episode e) {
            // get SxE from episode, both SxE for season/episode numbering and SxE for absolute episode numbering
            Set<SxE> sxe = new HashSet<SxE>(2);

            // default SxE numbering
            if (e.getEpisode() != null) {
                sxe.add(new SxE(e.getSeason(), e.getEpisode()));

                // absolute numbering
                if (e.getAbsolute() != null) {
                    sxe.add(new SxE(null, e.getAbsolute()));
                }
            } else {
                // 0xSpecial numbering
                if (e.getSpecial() != null) {
                    sxe.add(new SxE(0, e.getSpecial()));
                }
            }

            return sxe;
        }
    }),

    // Match episode airdate
    AirDate(new DateMetric(getDateMatcher()) {

        private final Map<Object, SimpleDate> transformCache = synchronizedMap(new HashMap<Object, SimpleDate>(64, 4));

        @Override
        public SimpleDate parse(Object object) {
            if (object instanceof Episode) {
                Episode episode = (Episode) object;
                return episode.getAirdate();
            }

            if (object instanceof Movie) {
                return null;
            }

            return transformCache.computeIfAbsent(object, super::parse);
        }
    }),

    // Match by episode/movie title
    Title(new SubstringMetric() {

        @Override
        protected String normalize(Object object) {
            if (object instanceof Episode) {
                Episode e = (Episode) object;

                // don't use title for matching if title equals series name
                if (e.getTitle() != null) {
                    String title = normalizeObject(removeTrailingBrackets(e.getTitle()));
                    if (title.length() >= 4 && !normalizeObject(e.getSeriesName()).contains(title)) {
                        return title;
                    }
                }
            }

            if (object instanceof Movie) {
                return normalizeObject(((Movie) object).getName());
            }

            String s = normalizeObject(object);
            return s.length() >= 4 ? s : null; // only consider long enough strings to avoid false matches
        }
    }),

    // Match by SxE and airdate
    EpisodeIdentifier(new MetricCascade(SeasonEpisode, AirDate)),

    // Advanced episode <-> file matching Lv1
    EpisodeFunnel(new MetricCascade(SeasonEpisode, AirDate, Title)),

    // Advanced episode <-> file matching Lv2
    EpisodeBalancer(new SimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            float sxe = EpisodeIdentifier.getSimilarity(o1, o2);
            float title = sxe < 1 ? Title.getSimilarity(o1, o2) : 1; // if SxE matches then boost score as if it was a title match as well

            // account for misleading SxE patterns in the episode title
            if (sxe < 0 && title == 1 && EpisodeIdentifier.getSimilarity(getTitle(o1), getTitle(o2)) == 1) {
                sxe = 1;
                title = 0;
            }

            // allow title to override SxE only if series name also is a good match
            if (title == 1 && SeriesName.getSimilarity(o1, o2) < 0.5f) {
                title = 0;
            }

            // 1:SxE && Title, 2:SxE
            return (float) ((Math.max(sxe, 0) * title) + (Math.floor(sxe) / 10));
        }

        public Object getTitle(Object o) {
            if (o instanceof Episode) {
                Episode e = (Episode) o;
                return e.getSeriesName() + " " + e.getTitle();
            }
            return o;
        }
    }),

    // Match series title and episode title against folder structure and file name
    SubstringFields(new SubstringMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            String[] f1 = normalize(fields(o1));
            String[] f2 = normalize(fields(o2));

            // match all fields and average similarity
            double sum = 0;

            for (int i = 0; i < f1.length; i++) {
                for (int j = 0; j < f2.length; j++) {
                    float f = super.getSimilarity(f1[i], f2[j]);
                    if (f > 0) {
                        // 2-sqrt(x) from 0 to 1
                        double multiplier = 2 - Math.sqrt((double) (i + j) / (f1.length + f2.length));

                        // bonus points for primary matches (e.g. primary title matches filename > alias title matches folder path)
                        sum += f * multiplier;
                    }
                }
            }

            sum /= f1.length * f2.length;

            return sum >= 0.9 ? 1 : sum >= 0.1 ? 0.5f : 0;
        }

        protected String[] normalize(Object[] objects) {
            // normalize objects (and make sure to keep word boundaries)
            return stream(objects).map(EpisodeMetrics::normalizeObject).toArray(String[]::new);
        }

        protected static final int MAX_FIELDS = 5;

        protected Object[] fields(Object object) {
            if (object instanceof Episode) {
                Episode e = (Episode) object;

                Stream<String> primaryNames = Stream.of(e.getSeriesName(), e.getTitle());
                Stream<String> aliasNames = e.getSeriesInfo() == null ? Stream.empty() : e.getSeriesInfo().getAliasNames().stream().limit(MAX_FIELDS);

                Stream<String> names = Stream.concat(primaryNames, aliasNames).filter(s -> s != null && s.length() > 0).map(Normalization::removeTrailingBrackets).distinct();

                // the result is always padded to MAX_FIELDS entries (unused slots are null)
                return copyOf(names.limit(MAX_FIELDS).toArray(), MAX_FIELDS);
            }

            if (object instanceof File) {
                File f = (File) object;

                // File.getParentFile() returns null if the path does not name a parent directory (e.g. a bare relative file name)
                File parent = f.getParentFile();
                if (parent == null) {
                    return new Object[] { f };
                }

                return new Object[] { f, parent.getPath() };
            }

            if (object instanceof Movie) {
                Movie m = (Movie) object;
                return new Object[] { m.getName(), m.getYear() };
            }

            return new Object[] { object };
        }
    }),

    // Match via common word sequence in episode name and file name
    NameSubstringSequence(new SequenceMatchSimilarity() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            String[] f1 = getNormalizedEffectiveIdentifiers(o1);
            String[] f2 = getNormalizedEffectiveIdentifiers(o2);

            // match all fields and keep the best similarity
            float max = 0;
            for (String s1 : f1) {
                for (String s2 : f2) {
                    max = Math.max(super.getSimilarity(s1, s2), max);
                }
            }

            // normalize absolute similarity to similarity rank (4 ranks in total),
            // so we are less likely to fall for false positives in this pass, and move on to the next one
            return (float) (Math.floor(max * 4) / 4);
        }

        @Override
        protected String normalize(Object object) {
            // identifiers are already normalized by getNormalizedEffectiveIdentifiers
            return object.toString();
        }

        protected String[] getNormalizedEffectiveIdentifiers(Object object) {
            List<?> identifiers = getEffectiveIdentifiers(object);
            String[] names = new String[identifiers.size()];

            for (int i = 0; i < names.length; i++) {
                names[i] = normalizeObject(identifiers.get(i));
            }

            return names;
        }

        protected List<?> getEffectiveIdentifiers(Object object) {
            if (object instanceof Episode) {
                return ((Episode) object).getSeriesNames();
            } else if (object instanceof Movie) {
                return ((Movie) object).getEffectiveNames();
            } else if (object instanceof File) {
                return listPathTail((File) object, 3, true);
            }
            return singletonList(object);
        }
    }),

    // Match by generic name similarity (round rank)
    Name(new NameSimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            // normalize absolute similarity to similarity rank (4 ranks in total),
            // so we are less likely to fall for false positives in this pass, and move on to the next one
            return (float) (Math.floor(super.getSimilarity(o1, o2) * 4) / 4);
        }

        @Override
        protected String normalize(Object object) {
            // simplify file name, if possible
            return normalizeObject(object);
        }
    }),

    // Match by series name similarity (best match over all known names, rounded into 4 ranks)
    SeriesName(new NameSimilarityMetric() {

        private final SeriesNameMatcher seriesNameMatcher = getSeriesNameMatcher(false);

        @Override
        public float getSimilarity(Object o1, Object o2) {
            String[] f1 = getNormalizedEffectiveIdentifiers(o1);
            String[] f2 = getNormalizedEffectiveIdentifiers(o2);

            // match all fields and keep the best similarity
            float max = 0;
            for (String s1 : f1) {
                for (String s2 : f2) {
                    max = Math.max(super.getSimilarity(s1, s2), max);
                }
            }

            // normalize absolute similarity to similarity rank (4 ranks in total),
            // so we are less likely to fall for false positives in this pass, and move on to the next one
            return (float) (Math.floor(max * 4) / 4);
        }

        @Override
        protected String normalize(Object object) {
            // identifiers are already normalized by getNormalizedEffectiveIdentifiers
            return object.toString();
        }

        protected String[] getNormalizedEffectiveIdentifiers(Object object) {
            return getEffectiveIdentifiers(object).stream().map(EpisodeMetrics::normalizeObject).toArray(String[]::new);
        }

        protected List<?> getEffectiveIdentifiers(Object object) {
            if (object instanceof Episode) {
                Episode episode = (Episode) object;

                // strip release info from known series name to make sure it matches the stripped filename
                return stripReleaseInfo(episode.getSeriesNames(), true);
            } else if (object instanceof File) {
                File file = (File) object;

                // guess potential series names from path
                return listPathTail(file, 3, true).stream().map(f -> {
                    String fn = getName(f);
                    String sn = seriesNameMatcher.matchByEpisodeIdentifier(fn);
                    return sn != null ? sn : fn;
                }).collect(collectingAndThen(toList(), v -> stripReleaseInfo(v, true)));
            }

            return emptyList();
        }
    }),

    SeriesNameBalancer(new MetricCascade(NameSubstringSequence, Name, SeriesName)),

    // Match by generic name similarity (absolute)
    FilePath(new NameSimilarityMetric() {

        @Override
        protected String normalize(Object object) {
            if (object instanceof File) {
                object = normalizePathSeparators(getRelativePathTail((File) object, 3).getPath());
            }
            return normalizeObject(object.toString()); // simplify file name, if possible
        }
    }),

    // Match by name similarity of the common-length prefix of both names (rounded into 4 ranks)
    FilePathBalancer(new NameSimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            String s1 = normalizeObject(o1);
            String s2 = normalizeObject(o2);

            s1 = stripReleaseInfo(s1, false);
            s2 = stripReleaseInfo(s2, false);

            // compare only as many leading characters as the shorter string has
            int length = Math.min(s1.length(), s2.length());
            s1 = s1.substring(0, length);
            s2 = s2.substring(0, length);

            return (float) (Math.floor(super.getSimilarity(s1, s2) * 4) / 4);
        }

        @Override
        protected String normalize(Object object) {
            return object.toString();
        }
    }),

    // Match by sequence of numbers, with and without the name included
    NumericSequence(new SequenceMatchSimilarity() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            float lowerBound = super.getSimilarity(normalize(o1, true), normalize(o2, true));
            float upperBound = super.getSimilarity(normalize(o1, false), normalize(o2, false));

            return Math.max(lowerBound, upperBound);
        }

        @Override
        protected String normalize(Object object) {
            return object.toString();
        }

        protected String normalize(Object object, boolean numbersOnly) {
            if (object instanceof Episode) {
                Episode e = (Episode) object;
                if (numbersOnly) {
                    object = EpisodeFormat.SeasonEpisode.formatSxE(e);
                } else {
                    object = String.format("%s %s", e.getSeriesName(), EpisodeFormat.SeasonEpisode.formatSxE(e));
                }
            } else if (object instanceof Movie) {
                Movie m = (Movie) object;
                if (numbersOnly) {
                    object = m.getYear();
                } else {
                    object = String.format("%s %s", m.getName(), m.getYear());
                }
            }

            // simplify file name if possible and extract numbers
            List<Integer> numbers = matchIntegers(normalizeObject(object));
            return join(numbers, " ");
        }
    }),

    // Match by generic numeric similarity
    Numeric(new NumericSimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            String[] f1 = fields(o1);
            String[] f2 = fields(o2);

            // match all fields and keep the best similarity (stop early on a perfect match)
            float max = 0;
            for (String s1 : f1) {
                for (String s2 : f2) {
                    if (s1 != null && s2 != null) {
                        max = Math.max(super.getSimilarity(s1, s2), max);
                        if (max >= 1) {
                            return max;
                        }
                    }
                }
            }
            return max;
        }

        protected String[] fields(Object object) {
            if (object instanceof Episode) {
                Episode episode = (Episode) object;
                String[] f = new String[3];
                f[0] = episode.getSeriesName();
                f[1] = episode.getSpecial() == null ? EpisodeFormat.SeasonEpisode.formatSxE(episode) : episode.getSpecial().toString();
                f[2] = episode.getAbsolute() == null ? null : episode.getAbsolute().toString();
                return f;
            }

            if (object instanceof Movie) {
                Movie movie = (Movie) object;
                return new String[] { movie.getName(), String.valueOf(movie.getYear()) };
            }

            return new String[] { normalizeObject(object) };
        }
    }),

    // Prioritize proper episodes over specials
    SpecialNumber(new SimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            return getSpecialFactor(o1) + getSpecialFactor(o2);
        }

        public int getSpecialFactor(Object object) {
            if (object instanceof Episode) {
                Episode episode = (Episode) object;
                return episode.getSpecial() != null ? -1 : 1;
            }
            return 0;
        }
    }),

    // Match by file length (only works when matching torrents or files)
    FileSize(new FileSizeMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            // order of arguments is logically irrelevant, but we might be able to save us a call to File.length() which is quite costly
            return o1 instanceof File ? super.getSimilarity(o2, o1) : super.getSimilarity(o1, o2);
        }

        @Override
        protected long getLength(Object object) {
            if (object instanceof FileInfo) {
                return ((FileInfo) object).getLength();
            }
            return super.getLength(object);
        }
    }),

    // Match by common words at the beginning of both files
    FileName(new FileNameMetric() {

        @Override
        protected String getFileName(Object object) {
            if (object instanceof File || object instanceof FileInfo) {
                return normalizeObject(object);
            }
            return null;
        }
    }),

    // Match by file last modified and episode release dates
    TimeStamp(new TimeStampMetric(10, ChronoUnit.YEARS) {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            // adjust differentiation accuracy to about 2.5 years
            float f = super.getSimilarity(o1, o2);
            return f >= 0.75 ? 1 : f >= 0 ? 0 : -1;
        }

        private long getTimeStamp(SimpleDate date) {
            // some episodes may not have a defined airdate
            if (date != null) {
                Instant t = date.toInstant();
                if (t.isBefore(Instant.now())) {
                    return t.toEpochMilli();
                }
            }

            // big penalty for episodes not yet aired
            return -1;
        }

        private long getTimeStamp(File file) {
            if (VIDEO_FILES.accept(file) && file.length() > ONE_MEGABYTE) {
                try {
                    return new MediaBindingBean(file, file).getEncodedDate().getTimeStamp();
                } catch (BindingException e) {
                    debug.finest(e::getMessage); // Binding "General[0][Encoded_Date]": undefined => normal if Encoded_Date is undefined => ignore
                } catch (Exception e) {
                    debug.warning("Failed to read media encoding date: " + e.getMessage());
                }
            }

            return super.getTimeStamp(file); // default to file creation date
        }

        @Override
        public long getTimeStamp(Object object) {
            if (object instanceof Episode) {
                Episode e = (Episode) object;
                return getTimeStamp(e.getAirdate());
            } else if (object instanceof Movie) {
                Movie m = (Movie) object;
                return getTimeStamp(new SimpleDate(m.getYear(), 1, 1));
            } else if (object instanceof File) {
                File file = (File) object;
                return getTimeStamp(file);
            }

            return -1;
        }
    }),

    // Prefer series with many ratings and a high rating
    SeriesRating(new SimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            float r1 = getScore(o1);
            float r2 = getScore(o2);

            if (r1 < 0 || r2 < 0) {
                return -1;
            }

            return Math.max(r1, r2);
        }

        public float getScore(Object object) {
            if (object instanceof Episode) {
                SeriesInfo seriesInfo = ((Episode) object).getSeriesInfo();
                if (seriesInfo != null && seriesInfo.getRating() != null && seriesInfo.getRatingCount() != null) {
                    if (seriesInfo.getRatingCount() >= 20) {
                        return (float) Math.floor(seriesInfo.getRating() / 3); // BOOST POPULAR SHOWS and PUT INTO 3 GROUPS
                    }
                    if (seriesInfo.getRatingCount() >= 1) {
                        return 0; // PENALIZE SHOWS WITH FEW RATINGS
                    }
                    return -1; // BIG PENALTY FOR SHOWS WITH 0 RATINGS
                }
            }
            return 0;
        }
    }),

    // Prefer series with a high (rating count per day since start date) * rating
    VoteRate(new SimilarityMetric() {

        @Override
        public float getSimilarity(Object o1, Object o2) {
            float r1 = getScore(o1);
            float r2 = getScore(o2);

            return Math.max(r1, r2) >= 0.1 ? 1 : 0;
        }

        public float getScore(Object object) {
            if (object instanceof Episode) {
                SeriesInfo seriesInfo = ((Episode) object).getSeriesInfo();
                if (seriesInfo != null && seriesInfo.getRating() != null && seriesInfo.getRatingCount() != null && seriesInfo.getStartDate() != null) {
                    long days = ChronoUnit.DAYS.between(seriesInfo.getStartDate().toLocalDate(), LocalDate.now());
                    if (days > 0) {
                        return (float) ((seriesInfo.getRatingCount().doubleValue() / days) * seriesInfo.getRating());
                    }
                }
            }
            return 0;
        }
    }),

    // Match by (region) or (year) hints
    RegionHint(new SimilarityMetric() {

        private final Pattern hint = compile("[(](\\p{Alpha}+|\\p{Digit}+)[)]$");

        private final SeriesNameMatcher seriesNameMatcher = getSeriesNameMatcher(true);

        @Override
        public float getSimilarity(Object o1, Object o2) {
            Set<String> h1 = getHint(o1);
            Set<String> h2 = getHint(o2);

            return h1.isEmpty() || h2.isEmpty() ? 0 : h1.containsAll(h2) || h2.containsAll(h1) ? 1 : 0;
        }

        public Set<String> getHint(Object o) {
            if (o instanceof Episode) {
                for (String sn : ((Episode) o).getSeriesNames()) {
                    Matcher m = hint.matcher(sn);
                    if (m.find()) {
                        // lower-case with a fixed locale so that hints compare equal regardless of the default locale
                        return singleton(m.group(1).trim().toLowerCase(Locale.ROOT));
                    }
                }
            } else if (o instanceof File) {
                Set<String> h = new HashSet<String>();
                for (File f : listPathTail((File) o, 3, true)) {
                    // try to focus on series name
                    String fn = f.getName();
                    String sn = seriesNameMatcher.matchByEpisodeIdentifier(fn);

                    String[] tokens = PUNCTUATION_OR_SPACE.split(sn != null ? sn : fn);
                    for (String s : tokens) {
                        if (s.length() > 0) {
                            h.add(s.trim().toLowerCase(Locale.ROOT));
                        }
                    }
                }
                return h;
            }

            return emptySet();
        }
    }),

    // Match by stored MetaAttributes if possible
    MetaAttributes(new CrossPropertyMetric() {

        @Override
        protected Map<String, Object> getProperties(Object object) {
            // Episode / Movie objects
            if (object instanceof Episode || object instanceof Movie) {
                return super.getProperties(object);
            }

            // deserialize MetaAttributes if enabled and available
            if (object instanceof File) {
                Object metaObject = xattr.getMetaInfo((File) object);
                if (metaObject != null) {
                    return super.getProperties(metaObject);
                }
            }

            // ignore everything else
            return emptyMap();
        }
    });

    // inner metric
    private final SimilarityMetric metric;

    private EpisodeMetrics(SimilarityMetric metric) {
        this.metric = metric;
    }

    /**
     * Delegates to the inner metric of this constant.
     */
    @Override
    public float getSimilarity(Object o1, Object o2) {
        return metric.getSimilarity(o1, o2);
    }

    // cache of normalized names, shared by all metrics and keyed by the original object
    private static final Map<Object, String> transformCache = synchronizedMap(new HashMap<Object, String>(64, 4));

    private static final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");

    /**
     * Normalizes the given object into a simplified lower-case name for comparison.
     * <p>
     * The name of a {@link File} or {@link FileInfo} (or {@code toString()} of any other
     * object) is stripped of embedded checksums and format info, transliterated, and its
     * punctuation is normalized. Results are cached per object.
     *
     * @param object the object to normalize, may be {@code null}
     * @return the normalized name, or an empty string if {@code object} is {@code null}
     */
    public static String normalizeObject(Object object) {
        if (object == null) {
            return "";
        }

        return transformCache.computeIfAbsent(object, o -> {
            String name = normalizeFileName(o);

            // remove checksums, any [...] or (...)
            name = removeEmbeddedChecksum(name);

            // remove obvious release info
            name = stripFormatInfo(name);

            // the shared transliterator instance is only ever used while holding its own lock
            synchronized (transliterator) {
                name = transliterator.transform(name);
            }

            // remove or normalize special characters
            // (lower-case with a fixed locale, so that e.g. "I" never becomes a dotless "ı" under a Turkish default locale)
            return normalizePunctuation(name).toLowerCase(Locale.ROOT);
        });
    }

    private static String normalizeFileName(Object object) {
        if (object instanceof File) {
            return getName((File) object);
        } else if (object instanceof FileInfo) {
            return ((FileInfo) object).getName();
        }
        return object.toString();
    }

    /**
     * Returns the default sequence of metrics, in the order in which they are to be applied.
     *
     * @param includeFileMetrics if {@code true}, the sequence starts with {@code FileSize} and a
     *        cascade of {@code FileName} and {@code EpisodeFunnel}; otherwise it starts with
     *        {@code EpisodeFunnel} alone
     * @return a new array of metrics
     */
    public static SimilarityMetric[] defaultSequence(boolean includeFileMetrics) {
        // 1 pass: divide by file length (only works for matching torrent entries or files)
        // 2-3 pass: divide by title or season / episode numbers
        // 4 pass: divide by folder / file name and show name / episode title
        // 5 pass: divide by name (rounded into n levels)
        // 6 pass: divide by generic numeric similarity
        // 7 pass: prefer episodes that were aired closer to the last modified date of the file
        // 8 pass: resolve remaining collisions via absolute string similarity
        if (includeFileMetrics) {
            return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, AirDate, MetaAttributes, SubstringFields, SeriesNameBalancer, SeriesName, RegionHint, SpecialNumber, Numeric, NumericSequence, SeriesRating, VoteRate, TimeStamp, FilePathBalancer, FilePath };
        } else {
            return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, AirDate, MetaAttributes, SubstringFields, SeriesNameBalancer, SeriesName, RegionHint, SpecialNumber, Numeric, NumericSequence, SeriesRating, VoteRate, TimeStamp, FilePathBalancer, FilePath };
        }
    }

    /**
     * Returns a cascade of {@code FileName}, {@code SeasonEpisode}, {@code AirDate},
     * {@code Title} and {@code Name}.
     *
     * @return a new {@link MetricCascade}
     */
    public static SimilarityMetric verificationMetric() {
        return new MetricCascade(FileName, SeasonEpisode, AirDate, Title, Name);
    }

}