//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.jobs.interactions.data;

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Holds information relating to a word pattern (a set of words).
 *
 * <p>Can be used to hold intermediate cluster calculations: after
 * {@link #calculateTermFrequency(Set)} has been called the instance carries a term frequency
 * vector and its magnitude, which {@link #calculateSimilarity(PatternReference)} then uses.
 *
 * <p>Instances are mutable (source type, target type and the term frequency data can change) and
 * no synchronisation is performed by this class.
 */
public final class PatternReference {

  private final String id;

  private String sourceType;

  private String targetType;

  /** The tokens which form the pattern. */
  private final List<Word> tokens;

  /** The term frequency against the global vector; {@code null} until calculated. */
  private int[] termFrequency;

  /** The term magnitude - sum of the termFrequencies. */
  private int termMagnitude;

  /**
   * Instantiates a new pattern reference.
   *
   * <p>The supplied list is stored as-is (no copy is taken), so later changes to it are visible
   * through this instance.
   *
   * @param id
   *          the id
   * @param tokens
   *          the tokens
   */
  public PatternReference(String id, List<Word> tokens) {
    this.id = id;
    this.tokens = tokens;
  }

  /**
   * Instantiates a new pattern reference.
   *
   * <p>The tokens are wrapped with {@link Arrays#asList(Object...)}, so the resulting list is a
   * fixed-size view backed by the supplied array.
   *
   * @param id
   *          the id
   * @param tokens
   *          the tokens
   */
  public PatternReference(String id, Word... tokens) {
    this.id = id;
    this.tokens = Arrays.asList(tokens);
  }

  /**
   * Gets the source type.
   *
   * @return the source type, or {@code null} if it has not been set
   */
  public String getSourceType() {
    return sourceType;
  }

  /**
   * Sets the source type.
   *
   * @param sourceType
   *          the new source type
   */
  public void setSourceType(String sourceType) {
    this.sourceType = sourceType;
  }

  /**
   * Gets the target type.
   *
   * @return the target type, or {@code null} if it has not been set
   */
  public String getTargetType() {
    return targetType;
  }

  /**
   * Sets the target type.
   *
   * @param targetType
   *          the new target type
   */
  public void setTargetType(String targetType) {
    this.targetType = targetType;
  }

  /**
   * Gets the id.
   *
   * @return the id
   */
  public String getId() {
    return id;
  }

  /**
   * Gets the tokens.
   *
   * @return the list of tokens held by this pattern (the internal list, not a copy)
   */
  public List<Word> getTokens() {
    return tokens;
  }

  /**
   * Gets the TF magnitude, i.e. the sum of all entries of the term frequency vector.
   *
   * @return the TF magnitude; 0 until {@link #calculateTermFrequency(Set)} has been called
   */
  public int getTFMagnitude() {
    return termMagnitude;
  }

  /**
   * Gets the term frequency.
   *
   * @return the term frequency vector (the internal array, not a copy), or {@code null} if
   *         {@link #calculateTermFrequency(Set)} has not been called yet
   */
  public int[] getTermFrequency() {
    return termFrequency;
  }

  /**
   * Calculate term frequency given a set of words.
   *
   * <p>Entry {@code i} of the resulting vector is the number of tokens in this pattern whose
   * lemma equals the lemma of the {@code i}-th term in the iteration order of {@code terms}. Any
   * previously calculated vector and magnitude are replaced.
   *
   * <p>Because positions are assigned by iteration order, two patterns are only comparable via
   * {@link #calculateSimilarity(PatternReference)} if their vectors were calculated from term
   * sets that iterate in the same order.
   *
   * @param terms
   *          the terms
   */
  public void calculateTermFrequency(Set<Word> terms) {
    termFrequency = new int[terms.size()];
    termMagnitude = 0;

    // Naive implementation, but perhaps correct way given that the tokens should be very small
    // in general
    int i = 0;
    for (final Word term : terms) {
      for (final Word token : tokens) {
        // Note we ignore the POS here
        if (term.getLemma().equals(token.getLemma())) {
          termFrequency[i]++;
          termMagnitude++;
        }
      }
      i++;
    }
  }

  /**
   * Calculate similarity between this and another pattern.
   *
   * <p>The score is the dot product of the two term frequency vectors divided by the product of
   * the two TF magnitudes (the sums of the frequencies). Note that this is not a true cosine
   * similarity, which would normalise by the Euclidean norms instead.
   *
   * <p>{@link #calculateTermFrequency(Set)} must have been called on both patterns, with term
   * sets of the same size and iteration order, before calling this method; the vectors are
   * otherwise {@code null} or misaligned.
   *
   * @param pattern
   *          the pattern
   * @return the normalised score, or 0 if either pattern has a TF magnitude of 0 (in which case
   *         there is nothing in common and the normalisation would otherwise yield NaN)
   */
  public double calculateSimilarity(PatternReference pattern) {
    final int[] otherTF = pattern.getTermFrequency();

    double score = 0;
    for (int i = 0; i < termFrequency.length; i++) {
      // Widen before multiplying so the product cannot overflow int
      score += (double) termFrequency[i] * otherTF[i];
    }

    // NOTE: Departure from the paper (they don't do the division to normalize the result)
    // TODO: Should this have the c + d in it (ie be (k(p1,p2) not the dot product)
    final double normaliser = (double) pattern.getTFMagnitude() * getTFMagnitude();
    if (normaliser == 0) {
      // A pattern with no matching terms would give 0.0 / 0 = NaN; treat it as no similarity
      return 0;
    }

    return score / normaliser;
  }

  /**
   * Renders the pattern as its id, a colon, and the token lemmas joined with semicolons.
   *
   * @return the string form of this pattern
   */
  @Override
  public String toString() {
    return id + ":" + tokens.stream().map(Word::getLemma).collect(Collectors.joining(";"));
  }
}