package edu.uncc.cs.watsonsim.nlp;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import org.apache.lucene.search.ScoreDoc;
import edu.uncc.cs.watsonsim.Database;
import edu.uncc.cs.watsonsim.Environment;
import edu.uncc.cs.watsonsim.KV;
import edu.uncc.cs.watsonsim.Phrase;
import edu.uncc.cs.watsonsim.StringUtils;
/**
 * Heuristics for deciding whether two phrases are related: wiki-link
 * co-occurrence counts, search-hit agreement, edit distance, redirects and
 * lemma/substring containment.
 *
 * NOTE(review): a single PreparedStatement is reused by every call to
 * {@link #viaWikiLinks(String[])}; concurrent use from several threads
 * presumably needs external synchronization -- confirm against callers.
 */
public class Relatedness {
    private final Database db;
    private final PreparedStatement link_statement;
    private final Environment env;
    public final Redirects redirects;

    /**
     * Create a Relatedness module using shared resources.
     *
     * @param env the shared environment supplying the database and the
     *            search index used by the matching methods
     */
    public Relatedness(Environment env) {
        this.db = env.db;
        this.env = env;
        /*
         * It's possible to send arrays of keys instead, but the syntax is not
         * consistent across PSQL and SQLite, so many small queries are issued.
         *   Postgres -> "WHERE link = ANY (?)"
         *   SQLite   -> "WHERE link IN (?)"
         * So there may be real overhead.
         */
        link_statement = db.prep(
                "SELECT count(*), trim_target"
                + " FROM wiki_links"
                + " WHERE link = ?"
                + " GROUP BY trim_target HAVING count(*) > 1"
                + " ORDER BY count(*) DESC;");
        redirects = new Redirects(env);
    }

    /**
     * Find paraphrases and synonyms of a set of phrases.
     * You can enter multiple sources, which are an array for syntactic
     * convenience. The results for all sources are concatenated in source
     * order; each entry is weighted by the number of matching wiki links.
     * The exact scoring method may change over time.
     *
     * @param sources an array of words for which you want synonyms
     * @return the link targets found for every source, each weighted by its
     *         row count (only targets seen more than once are returned)
     * @throws RuntimeException if the underlying query fails; the original
     *         SQLException is attached as the cause
     */
    public List<Weighted<String>> viaWikiLinks(String[] sources) {
        try {
            List<Weighted<String>> synonyms = new ArrayList<>();
            for (String source : sources) {
                link_statement.setString(1, source);
                // Close each result set promptly so cursors do not pile up
                // on the shared statement.
                try (ResultSet rows = link_statement.executeQuery()) {
                    while (rows.next()) {
                        // Column 1 is count(*), column 2 is trim_target.
                        synonyms.add(new Weighted<>(rows.getString(2), rows.getDouble(1)));
                    }
                }
            }
            return synonyms;
        } catch (SQLException e) {
            // Arrays.toString prints the words themselves rather than the
            // array's identity hash; the cause is chained for diagnosis.
            throw new RuntimeException(
                    "Failed to create query for wiki link synonyms of \""
                    + Arrays.toString(sources) + "\"", e);
        }
    }

    /**
     * Decide whether two phrases match by comparing their search results.
     * Both phrases are sent to the environment's simple Lucene query and
     * they match when each returns at least one hit and the top hits refer
     * to the same document. (An earlier criterion based on overlap among
     * the top hits is no longer used.)
     *
     * @param left  the first query text
     * @param right the second query text
     * @return whether both queries return the same top document
     */
    public boolean matchViaSearch(String left, String right) {
        // Number of hits requested from each query.
        final int K = 3;
        ScoreDoc[] left_hits = env.simpleLuceneQuery(left, K);
        if (left_hits.length == 0) {
            // Without a left hit there can be no match; skip the second query.
            return false;
        }
        ScoreDoc[] right_hits = env.simpleLuceneQuery(right, K);
        return right_hits.length > 0 && left_hits[0].doc == right_hits[0].doc;
    }

    /**
     * This is a very strict way of measuring synonymy, simply by the edit
     * distance. It does handle a few rudimentary similarities, however.
     *
     * First it canonicalizes the inputs (see StringUtils.canonicalize),
     * then it will ignore up to one letter of edit distance.
     * This helps for situations like Advertize = Advertise.
     *
     * This is used by the grading scorer (CORRECT) so keep that in mind.
     *
     * @param left  the first string
     * @param right the second string
     * @return Whether the two strings are synonymous.
     */
    public boolean matchViaLevenshtein(String left, String right) {
        int dist = StringUtils.getLevenshteinDistance(
                StringUtils.canonicalize(left),
                StringUtils.canonicalize(right),
                2);
        // -1 means "uncertain, but greater than the threshold"
        return (0 <= dist && dist < 2);
    }

    /**
     * Simple hard-coded heuristics for whether the left phrase implies the
     * right. It uses Levenshtein, search, redirects, and token set ops.
     * The checks are tried in that order and evaluation stops at the first
     * one that succeeds.
     *
     * @param left The antecedent
     * @param right The consequent
     * @return Whether left implies right
     */
    public boolean implies(Phrase left, Phrase right) {
        return matchViaLevenshtein(left.text, right.text)
                || matchViaSearch(left.text, right.text)
                || redirects.matches(left.text, right.text)
                // Every lemma of the left phrase also appears in the right.
                || right.memo(Phrase.lemmas).containsAll(left.memo(Phrase.lemmas))
                // The left text occurs inside the right text, ignoring case.
                || StringUtils.containsIgnoreCase(right.text, left.text);
    }
}