/*******************************************************************************
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.alignment.framework.clustering;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.HashSet;

import de.tudarmstadt.ukp.alignment.framework.Global;
import de.tudarmstadt.ukp.alignment.framework.graph.OneResourceBuilder;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;

public class CreateClusteringFromAlignment
{

    public static void main(String[] args)
        throws ClassNotFoundException, SQLException, IOException
    {
        /* GLOBAL SETTINGS */
        Global.init();
        String language = ELanguageIdentifier.ENGLISH;

        /* RESOURCE 1 */
        boolean synset1 = true;
        boolean usePos1 = true;
        int prefix1 = Global.WN_Synset_prefix;
        OneResourceBuilder bg_1 = new OneResourceBuilder("uby_release_1_0", "root", "fortuna",
                prefix1, language, synset1, usePos1);
        String alignment_file = "/home/local/UKP/matuschek/ClusterEvaluationTM/WN_OW_full_joint.csv";

        splitSynsetsToSenses(bg_1, alignment_file, true);
        produceClustersFromSenseAlignment(alignment_file + "_sense");
        splitClustersByLemma(bg_1, alignment_file + "_sense_cluster", true);
        mergeClustersWithSharedSenses(alignment_file + "_sense_cluster_cleaned");
        checkPOSpurity(bg_1, alignment_file + "_sense_cluster_cleaned_merged");
        checkForDupesInSameCluster(alignment_file + "_sense_cluster_cleaned_merged_POScleaned");
        filterByLexemeList(bg_1,
                alignment_file + "_sense_cluster_cleaned_merged_POScleaned_noDupes",
                "/home/local/UKP/matuschek/ClusterEvaluationTM/lemmas.tsv", true);
        bringWNextRefClustersIntoTMFormat("/home/local/UKP/matuschek/ClusterEvaluationTM/WN_WP_full_alignment_DijkstaWSA_best_corrected_senses.txt_cluster_cleaned_merged_POScleaned_noDupes_lexemeFiltered");
    }
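
    /*
     * For orientation, a sketch of the intermediate files the pipeline above
     * produces. The suffixes are taken from the method calls; the base name
     * (abbreviated here as A) is whatever alignment file is passed in:
     *
     *   A                  -> A_sense            (splitSynsetsToSenses)
     *   A_sense            -> A_sense_cluster    (produceClustersFromSenseAlignment)
     *   A_sense_cluster    -> ..._cleaned        (splitClustersByLemma)
     *   ..._cleaned        -> ..._merged         (mergeClustersWithSharedSenses)
     *   ..._merged         -> ..._POScleaned     (checkPOSpurity)
     *   ..._POScleaned     -> ..._noDupes        (checkForDupesInSameCluster)
     *   ..._noDupes        -> ..._lexemeFiltered (filterByLexemeList)
     *   ..._lexemeFiltered -> ..._TM             (bringWNextRefClustersIntoTMFormat)
     */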

    /**
     * This method splits sense clusters which contain senses of different POS.
     * This might occur if you align against a resource which lacks POS
     * information, such as OmegaWiki.
     *
     * @param bg_1
     *            The resource
     * @param infile
     *            The sense cluster file
     */
    public static void checkPOSpurity(OneResourceBuilder bg_1, String infile)
    {
        try {
            Connection connection = bg_1.connection;
            FileReader in = new FileReader(infile);
            BufferedReader input = new BufferedReader(in);
            FileOutputStream outstream;
            PrintStream p;
            outstream = new FileOutputStream(infile + "_POScleaned");
            p = new PrintStream(outstream);
            Statement statement = connection.createStatement();
            HashMap<String, String> idPosMap = new HashMap<String, String>();
            ResultSet rs = statement.executeQuery(
                    "SELECT externalReference,partOfSpeech FROM MonolingualExternalRef join LexicalEntry join Sense where Sense.senseId like '"
                    + bg_1.prefix_string
                    + "%' and MonolingualExternalRef.senseId = Sense.senseId and Sense.lexicalEntryId = LexicalEntry.lexicalEntryId");
            while (rs.next()) {
                idPosMap.put(rs.getString(1), rs.getString(2));
            }
            String line;
            while ((line = input.readLine()) != null) {
                String[] elements = line.split("\t");
                String results = "";
                HashSet<String> pos_count = new HashSet<String>();
                HashMap<String, HashSet<String>> pos_map = new HashMap<String, HashSet<String>>();
                // Group the senses of this cluster by their POS
                for (String s : elements) {
                    String pos = idPosMap.get(s);
                    if (!pos_map.containsKey(pos)) {
                        pos_map.put(pos, new HashSet<String>());
                    }
                    pos_map.get(pos).add(s);
                    pos_count.add(idPosMap.get(s));
                }
                // Emit one cluster per POS; singleton sub-clusters are dropped
                for (String x : pos_map.keySet()) {
                    results = "";
                    HashSet<String> posCluster = pos_map.get(x);
                    if (posCluster.size() == 1) {
                        continue;
                    }
                    for (String id : posCluster) {
                        results += id + "\t";
                    }
                    p.println(results.trim());
                }
                if (pos_count.size() > 1) {
                    // For debugging purposes
                    System.out.println(line);
                }
            }
            p.close();
            input.close();
            in.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
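
    /*
     * Worked example for checkPOSpurity, with hypothetical WordNet external
     * references: given the mixed-POS input cluster
     *
     *   [POS: noun] run%1:04:00::	[POS: noun] run%1:07:00::	[POS: verb] run%2:38:00::
     *
     * the two noun senses are written out as a cluster of their own, the verb
     * sense is dropped as a singleton, and the offending input line is echoed
     * to stdout for debugging.
     */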

    /**
     * This method separates clusters with mixed lemmas.
     *
     * @param bg_1
     *            the resource
     * @param infile
     *            the (mixed lemma) sense clusters
     * @param extRef
     *            are we using external references or not?
     * @throws SQLException
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public static void splitClustersByLemma(OneResourceBuilder bg_1, String infile, boolean extRef)
        throws SQLException, IOException, ClassNotFoundException
    {
        FileOutputStream outstream;
        PrintStream p;
        outstream = new FileOutputStream(infile + "_cleaned");
        p = new PrintStream(outstream);
        FileReader in = new FileReader(infile);
        String line;
        Statement statement = bg_1.connection.createStatement();
        HashMap<String, String> extRefLemmaMap = new HashMap<String, String>();
        BufferedReader input = new BufferedReader(in);
        ResultSet rs;
        if (extRef) {
            rs = statement.executeQuery(
                    "SELECT externalReference, writtenForm FROM LexicalEntry join FormRepresentation_Lemma join Sense join MonolingualExternalRef where FormRepresentation_Lemma.lemmaId = LexicalEntry.lemmaId and Sense.lexicalEntryId = LexicalEntry.lexicalEntryId and MonolingualExternalRef.senseId = Sense.senseId and Sense.senseId like '"
                    + bg_1.prefix_string + "%'");
        }
        else {
            rs = statement.executeQuery(
                    "SELECT senseId, writtenForm FROM LexicalEntry join FormRepresentation_Lemma join Sense where FormRepresentation_Lemma.lemmaId = LexicalEntry.lemmaId and Sense.lexicalEntryId = LexicalEntry.lexicalEntryId and Sense.senseId like '"
                    + bg_1.prefix_string + "%'");
        }
        while (rs.next()) {
            extRefLemmaMap.put(rs.getString(1), rs.getString(2));
        }
        while ((line = input.readLine()) != null) {
            line = line.trim();
            String[] sensekeys = line.split("\t");
            HashMap<String, HashSet<String>> lemmaIDMap = new HashMap<String, HashSet<String>>();
            // Group the senses of this cluster by lemma
            for (String key : sensekeys) {
                String lemma = extRefLemmaMap.get(key);
                if (lemmaIDMap.get(lemma) == null) {
                    lemmaIDMap.put(lemma, new HashSet<String>());
                }
                lemmaIDMap.get(lemma).add(key);
            }
            // Emit one cluster per lemma; singletons are dropped
            for (String l : lemmaIDMap.keySet()) {
                String output = "";
                if (lemmaIDMap.get(l).size() > 1) {
                    for (String k : lemmaIDMap.get(l)) {
                        output += k + "\t";
                    }
                    p.println(output.trim());
                }
            }
        }
        input.close();
        p.close();
    }
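
    /*
     * Worked example for splitClustersByLemma, with hypothetical sense ids: if
     * a cluster line contains two senses of "bank" and two senses of "coast",
     * it is rewritten as two separate lines, one per lemma. Note that a lemma
     * contributing only a single sense to a cluster loses that sense entirely,
     * since singleton groups are not written out.
     */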

    /**
     * This method takes a given SYNSET alignment file (as produced by the
     * framework) and splits it into sense alignments.
     *
     * @param bg_1
     *            The resource to be clustered
     * @param infile
     *            The alignment file
     * @param extRef
     *            Is the alignment file given in external references or UBY ids?
     */
    public static void splitSynsetsToSenses(OneResourceBuilder bg_1, String infile, boolean extRef)
    {
        try {
            Connection connection = bg_1.connection;
            FileReader in = new FileReader(infile);
            BufferedReader input = new BufferedReader(in);
            FileOutputStream outstream;
            PrintStream p;
            outstream = new FileOutputStream(infile + "_sense");
            p = new PrintStream(outstream);
            Statement statement = connection.createStatement();
            HashMap<String, String> idMapSense = new HashMap<String, String>();
            HashMap<String, String> idMapSynset = new HashMap<String, String>();
            HashMap<String, HashSet<String>> senseSynsetMapping = new HashMap<String, HashSet<String>>();
            ResultSet rs;
            if (extRef) {
                // Map UBY sense ids to external references...
                rs = statement.executeQuery(
                        "SELECT externalReference, senseId FROM uby_release_1_0.MonolingualExternalRef where senseId like '"
                        + bg_1.prefix_string + "%'");
                while (rs.next()) {
                    idMapSense.put(rs.getString(2), rs.getString(1));
                }
                // ...and external references to UBY synset ids
                rs = statement.executeQuery(
                        "SELECT externalReference, synsetId FROM uby_release_1_0.MonolingualExternalRef where synsetId like '"
                        + bg_1.prefix_string + "%'");
                while (rs.next()) {
                    idMapSynset.put(rs.getString(1), rs.getString(2));
                }
            }
            rs = statement.executeQuery(
                    "SELECT synsetId, senseId FROM uby_release_1_0.Sense where synsetId like '"
                    + bg_1.prefix_string + "%'");
            while (rs.next()) {
                if (!senseSynsetMapping.containsKey(rs.getString(1))) {
                    senseSynsetMapping.put(rs.getString(1), new HashSet<String>());
                }
                senseSynsetMapping.get(rs.getString(1)).add(rs.getString(2));
            }
            String line;
            while ((line = input.readLine()) != null) {
                String[] ids = line.split("\t");
                String synsetId = ids[0];
                String synsetUbyId;
                if (extRef) {
                    synsetUbyId = idMapSynset.get(synsetId);
                }
                else {
                    synsetUbyId = synsetId;
                }
                HashSet<String> ss = senseSynsetMapping.get(synsetUbyId);
                if (ss == null) {
                    // Guard against synsets without senses in the database
                    continue;
                }
                for (String s : ss) {
                    if (extRef) {
                        p.println(idMapSense.get(s) + "\t" + ids[1]);
                    }
                    else {
                        p.println(s + "\t" + ids[1]);
                    }
                }
            }
            p.close();
            input.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * This method naively clusters those senses together which have the same
     * alignment target. Further downstream cleaning will be necessary.
     *
     * @param infile
     *            The sense alignment file
     */
    public static void produceClustersFromSenseAlignment(String infile)
    {
        try {
            HashMap<String, HashSet<String>> clusters = new HashMap<String, HashSet<String>>();
            FileReader in = new FileReader(infile);
            BufferedReader input = new BufferedReader(in);
            String line = "";
            while ((line = input.readLine()) != null) {
                String[] ids = line.split("\t");
                if (!clusters.containsKey(ids[1])) {
                    clusters.put(ids[1], new HashSet<String>());
                }
                clusters.get(ids[1]).add(ids[0]);
            }
            FileOutputStream outstream = new FileOutputStream(infile + "_cluster");
            PrintStream p = new PrintStream(outstream);
            for (String key : clusters.keySet()) {
                HashSet<String> sss = clusters.get(key);
                if (sss.size() > 1) {
                    for (String s : sss) {
                        p.print(s + "\t");
                    }
                    p.println();
                }
            }
            input.close();
            p.close();
            in.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
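
    /*
     * Worked example for the two steps above, with hypothetical ids: a synset
     * alignment line "WN-Synset-1	OW-4711" is expanded to one line per member
     * sense, e.g. "WN-Sense-1a	OW-4711" and "WN-Sense-1b	OW-4711". If another
     * synset aligned to the same target contributes "WN-Sense-7c	OW-4711",
     * produceClustersFromSenseAlignment then groups all three senses into the
     * single cluster line "WN-Sense-1a	WN-Sense-1b	WN-Sense-7c".
     */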

    /**
     * This simple method just removes the POS information as given in the
     * external references for WordNet senses, e.g.
     *
     * [POS: noun] acid%1:06:00:: -> acid%1:06:00::
     *
     * @param input_file
     *            The sense clustering
     */
    public static void bringWNextRefClustersIntoTMFormat(String input_file)
    {
        try {
            FileOutputStream outstream;
            PrintStream p;
            outstream = new FileOutputStream(input_file + "_TM");
            p = new PrintStream(outstream);
            FileReader in = new FileReader(input_file);
            BufferedReader input = new BufferedReader(in);
            String line = "";
            while ((line = input.readLine()) != null) {
                String out = "";
                String[] ids = line.split("\t");
                for (String id : ids) {
                    out += id.split("] ")[1] + "\t";
                }
                p.println(out.trim());
            }
            p.close();
            input.close();
            in.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * This method filters sense clusters according to a given, tab-separated
     * lemma/POS list, with one lexeme per line, e.g.
     *
     * fish	noun
     * swim	verb
     *
     * @param bg_1
     *            The resource
     * @param input_file
     *            The sense clustering
     * @param lexeme_file
     *            The tab-separated lexeme list
     * @param extRef
     *            Do we have external references or UBY ids?
     */
    public static void filterByLexemeList(OneResourceBuilder bg_1, String input_file,
            String lexeme_file, boolean extRef)
    {
        try {
            Statement statement = bg_1.connection.createStatement();
            HashMap<String, String> extRefLemmaMap = new HashMap<String, String>();
            ResultSet rs;
            if (extRef) {
                rs = statement.executeQuery(
                        "SELECT externalReference, writtenForm, partOfSpeech FROM LexicalEntry join FormRepresentation_Lemma join Sense join MonolingualExternalRef where FormRepresentation_Lemma.lemmaId = LexicalEntry.lemmaId and Sense.lexicalEntryId = LexicalEntry.lexicalEntryId and MonolingualExternalRef.senseId = Sense.senseId and Sense.senseId like '"
                        + bg_1.prefix_string + "%'");
            }
            else {
                rs = statement.executeQuery(
                        "SELECT senseId, writtenForm, partOfSpeech FROM LexicalEntry join FormRepresentation_Lemma join Sense where FormRepresentation_Lemma.lemmaId = LexicalEntry.lemmaId and Sense.lexicalEntryId = LexicalEntry.lexicalEntryId and Sense.senseId like '"
                        + bg_1.prefix_string + "%'");
            }
            while (rs.next()) {
                extRefLemmaMap.put(rs.getString(1), rs.getString(2) + "\t" + rs.getString(3));
            }
            FileReader in = new FileReader(lexeme_file);
            BufferedReader input = new BufferedReader(in);
            HashSet<String> lexemes = new HashSet<String>();
            String line = "";
            FileOutputStream outstream;
            PrintStream p;
            outstream = new FileOutputStream(input_file + "_lexemeFiltered");
            p = new PrintStream(outstream);
            while ((line = input.readLine()) != null) {
                // Assuming spaces are represented by underscores in the lexeme list
                lexemes.add(line.replace(" ", "_"));
            }
            in.close();
            in = new FileReader(input_file);
            input = new BufferedReader(in);
            while ((line = input.readLine()) != null) {
                String[] ids = line.split("\t");
                // Clusters have been split by lemma before, so the first sense
                // determines the lexeme of the whole cluster
                String lemmaPos = extRefLemmaMap.get(ids[0]);
                if (lexemes.contains(lemmaPos)) {
                    p.println(line);
                }
            }
            p.close();
            input.close();
            in.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
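
    /*
     * Worked example for filterByLexemeList, with a hypothetical lexeme list:
     * if lemmas.tsv contains the lines "fish	noun" and "swim	verb", only
     * clusters whose first sense maps to one of these lemma/POS pairs are
     * kept. A multiword entry such as "kick the bucket	verb" is normalized to
     * "kick_the_bucket	verb" when the list is read, under the assumption that
     * the resource's written forms use underscores instead of spaces.
     */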

    /**
     * This method checks for duplicate senses in the same cluster and removes
     * them. Note that the set of seen senses spans the whole file, so a sense
     * which already appeared in an earlier cluster is dropped as well.
     *
     * @param input_file
     *            the clustering
     */
    public static void checkForDupesInSameCluster(String input_file)
    {
        try {
            FileReader in = new FileReader(input_file);
            BufferedReader input = new BufferedReader(in);
            HashSet<String> alreadySeen2 = new HashSet<String>();
            String line = "";
            FileOutputStream outstream;
            PrintStream p;
            outstream = new FileOutputStream(input_file + "_noDupes");
            p = new PrintStream(outstream);
            while ((line = input.readLine()) != null) {
                String out = "";
                String[] ids = line.split("\t");
                int c = 0;
                for (String id : ids) {
                    if (alreadySeen2.contains(id)) {
                        continue;
                    }
                    alreadySeen2.add(id);
                    out += id + "\t";
                    c++;
                }
                if (c < 2) {
                    // Clusters reduced to a single sense are dropped
                    continue;
                }
                p.println(out.trim());
            }
            p.close();
            input.close();
            in.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * This method iteratively merges clusters which share senses, until the
     * number of clusters stays stable.
     *
     * @param input_file
     *            the sense cluster file
     */
    public static void mergeClustersWithSharedSenses(String input_file)
    {
        try {
            HashSet<String> input_clusters = new HashSet<String>();
            HashSet<String> output_clusters = new HashSet<String>();
            FileReader in = new FileReader(input_file);
            BufferedReader input = new BufferedReader(in);
            String l = "";
            while ((l = input.readLine()) != null) {
                input_clusters.add(l);
            }
            input.close();
            int count = 0;
            int cluster_size = input_clusters.size();
            while (output_clusters.size() < cluster_size) {
                output_clusters = new HashSet<String>();
                cluster_size = input_clusters.size();
                // candidates maps each sense to the numeric ids of the clusters
                // containing it; reverse_candidates maps each cluster id to its senses
                HashMap<String, HashSet<String>> reverse_candidates = new HashMap<String, HashSet<String>>();
                HashMap<String, HashSet<String>> candidates = new HashMap<String, HashSet<String>>();
                HashSet<String> alreadySeen2 = new HashSet<String>();
                for (String line : input_clusters) {
                    if (alreadySeen2.contains(line)) {
                        continue;
                    }
                    alreadySeen2.add(line);
                    count++;
                    String[] elements = line.split("\t");
                    for (String id : elements) {
                        if (!candidates.containsKey(id)) {
                            candidates.put(id, new HashSet<String>());
                        }
                        if (!reverse_candidates.containsKey(count + "")) {
                            reverse_candidates.put(count + "", new HashSet<String>());
                        }
                        reverse_candidates.get(count + "").add(id);
                        candidates.get(id).add(count + "");
                    }
                }
                HashSet<String> alreadySeen = new HashSet<String>();
                HashMap<String, HashSet<String>> reverse_candidates_merged = new HashMap<String, HashSet<String>>();
                for (String id_mirror : reverse_candidates.keySet()) {
                    if (alreadySeen.contains(id_mirror)) {
                        continue;
                    }
                    reverse_candidates_merged.put(id_mirror, new HashSet<String>());
                    alreadySeen.add(id_mirror);
                    HashSet<String> rev_cluster = reverse_candidates.get(id_mirror);
                    // Pull in all clusters which share a sense with this one
                    for (String sense : rev_cluster) {
                        HashSet<String> cluster = candidates.get(sense);
                        reverse_candidates_merged.get(id_mirror).add(sense);
                        cluster.remove(id_mirror);
                        for (String remaining : cluster) {
                            reverse_candidates_merged.get(id_mirror)
                                    .addAll(reverse_candidates.get(remaining));
                            alreadySeen.add(remaining);
                        }
                    }
                }
                for (String id_mirror : reverse_candidates_merged.keySet()) {
                    HashSet<String> cluster = reverse_candidates_merged.get(id_mirror);
                    if (cluster.size() == 1) {
                        continue;
                    }
                    String output_cluster = "";
                    for (String sense : cluster) {
                        output_cluster += sense + "\t";
                    }
                    output_cluster = output_cluster.trim();
                    if (alreadySeen.contains(output_cluster)) {
                        continue;
                    }
                    else {
                        alreadySeen.add(output_cluster);
                    }
                    output_clusters.add(output_cluster);
                }
                input_clusters = output_clusters;
            }
            FileOutputStream outstream;
            PrintStream p;
            outstream = new FileOutputStream(input_file + "_merged");
            p = new PrintStream(outstream);
            for (String s : output_clusters) {
                p.println(s);
            }
            p.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
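
/*
 * Worked example for mergeClustersWithSharedSenses, with hypothetical sense
 * ids: given the input clusters {s1, s2}, {s2, s3} and {s4, s5}, the first two
 * share the sense s2, so one merge pass yields {s1, s2, s3} and {s4, s5}. A
 * second pass changes nothing, the cluster count stays stable, and the loop
 * terminates.
 */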