/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.alignment.framework.gloss;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import de.tudarmstadt.ukp.alignment.framework.Global;
import de.tudarmstadt.ukp.alignment.framework.graph.OneResourceBuilder;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import dkpro.similarity.algorithms.api.TextSimilarityMeasure;
import dkpro.similarity.algorithms.lexical.string.CosineSimilarity;
/**
 * Computes gloss-based cosine similarities between sense candidates of two resources and
 * derives an unsupervised alignment from them.
 *
 * <p>The three processing steps communicate through files below {@code target/}:
 * {@link #createIdfFiles} writes the combined idf files, {@link #calculateSimilarityForCandidates}
 * reads a candidate file (and, for tfidf weighting, an idf file) and writes a similarity file,
 * and {@link #createAlignmentFromSimilarityFileUnsupervised} reads that similarity file and
 * writes the alignment file.
 */
public class GlossSimilarityCalculator
{
    /**
     * Gloss frequency per lexeme, summed over both resources. Rebuilt by every call of
     * {@link #createIdfFiles}; the idf file derived from it is the one used for tagged glosses.
     */
    public static HashMap<String,Double> combinedLexemeFreqInGlosses = new HashMap<String, Double>();

    /**
     * Gloss frequency per lemma, summed over both resources. Rebuilt by every call of
     * {@link #createIdfFiles}; the idf file derived from it is the one used for plain glosses.
     */
    public static HashMap<String,Double> combinedLemmaFreqInGlosses = new HashMap<String, Double>();

    /**
     * Example pipeline: builds the two resources, writes the combined idf files, computes the
     * similarities for the candidates in {@code target/ijcnlp2011-meyer-dataset_graph.csv} and
     * creates the alignment file from them.
     *
     * @param args not used
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException
    {
        /* GLOBAL SETTINGS */
        Global.init();
        final String language = ELanguageIdentifier.ENGLISH;

        // NOTE(review): database name and credentials are hard-coded below; consider moving
        // them to configuration before this is run outside a local development setup.

        /* RESOURCE 1 */
        boolean synset1 = true;
        boolean usePos1 = true;
        final int prefix1 = Global.WN_Synset_prefix;
        OneResourceBuilder bg_1 = new OneResourceBuilder("uby_release_1_0","root","fortuna", prefix1,language,synset1,usePos1);
        // Optional preprocessing, only needed when the gloss files do not exist yet:
        // final int chunksize1 = 2000;
        // bg_1.createGlossFile(false);
        // bg_1.lemmatizePOStagGlossFileInChunks(chunksize1);
        bg_1.fillIndexTables();

        /* RESOURCE 2 */
        boolean synset2 = false;
        boolean usePos2 = true;
        final int prefix2 = Global.WKT_EN_prefix;
        // final int chunksize2 = 1000;
        OneResourceBuilder bg_2 = new OneResourceBuilder("uby_release_1_0","root","fortuna",prefix2,language,synset2,usePos2);
        // bg_2.createGlossFile(false);
        // bg_2.lemmatizePOStagGlossFileInChunks(chunksize2);
        bg_2.fillIndexTables();

        boolean useTaggedGloss = true;
        boolean tfidf = true;
        createIdfFiles(bg_1, bg_2);
        calculateSimilarityForCandidates(bg_1, bg_2,useTaggedGloss, tfidf,"target/ijcnlp2011-meyer-dataset_graph.csv");

        boolean onlyGreaterZero = true;
        createAlignmentFromSimilarityFileUnsupervised(bg_1, bg_2,useTaggedGloss, tfidf, onlyGreaterZero);
        // boolean extRef = true;
        // Global.mapAlignmentToUby(bg_1,bg_2,"target/"+bg_1.prefix_string+"_"+bg_2.prefix_string+"_alignment_similarity_"+(bg_2.pos ? "Pos": "noPos")+(tfidf ? "_tfidf": "")+(onlyGreaterZero ? "_nonZero" :"")+".txt", extRef);
    }

    /**
     * Creates combined idf values for the two resources to enable tfidf-weighted cosine
     * similarity calculation.
     *
     * <p>The gloss frequencies of both resources are summed per term into
     * {@link #combinedLexemeFreqInGlosses} and {@link #combinedLemmaFreqInGlosses}, and for each
     * term {@code ln(totalGlossCount / frequency)} is written as {@code term<TAB>idf} to
     * {@code target/<prefix1>_<prefix2>_combined_lexeme_idf.txt} and
     * {@code target/<prefix1>_<prefix2>_combined_lemma_idf.txt}. Failures while writing are
     * reported on stderr and not propagated.
     *
     * @param gb1 first resource; its frequency tables and gloss count are read
     * @param gb2 second resource; its frequency tables and gloss count are read
     */
    public static void createIdfFiles(OneResourceBuilder gb1, OneResourceBuilder gb2)
    {
        // The maps are static: reset them so that a repeated call (or a call for another
        // resource pair) does not add on top of previously accumulated counts.
        combinedLexemeFreqInGlosses.clear();
        combinedLemmaFreqInGlosses.clear();

        for (String lexeme : gb1.lexemeFreqInGlosses.keySet())
        {
            double frequency = gb1.lexemeFreqInGlosses.get(lexeme);
            addFrequency(combinedLexemeFreqInGlosses, lexeme, frequency);
        }
        for (String lemma : gb1.lemmaFreqInGlosses.keySet())
        {
            double frequency = gb1.lemmaFreqInGlosses.get(lemma);
            addFrequency(combinedLemmaFreqInGlosses, lemma, frequency);
        }
        for (String lexeme : gb2.lexemeFreqInGlosses.keySet())
        {
            double frequency = gb2.lexemeFreqInGlosses.get(lexeme);
            addFrequency(combinedLexemeFreqInGlosses, lexeme, frequency);
        }
        for (String lemma : gb2.lemmaFreqInGlosses.keySet())
        {
            double frequency = gb2.lemmaFreqInGlosses.get(lemma);
            addFrequency(combinedLemmaFreqInGlosses, lemma, frequency);
        }

        try
        {
            double overallGlossSize = gb1.gloss_count+gb2.gloss_count;
            writeIdfFile(idfFileName(gb1, gb2, true), combinedLexemeFreqInGlosses, overallGlossSize);
            writeIdfFile(idfFileName(gb1, gb2, false), combinedLemmaFreqInGlosses, overallGlossSize);
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Calculates the cosine similarities between the glosses of two resources. The candidate
     * file needs to be created beforehand; for tfidf weighting the idf files written by
     * {@link #createIdfFiles} must exist as well.
     *
     * <p>Lines of the candidate file starting with {@code p} produce the header line of the
     * output, lines starting with {@code q} are read as {@code q <id1> <id2>} (space separated),
     * all other lines are ignored. For every candidate pair one line
     * {@code id1<TAB>id2<TAB>similarity} is written; a pair for which either gloss is missing
     * gets the similarity {@code 0.0}. Malformed {@code q} lines are skipped with a warning on
     * stderr. Any other failure is reported on stderr and ends the processing.
     *
     * @param gb1 first resource, provides the glosses for the first id of each pair
     * @param gb2 second resource, provides the glosses for the second id of each pair
     * @param pos use the tagged glosses (and the lexeme idf file) instead of the plain glosses
     *            (and the lemma idf file)
     * @param tfidf use tfidf weighting
     * @param candidatesFile path of the candidate file to read
     */
    public static void calculateSimilarityForCandidates(OneResourceBuilder gb1, OneResourceBuilder gb2, boolean pos, boolean tfidf, String candidatesFile)
    {
        // Reader first, then writer: a missing candidate file must not leave an empty output file.
        try (BufferedReader candidateReader = new BufferedReader(new FileReader(candidatesFile)); //+"_short"
             PrintStream out = new PrintStream(new FileOutputStream(similarityFileName(gb1, gb2, pos, tfidf))))
        {
            /*TODO
             *
             * Other similarity measures can be integrated here
             *
             * */
            TextSimilarityMeasure measure;
            if(tfidf)
            {
                String idfScoresFile = idfFileName(gb1, gb2, pos);
                measure = new CosineSimilarity(CosineSimilarity.WeightingModeTf.FREQUENCY_LOGPLUSONE,CosineSimilarity.WeightingModeIdf.LOGPLUSONE,CosineSimilarity.NormalizationMode.L2,idfScoresFile);
            }
            else
            {
                measure = new CosineSimilarity();
            }

            String line;
            while((line = candidateReader.readLine()) != null)
            {
                if(line.startsWith("p"))
                {
                    out.println("f "+gb1.prefix_string+"_"+gb2.prefix_string+"_candidates_"+(gb2.pos ? "Pos": "noPos")+" "+"Cosine similarity");
                    continue;
                }
                if(!line.startsWith("q"))
                {
                    continue;
                }

                String[] fields = line.split(" ");
                if(fields.length < 3)
                {
                    // One broken line should not abort the whole run.
                    System.err.println("Skipping malformed candidate line: "+line);
                    continue;
                }
                String id1 = fields[1];
                String id2 = fields[2];

                String gloss1;
                String gloss2;
                if(pos)
                {
                    gloss1 = gb1.senseIdGlossPos.get(id1);
                    gloss2 = gb2.senseIdGlossPos.get(id2);
                }
                else
                {
                    gloss1 = gb1.senseIdGloss.get(id1);
                    gloss2 = gb2.senseIdGloss.get(id2);
                }

                if(gloss1 == null || gloss2 == null)
                {
                    out.println(id1+"\t"+id2+"\t"+0.0);
                }
                else
                {
                    List<String> tokens1 = Arrays.asList(gloss1.split(" "));
                    List<String> tokens2 = Arrays.asList(gloss2.split(" "));
                    double similarity = measure.getSimilarity(tokens1, tokens2);
                    out.println(id1+"\t"+id2+"\t"+similarity);
                }
            }
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Creates an alignment from the similarity file in an unsupervised way by picking, for each
     * source id, the candidate with the greatest similarity value.
     *
     * <p>The input is the file written by {@link #calculateSimilarityForCandidates} for the same
     * {@code pos}/{@code tfidf} settings. Its header line (starting with {@code f}) is copied
     * with {@code " alignment"} appended; every other line is read as
     * {@code id1<TAB>id2<TAB>similarity}. Lines that cannot be parsed are skipped with a warning
     * on stderr. For each source id one line {@code id1<TAB>bestId2<TAB>similarity} is written.
     * Among candidates with equal similarity the choice depends on the iteration order of the
     * candidate set. Any other failure is reported on stderr and ends the processing.
     *
     * @param gb1 first resource; only its prefix is used for the file names
     * @param gb2 second resource; its prefix and pos flag are used for the file names
     * @param pos the similarities were computed on tagged glosses (selects the input file)
     * @param tfidf the similarities were computed with tfidf weighting (selects the input file)
     * @param onlyGreaterZero only consider candidates with a similarity value greater than zero
     */
    public static void createAlignmentFromSimilarityFileUnsupervised(OneResourceBuilder gb1, OneResourceBuilder gb2, boolean pos, boolean tfidf,boolean onlyGreaterZero)
    {
        try (BufferedReader similarityReader = new BufferedReader(new FileReader(similarityFileName(gb1, gb2, pos, tfidf)));
             PrintStream alignmentOut = new PrintStream(new FileOutputStream(outputPrefix(gb1, gb2)+"_alignment_similarity_"+(gb2.pos ? "Pos": "noPos")+""+(tfidf? "_tfidf" :"")+(onlyGreaterZero ? "_nonZero":"")+".txt")))
        {
            // source id -> set of "targetId#similarity" entries
            HashMap<String,HashSet<String> > candidatesBySource = new HashMap<String,HashSet<String> > ();

            String line;
            int parsedLines = 0;
            while((line = similarityReader.readLine()) != null)
            {
                if(line.startsWith("f"))
                {
                    alignmentOut.println(line+" alignment");
                    continue;
                }
                System.out.println("Source Nodes parsed "+parsedLines++);

                String[] fields = line.split("\t");
                if(fields.length < 3)
                {
                    System.err.println("Skipping malformed similarity line: "+line);
                    continue;
                }
                double similarity;
                try
                {
                    similarity = Double.parseDouble(fields[2]);
                }
                catch(NumberFormatException e)
                {
                    System.err.println("Skipping similarity line with unparsable value: "+line);
                    continue;
                }

                HashSet<String> candidates = candidatesBySource.get(fields[0]);
                if(candidates == null)
                {
                    candidates = new HashSet<String>();
                    candidatesBySource.put(fields[0], candidates);
                }
                candidates.add(fields[1]+"#"+similarity);
            }

            /* HERE THE ACTUAL ANALYSIS BEGINS */
            for(String sourceId : candidatesBySource.keySet())
            {
                String bestTarget = null;
                double bestSimilarity = Double.NaN;
                for(String candidate : candidatesBySource.get(sourceId))
                {
                    // Split at the LAST '#': the similarity never contains one, an id might.
                    int separator = candidate.lastIndexOf('#');
                    String targetId = candidate.substring(0, separator);
                    double similarity = Double.parseDouble(candidate.substring(separator+1));
                    // The first candidate always seeds the maximum, so a real target id is
                    // chosen even when no value reaches 0.0; a NaN maximum is replaced by any
                    // later candidate.
                    if(bestTarget == null || Double.isNaN(bestSimilarity) || similarity >= bestSimilarity)
                    {
                        bestTarget = targetId;
                        bestSimilarity = similarity;
                    }
                }
                if(bestTarget != null && (bestSimilarity > 0 || !onlyGreaterZero))
                {
                    alignmentOut.println(sourceId+"\t"+bestTarget+"\t"+bestSimilarity);
                }
            }
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    /** Adds {@code frequency} to the value stored for {@code term}, treating a missing entry as 0. */
    private static void addFrequency(HashMap<String, Double> combined, String term, double frequency)
    {
        Double previous = combined.get(term);
        double base = 0.0;
        if(previous != null)
        {
            base = previous;
        }
        combined.put(term, base + frequency);
    }

    /** Writes one {@code term<TAB>ln(overallGlossSize / frequency)} line per entry to {@code path}. */
    private static void writeIdfFile(String path, HashMap<String, Double> frequencies, double overallGlossSize) throws IOException
    {
        try (PrintStream out = new PrintStream(new FileOutputStream(path)))
        {
            for(String term : frequencies.keySet())
            {
                double freq = frequencies.get(term);
                double idf = Math.log(overallGlossSize/freq);
                out.println(term+"\t"+idf);
            }
        }
    }

    /** Returns the common {@code target/<prefix1>_<prefix2>} start of all files of a resource pair. */
    private static String outputPrefix(OneResourceBuilder gb1, OneResourceBuilder gb2)
    {
        return "target/"+gb1.prefix_string+"_"+gb2.prefix_string;
    }

    /** Returns the name of the combined lexeme ({@code lexeme == true}) or lemma idf file. */
    private static String idfFileName(OneResourceBuilder gb1, OneResourceBuilder gb2, boolean lexeme)
    {
        return outputPrefix(gb1, gb2)+(lexeme ? "_combined_lexeme_idf.txt" : "_combined_lemma_idf.txt");
    }

    /** Returns the name of the gloss similarity file for the given settings. */
    private static String similarityFileName(OneResourceBuilder gb1, OneResourceBuilder gb2, boolean pos, boolean tfidf)
    {
        return outputPrefix(gb1, gb2)+"_glossSimilarities"+(pos? "_tagged": "_plain")+(tfidf? "_tfidf": "")+".txt";
    }
}