/*******************************************************************************
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.alignment.framework.graph;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;

import de.tudarmstadt.ukp.alignment.framework.Global;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;

public class JointGraphBuilder
{

    /**
     * This method is the "starting point" of the alignment framework. It encodes the process from
     * the creation of the individual resource graphs to their merging into one big joint graph
     * using monosemous linking.
     */
    public static void main(String[] args)
        throws IOException, ClassNotFoundException, SQLException
    {
        /* GLOBAL SETTINGS */
        Global.init();
        String language = ELanguageIdentifier.ENGLISH; // We cover only the monolingual case for now

        /* RESOURCE 1 */
        boolean synset1 = true;
        boolean usePos1 = true;

        // Choose the resources we want to align by selecting the appropriate prefixes
        int prefix1 = Global.WN_Synset_prefix;
        String prefix_string1 = Global.prefixTable.get(prefix1);

        // Frequency threshold for the monosemous linking
        final int monoLinkThreshold1 = 1000;

        // Chunk size for the POS tagging of the glosses. This is mostly a memory issue:
        // higher values are faster, but might lead to crashes
        // final int chunksize1 = 2000;

        // Build the resource by using the appropriate databases
        OneResourceBuilder bg_1 = new OneResourceBuilder("uby_release_1_0", "root", "fortuna",
                prefix1, language, synset1, usePos1);

        // Create text files with glosses for the two resources, and do POS tagging
        // bg_1.createGlossFile(false);
        // bg_1.lemmatizePOStagGlossFileInChunks(chunksize1);

        // Fill the index, build graphs from the relations and the monosemous linking - merge in the end
        bg_1.fillIndexTables();
        // bg_1.builtRelationGraphFromDb(false);
        // bg_1.createMonosemousLinks(monoLinkThreshold1);
        //
        // Global.mergeTwoGraphs(prefix_string1+"_"+(synset1?"synset":"sense")+"_relationgraph.txt",
        //     prefix_string1+"_"+(synset1?"synset":"sense")+"_"+(usePos1 ? "Pos":"noPos")+"_monosemousLinks"+"_"+monoLinkThreshold1+".txt",
        //     prefix_string1+"_"+(synset1?"synset":"sense")+"_"+(usePos1 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold1+".txt");
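        // Note (editorial assumption): the commented-out steps above (createGlossFile,
        // lemmatizePOStagGlossFileInChunks, builtRelationGraphFromDb, createMonosemousLinks and the
        // commented-out mergeTwoGraphs call) appear to be one-off preprocessing steps: they presumably
        // have to be run once per resource so that the *_relationMLgraph_*.txt file read by the active
        // merge steps further below actually exists; on later runs they can stay commented out.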
"Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold1+".txt"); /*RESOURCE 2*/ boolean synset2 = false; boolean usePos2 = true; final int prefix2 = Global.WKT_EN_prefix; final String prefix_string2 = Global.prefixTable.get(prefix2); final int monoLinkThreshold2 = 2000; // final int chunksize2 = 2000; OneResourceBuilder bg_2 = new OneResourceBuilder("uby_release_1_0","root","fortuna",prefix2,language,synset2,usePos2); // bg_2.createGlossFile(false); //bg_2.lemmatizePOStagGlossFileInChunks(chunksize2); bg_2.fillIndexTables(); // boolean filter = false; // bg_2.builtRelationGraphFromDb(filter); // bg_2.createMonosemousLinks(monoLinkThreshold2); // // Global.mergeTwoGraphs(prefix_string2+"_"+(synset2?"synset":"sense")+"_relationgraph.txt" , // prefix_string2+"_"+(synset2?"synset":"sense")+"_"+(usePos2 ? "Pos":"noPos")+"_monosemousLinks"+"_"+monoLinkThreshold2+".txt", // prefix_string2+"_"+(synset2?"synset":"sense")+"_"+(usePos2 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold2+".txt"); /*Merge the two graphs*/ Global.mergeTwoGraphs( prefix_string1+"_"+(synset1?"synset":"sense")+"_"+(usePos1 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold1+".txt", prefix_string2+"_"+(synset2?"synset":"sense")+"_"+(usePos2 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold2+".txt", // prefix_string2+"_"+(synset2?"synset":"sense")+"_relationgraph"+(filter ? "_filtered":"")+".txt", prefix_string1+"_"+(synset1?"synset":"sense")+"_"+(usePos1 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold1 +"_MERGED_"+ prefix_string2+"_"+(synset2?"synset":"sense")+"_"+(usePos2 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold2+".txt" ); /*Create trivial alignments between the two LSRs*/ /*Index tables must be filled at this point!!!*/ createTrivialAlignments(bg_1, bg_2); /*Merge the joint graphs and trivial alignments*/ Global.mergeTwoGraphs( prefix_string1+"_"+(synset1?"synset":"sense")+"_"+(usePos1 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold1 +"_MERGED_"+ prefix_string2+"_"+(synset2?"synset":"sense")+"_"+(usePos2 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold2+".txt", prefix_string1+"_"+prefix_string2+"_trivial_"+(usePos2 ? "Pos": "noPos")+".txt", prefix_string1+"_"+(synset1?"synset":"sense")+"_"+(usePos1 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold1 +"_MERGED_"+ prefix_string2+"_"+(synset2?"synset":"sense")+"_"+(usePos2 ? "Pos":"noPos")+"_relationMLgraph"+"_"+monoLinkThreshold2+ "_trivial.txt" ); //Done! We now have two linked graphs which are connected via monosemous links } /** * * Creates the trivial alignment between two resource graphs, i.e. those between lemmas with only one sense in either LSR * * @param gb1 First LSR * @param gb2 Second LSR * */ public static void createTrivialAlignments(OneResourceBuilder gb1, OneResourceBuilder gb2) throws ClassNotFoundException, SQLException, IOException { StringBuilder sb = new StringBuilder(); int edge_count = 0; int maxId = 0; FileOutputStream outstream; PrintStream p; outstream = new FileOutputStream( "target/"+gb1.prefix_string+"_"+gb2.prefix_string+"_trivial_"+(gb2.pos ? 
"Pos": "noPos")+".txt"); p = new PrintStream( outstream ); for(String lemmaPos: gb1.lemmaPosSenses.keySet()) { if(gb1.lemmaPosSenses.get(lemmaPos).size()==1) //if there is only one sense for this lexeme { String id1= gb1.lemmaPosSenses.get(lemmaPos).iterator().next(); int id1_int = Integer.parseInt(id1); if(gb2.pos) { //if resource 2 uses POS if(gb2.lemmaPosSenses.get(lemmaPos)!= null && gb2.lemmaPosSenses.get(lemmaPos).size()==1) //if there is only one sense for this lexeme { String id2= gb2.lemmaPosSenses.get(lemmaPos).iterator().next(); int id2_int = Integer.parseInt(id2); //Retrieve the largest ID, so that it can be used as a value for the graph algortihm input file if(id1_int > maxId) { maxId = id1_int; } if(id2_int > maxId) { maxId = id2_int; } //sb.append("a "+id1+" "+id2+" 1"+Global.LF); //edges are unweighted //sb.append("a "+id2+" "+id1+" 1"+Global.LF); sb.append("e"+edge_count+++" "+id1+" "+id2+Global.LF); } } else { String lemma = lemmaPos.split("#")[0]; if(gb2.lemmaPosSenses.get(lemma)!= null && gb2.lemmaPosSenses.get(lemma).size()==1) { String id2= gb2.lemmaPosSenses.get(lemma).iterator().next(); int id2_int = Integer.parseInt(id2); if(id1_int > maxId) { maxId = id1_int; } if(id2_int > maxId) { maxId = id2_int; } // sb.append("a "+id1+" "+id2+" 1"+Global.LF); //edges are unweighted // sb.append("a "+id2+" "+id1+" 1"+Global.LF); sb.append("e"+edge_count+++" "+id1+" "+id2+Global.LF); } } } } //p.println("p sp "+maxId+" "+count); String header = "graph class=grph.in_memory.InMemoryGrph"; p.println(header); p.print(sb.toString()); p.close(); } /** * * Calculates the overlap between the vocabulary used in two resources * * @param gb1 First LSR * @param gb2 Second LSR * */ public static void calculateLexicalGlossOverlap(OneResourceBuilder gb1, OneResourceBuilder gb2) throws ClassNotFoundException, SQLException, IOException { Set<String> lexemes1 = gb1.lemmaFreqInGlosses.keySet(); Set<String> lexemes2 = gb2.lemmaFreqInGlosses.keySet(); double size1 = lexemes1.size(); double size2 = lexemes2.size(); lexemes1.retainAll(lexemes2); double overlap = lexemes1.size(); System.out.println(gb1.prefix_string+" "+size1); System.out.println(gb2.prefix_string+" "+size2); System.out.println("Common: "+" "+overlap); System.out.println(gb1.prefix_string+" overlap "+(overlap/size1)); System.out.println(gb2.prefix_string+" overlap "+(overlap/size2)); } }