/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ /* * */ package de.tudarmstadt.ukp.alignment.framework.graph; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.sql.Connection; import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.HashMap; import java.util.HashSet; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; import de.tudarmstadt.ukp.alignment.framework.Global; import de.tudarmstadt.ukp.alignment.framework.uima.Toolkit; import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier; /* * This class is responsible for handling all operations concerning one singular resource * * */ public class OneResourceBuilder { public TreeMap<String,HashSet<String>> lemmaPosSenses; //Index for the senses a lexeme has public TreeMap<String,HashSet<String>> senseIdLemma; //Index for the lemmas for senses public TreeMap<String,String> lemmaIdWrittenForm; //Index for the written form for lemmas public TreeMap<String,Integer> lexemeFreqInGlosses; //Frequency of lexemes across all glosses public TreeMap<String,Integer> lemmaFreqInGlosses;//Frequency of lemmas across all glosses public TreeMap<String, String> senseIdGloss; //Index for the gloss of a senses public TreeMap<String,String> senseIdGlossPos; //Index for the pos-tagged gloss of a sense //Basic characteristics of the resources public Connection connection; public int prefix; public String prefix_string; public boolean synset; public boolean pos; public String language; public int gloss_count; public OneResourceBuilder(String dbname, String user, String pass, int prefix, String language, boolean synset, boolean pos) { senseIdLemma = new TreeMap<String, HashSet<String>>(); lemmaIdWrittenForm = new TreeMap<String, String>(); lemmaPosSenses = new TreeMap<String, HashSet<String>>(); lexemeFreqInGlosses = new TreeMap<String, Integer>(); lemmaFreqInGlosses = new TreeMap<String, Integer>(); // HashMap<Integer,String> senseIdGloss = new HashMap<Integer, String>(); // HashMap<Integer,String> senseIdGlossPos = new HashMap<Integer, String>(); try { Class.forName("com.mysql.jdbc.Driver"); connection = DriverManager.getConnection("jdbc:mysql://localhost/"+dbname,user,pass); System.out.println(connection.isClosed()); this.prefix =prefix; this.prefix_string = Global.prefixTable.get(prefix); this.synset = synset; this.pos = pos; this.language = language; // int gloss_count = 0; } catch (SQLException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } } /** * This method creates a graph from the semantic relations encoded in UBY * * param filterByGloss only considers relation targets that are contained within the gloss, or the first paragraph. This option mostly contains Wikipedia * as described in the paper. * */ public void builtRelationGraphFromDb(boolean filterByGloss) throws ClassNotFoundException, SQLException, IOException { FileOutputStream outstream; PrintStream p; outstream = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_relationgraph"+(filterByGloss?"_filtered":"")+".txt"); p = new PrintStream( outstream ); StringBuilder sb = new StringBuilder(); Statement statement = connection.createStatement(); ResultSet rs; if(synset) { rs = statement.executeQuery("SELECT synsetId,target FROM SynsetRelation where synsetId like '"+prefix_string+"%'"); } else { //Special handling of FrameNet, as relations are expressed differently here if(prefix == Global.FN_prefix) { rs = statement.executeQuery("SELECT distinct pr1.senseId, pr2.senseId FROM PredicativeRepresentation pr1 join PredicativeRepresentation pr2 where pr1.predicate = pr2.predicate and pr1.senseId like 'FN%' and pr2.senseId like 'FN%' and pr1.senseId != pr2.senseId"); } else { rs = statement.executeQuery("SELECT senseId,target FROM SenseRelation where senseId like '"+prefix_string+"%'"); } } int max_id = 0; int edge_count = 0; while(rs.next()) { String id1 = rs.getString(1); String id2 = rs.getString(2); // System.out.println(id1+" "+id2); // System.out.println(count); if(id2 == null) { continue; } if(synset) { id1 = prefix+id1.split("ynset_")[1]; id2 = prefix+id2.split("ynset_")[1]; } else { id1 = prefix+id1.split("ense_")[1]; id2 = prefix+id2.split("ense_")[1]; } //HashSet<String> lemmas1 = senseIdLemma.get(id1); HashSet<String> lemmas2 = senseIdLemma.get(id2); String gloss1; if(filterByGloss) { gloss1 = senseIdGloss.get(id1); } else { gloss1 = senseIdGlossPos.get(id1); } if(gloss1 == null) { gloss1 = ""; } String[] gloss_array1 = gloss1.split(" "); //String[] gloss2 = senseIdGlossPos.get(id2).split(" "); for(String s : gloss_array1) { if(lemmas2.contains(s) || !filterByGloss ) { int id1_num = Integer.parseInt(id1); int id2_num = Integer.parseInt(id2); if(id1_num > max_id) { max_id = id1_num; } if(id2_num > max_id) { max_id = id2_num; } // sb.append("a "+id1_num+" "+id2_num+" 1\n"); sb.append("e"+edge_count+++" "+id1_num+" "+id2_num+Global.LF); if(prefix != Global.FN_prefix) { //sb.append("a "+id2_num+" "+id1_num+" 1\n"); //count+=1; } break; //System.out.println(senseIdGlossPos.get(id1)); // for(String l : lemmas2) { // System.out.println(l); // } } } } // String header = "p sp "+max_id+" "+count; String header = "graph class=grph.in_memory.InMemoryGrph"; p.println(header); p.print(sb.toString()); p.flush(); p.close(); rs.close(); statement.close(); } /** * This method outputs an analysis of how many lemmas and senses we have in the resource for each part of speech for a given list of lemmas. * * * */ public void analyizeLemmaList(String input) throws ClassNotFoundException, SQLException, IOException { FileReader in = new FileReader("/home/matuschek/ClusterEvaluationTM/GermaNet/WebCAGe-2.0_lemmas.tsv"); double total_count = 0; double n_count = 0 ; double v_count = 0 ; double a_count = 0 ; double total_counts = 0; double n_counts = 0 ; double v_counts = 0 ; double a_counts = 0 ; double total_mono = 0; double n_mono = 0 ; double v_mono = 0 ; double a_mono = 0 ; BufferedReader inp = new BufferedReader(in); Statement statement = connection.createStatement(); String line; String lemma =""; String pos =""; ResultSet rs; while((line =inp.readLine())!=null) { lemma = line.split("\t")[0]; pos = line.split("\t")[1]; rs = statement.executeQuery("select writtenForm,count(senseId) from LexicalEntry join FormRepresentation_Lemma join Sense where Sense.lexicalEntryId = LexicalEntry.lexicalEntryId and FormRepresentation_Lemma.lemmaId= LexicalEntry.lemmaId and writtenForm = '"+lemma+"' and partOfSpeech like '"+pos+"%' and Sense.senseId like 'GN%' group by writtenForm"); while(rs.next()) { total_count++; int senses = rs.getInt(2); if(senses==1) { total_mono++; } total_counts+=senses; if(pos.equals("n")) { n_count++; n_counts+=senses; if(senses==1) { n_mono++; } } else if(pos.equals("v")) { v_count++; v_counts+=senses; if(senses==1) { v_mono++; } } else if(pos.equals("a")) { a_count++; a_counts+=senses; if(senses==1) { a_mono++; } } } } inp.close(); System.out.println("Total:"); System.out.println(total_count); System.out.println(total_counts); System.out.println(total_counts/total_count); System.out.println(total_mono); System.out.println("N:"); System.out.println(n_count); System.out.println(n_counts); System.out.println(n_counts/n_count); System.out.println(n_mono); System.out.println("V:"); System.out.println(v_count); System.out.println(v_counts); System.out.println(v_counts/v_count); System.out.println(v_mono); System.out.println("A:"); System.out.println(a_count); System.out.println(a_counts); System.out.println(a_counts/a_count); System.out.println(a_mono); } /** * This method outputs type-token-ratio of the glosses of the resource. * */ public void typeTokenRatio() throws ClassNotFoundException, SQLException, IOException { double token_count = 0; FileOutputStream outstream; int count = 0; TreeSet<String> coveredSenses = new TreeSet<String>(); PrintStream p; outstream = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_ttr.txt"); p = new PrintStream( outstream ); Statement statement = connection.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); statement.setFetchSize(Integer.MIN_VALUE); ResultSet rs; final Pattern CLEANUP = Pattern.compile("[^A-Za-z0-9äöüÄÖÜß]+"); if(synset) { rs = statement.executeQuery("SELECT synsetId, writtenText FROM Definition join TextRepresentation_Definition where synsetId like '"+prefix_string+"%' and Definition.definitionId = TextRepresentation_Definition.definitionId and length(writtenText)>0"); } else { rs = statement.executeQuery("SELECT senseId, writtenText FROM Definition join TextRepresentation_Definition where senseId like '"+prefix_string+"%' and Definition.definitionId = TextRepresentation_Definition.definitionId and length(writtenText)>0"); } while(rs.next()) { if(rs.getString(2)!=null) { String id = rs.getString(1); if(synset) { id = prefix+id.split("ynset_")[1]; } else { id = prefix+id.split("ense_")[1]; } count++; if(count % 1000 == 0) { System.out.println(count); } String gloss = CLEANUP.matcher(rs.getString(2)).replaceAll(" "); gloss = gloss.replace("\n", "").replace("\r", "").replace("\t", " ").trim(); coveredSenses.add(id); String[] result=gloss.split(" "); for(String s : result) { token_count++; if(!lemmaFreqInGlosses.containsKey(s)) { lemmaFreqInGlosses.put(s, 0); } int freq = lemmaFreqInGlosses.get(s); lemmaFreqInGlosses.put(s, freq+1); //System.out.println(s); } } } p.println("Tokens: "+token_count); p.println("Types: "+lemmaFreqInGlosses.keySet().size()); p.println("Ratio: "+(lemmaFreqInGlosses.keySet().size()/token_count )); rs.close(); statement.close(); p.close(); } /** * This method outputs the gloss file for the resource * * @param createLexicalFieldIfEmpty states whether, in case the gloss is empty, an artificial gloss should be created by using related senses/synsets. This is in line with * the Lexical Fields introduced by Henrich et al. * */ public void createGlossFile(boolean createLexicalFieldIfEmpty) throws ClassNotFoundException, SQLException, IOException { FileOutputStream outstream; FileOutputStream outstream_freq; HashSet<String> coveredSenses = new HashSet<String>(); PrintStream p; PrintStream p_freq; outstream = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_glosses.txt"); p = new PrintStream( outstream ); outstream_freq = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_lemma_frequencies.txt"); p_freq = new PrintStream( outstream_freq ); Statement statement = connection.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); statement.setFetchSize(Integer.MIN_VALUE); ResultSet rs; final Pattern CLEANUP = Pattern.compile("[^A-Za-z0-9äöüÄÖÜß]+"); if(synset) { rs = statement.executeQuery("SELECT synsetId, writtenText FROM Definition join TextRepresentation_Definition where synsetId like '"+prefix_string+"%' and Definition.definitionId = TextRepresentation_Definition.definitionId and length(writtenText)>0"); } else { rs = statement.executeQuery("SELECT senseId, writtenText FROM Definition join TextRepresentation_Definition where senseId like '"+prefix_string+"%' and Definition.definitionId = TextRepresentation_Definition.definitionId and length(writtenText)>0"); } while(rs.next()) { if(rs.getString(2)!=null) { String id = rs.getString(1); if(synset) { id = prefix+id.split("ynset_")[1]; } else { id = prefix+id.split("ense_")[1]; } String gloss = CLEANUP.matcher(rs.getString(2)).replaceAll(" "); gloss = gloss.replace("\n", "").replace("\r", "").replace("\t", " ").trim(); p.println(id+"\t"+gloss); coveredSenses.add(id); String[] result=gloss.split(" "); for(String s : result) { if(!lemmaFreqInGlosses.containsKey(s)) { lemmaFreqInGlosses.put(s, 0); } int freq = lemmaFreqInGlosses.get(s); lemmaFreqInGlosses.put(s, freq+1); } } } if(createLexicalFieldIfEmpty) { HashMap<String,HashSet<String>> idMap = new HashMap<String, HashSet<String>>(); rs = statement.executeQuery("SELECT SenseRelation.senseId, writtenForm FROM SenseRelation " + "join Sense join LexicalEntry join FormRepresentation_Lemma " + "where Sense.lexicalEntryId = LexicalEntry.lexicalEntryId and SenseRelation.target =Sense.senseId " + "and Sense.senseID like '"+prefix_string+"%' and FormRepresentation_Lemma.lemmaId = LexicalEntry.lemmaId " + "and (relName like 'hyperynym' or relName like 'hyponym' or relName like 'synonym')"); while(rs.next()) { String id1 = rs.getString(1); if(!idMap.containsKey(id1)) { idMap.put(id1,new HashSet<String>()); } HashSet<String> temp = idMap.get(id1); temp.add(rs.getString(2)); } for(String s : idMap.keySet()) { String lf = ""; for(String l : idMap.get(s)) { lf+=l+" "; } lf = lf.trim(); lf = CLEANUP.matcher(lf).replaceAll(" "); lf = lf.replace("\n", "").replace("\r", "").replace("\t", " ").trim(); String id = prefix+s.split("ense_")[1]; p.println(id+"\t"+lf); String[] result=lf.split(" "); for(String r : result) { if(!lemmaFreqInGlosses.containsKey(r)) { lemmaFreqInGlosses.put(r, 0); } int freq = lemmaFreqInGlosses.get(r); lemmaFreqInGlosses.put(r, freq+1); //System.out.println(s); } } } for(String lemma : lemmaFreqInGlosses.keySet()) { p_freq.println(lemma+"\t"+lemmaFreqInGlosses.get(lemma)); } p_freq.close(); rs.close(); statement.close(); p.close(); } /** * This method lemmatizes and POS-tags the given gloss files * * @param chunk_size the chunk size to be processed at a time. The higher, the better, but also the more memory is consumed * */ public void lemmatizePOStagGlossFileInChunks(int chunk_size) { int i = 0; int line_count = 1; FileOutputStream outstream; FileOutputStream outstream_freq; PrintStream p; PrintStream p_freq ; try { FileReader in = new FileReader("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_glosses.txt"); BufferedReader input_reader = new BufferedReader(in); String line; outstream = new FileOutputStream("target/"+prefix+"temp_"+i); p = new PrintStream( outstream ); while((line =input_reader.readLine())!=null) { if(line_count % chunk_size ==0) { outstream.flush(); outstream.close(); p.close(); lemmatizePOStagGlossFile(prefix+"temp_"+i, prefix+"temp_tagged_"+i,prefix+"temp_freq_"+i, prefix, language); File f = new File("target/"+prefix+"temp_"+i); f.delete(); i++; outstream = new FileOutputStream("target/"+prefix+"temp_"+i); p = new PrintStream( outstream ); } p.println(line); line_count++; } outstream.close(); p.close(); lemmatizePOStagGlossFile(prefix+"temp_"+i, prefix+"temp_tagged_"+i,prefix+"temp_freq_"+i, prefix, language); File f = new File("target/"+prefix+"temp_"+i); f.delete(); outstream = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_glosses_tagged.txt"); outstream_freq = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_lexeme_frequencies.txt"); p = new PrintStream(outstream); p_freq = new PrintStream(outstream_freq); for(int x =0 ; x<= i;x++) { f = new File("target/"+prefix+"temp_tagged_"+x); in = new FileReader(f); input_reader = new BufferedReader(in); while((line =input_reader.readLine())!=null) { p.println(line); } f.delete(); input_reader.close(); in.close(); } lexemeFreqInGlosses = new TreeMap<String, Integer>(); for(int x =0 ; x<= i;x++) { f = new File("target/"+prefix+"temp_freq_"+x); in = new FileReader(f); input_reader = new BufferedReader(in); while((line =input_reader.readLine())!=null) { String lexeme = line.split("\t")[0]; int frequency = Integer.parseInt(line.split("\t")[1]); if(!lexemeFreqInGlosses.containsKey(lexeme)) { lexemeFreqInGlosses.put(lexeme, 0); } int freq = lexemeFreqInGlosses.get(lexeme); lexemeFreqInGlosses.put(lexeme, frequency+freq); } f.delete(); in.close(); } p.close(); for(String lexeme : lexemeFreqInGlosses.keySet()) { p_freq.println(lexeme+"\t"+lexemeFreqInGlosses.get(lexeme)); } p_freq.close(); } catch(Exception e) { e.printStackTrace(); } } /** * This method lemmatizes and POS-tags the given gloss files. This is the method which processes a given chunk * * /** * This method lemmatizes and POS-tags the given gloss files * * * * * */ public void lemmatizePOStagGlossFile(String input, String output1,String output2, int prefix, String lang) { if(lexemeFreqInGlosses == null) { lexemeFreqInGlosses = new TreeMap<String, Integer>(); } int i = 0; FileOutputStream outstream; FileOutputStream outstream_freq; PrintStream p; PrintStream p_freq; try { outstream = new FileOutputStream("target/"+output1); outstream_freq = new FileOutputStream("target/"+output2); // Connect print stream to the output stream p = new PrintStream( outstream ); p_freq = new PrintStream(outstream_freq); FileReader in = new FileReader("target/"+input); BufferedReader input_reader = new BufferedReader(in); String line; StringBuilder sb = new StringBuilder(); while((line =input_reader.readLine())!=null) { sb.append(line.replace("\t","TABULATOR ")+" ENDOFLINE "); //System.out.println("lines appended "+i++); } input_reader.close(); String[] result = null; if(lang.equals(ELanguageIdentifier.ENGLISH)) { Toolkit.initializePOS(); result = Toolkit.lemmatizeEnglish(sb.toString()); } else if(lang.equals(ELanguageIdentifier.GERMAN)) { Toolkit.initializePOSGerman(); result = Toolkit.lemmatizeGerman(sb.toString()); } String resultline=""; for(String s : result) { resultline+=s+" "; if(!s.contains("TABULATOR") && !s.contains("ENDOFLINE")) { if(!lexemeFreqInGlosses.containsKey(s)) { lexemeFreqInGlosses.put(s, 0); } int freq = lexemeFreqInGlosses.get(s); lexemeFreqInGlosses.put(s, freq+1); } //System.out.println(s); } resultline = resultline.replaceAll("tabulator#\\S*\\s", "\t"); resultline = resultline.replaceAll("endofline#\\S*\\s", Global.LF); resultline = resultline.replaceAll("TABULATOR#\\S*\\s", "\t"); resultline = resultline.replaceAll("ENDOFLINE#\\S*\\s", Global.LF); p.print(resultline); System.out.println("resultline "+resultline); p.flush(); p.close(); for(String lexeme : lexemeFreqInGlosses.keySet()) { p_freq.println(lexeme+"\t"+lexemeFreqInGlosses.get(lexeme)); } p_freq.close(); } catch(Exception e) { e.printStackTrace(); } } /** * This method fills the index tables which are required for downstream processing * * */ public void fillIndexTables() throws ClassNotFoundException, SQLException, IOException { String prefix_string = Global.prefixTable.get(prefix); Statement statement = connection.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY, java.sql.ResultSet.CONCUR_READ_ONLY); statement.setFetchSize(Integer.MIN_VALUE); int count =0; if(lemmaIdWrittenForm == null || lemmaIdWrittenForm.size()==0) { ResultSet rs = statement.executeQuery("select distinct LexicalEntry.lemmaId,writtenForm from FormRepresentation_Lemma join LexicalEntry where LexicalEntry.lexicalEntryId like '"+prefix_string+"%' and LexicalEntry.lemmaId = FormRepresentation_Lemma.lemmaId"); while(rs.next()) { String lemmaId = rs.getString(1); String writtenForm = rs.getString(2); lemmaIdWrittenForm.put(lemmaId,writtenForm); } rs.close(); } if(lemmaFreqInGlosses.size()==0) { lemmaFreqInGlosses = new TreeMap<String, Integer>(); FileReader in = new FileReader("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_lemma_frequencies.txt"); BufferedReader input_reader = new BufferedReader(in); String line; while((line =input_reader.readLine())!=null) { String lemma = line.split("\t")[0]; int frequency = Integer.parseInt(line.split("\t")[1]); lemmaFreqInGlosses.put(lemma, frequency); } input_reader.close(); } System.out.println("Lemma frequencies filled for "+this.prefix_string); try { if(lexemeFreqInGlosses.size()==0) { lexemeFreqInGlosses = new TreeMap<String, Integer>(); FileReader in = new FileReader("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_lexeme_frequencies.txt"); BufferedReader input_reader = new BufferedReader(in); String line; while((line =input_reader.readLine())!=null) { String lexeme = line.split("\t")[0]; int frequency = Integer.parseInt(line.split("\t")[1]); lexemeFreqInGlosses.put(lexeme, frequency); } input_reader.close(); } System.out.println("Lexeme frequencies filled for "+this.prefix_string); } catch(FileNotFoundException nfe) { System.err.println("Lexeme frequencies not found, skipped"); } if(senseIdGloss == null || senseIdGloss.size()==0) { senseIdGloss = new TreeMap<String, String>(); FileReader in = new FileReader("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_glosses.txt"); BufferedReader input_reader = new BufferedReader(in); String line; while((line =input_reader.readLine())!=null) { gloss_count++; String id = line.split("\t")[0]; if(line.split("\t").length != 2) { continue; } String gloss = line.split("\t")[1]; senseIdGloss.put(id,gloss); } input_reader.close(); } System.out.println("Glosses filled for "+this.prefix_string); try { if(senseIdGlossPos == null || senseIdGlossPos.size()==0) { senseIdGlossPos = new TreeMap<String, String>(); FileReader in = new FileReader("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_glosses_tagged.txt"); BufferedReader input_reader = new BufferedReader(in); String line; while((line =input_reader.readLine())!=null) { String id = line.split("\t")[0]; if(line.split("\t").length != 2) { continue; } String gloss = line.split("\t")[1]; senseIdGlossPos.put(id,gloss); } input_reader.close(); } System.out.println("Tagged glosses filled for "+this.prefix_string); } catch(FileNotFoundException nfe) { System.err.println("Tagged glosses not found, skipped"); } count = 0; if(lemmaPosSenses == null || lemmaPosSenses.size()==0) { ResultSet rs = statement.executeQuery("select distinct lemmaId,partOfSpeech, "+ (synset ? "synsetId" : "senseId") +" from LexicalEntry join Sense where LexicalEntry.lexicalEntryId like '"+prefix_string+"%' and LexicalEntry.lexicalEntryId = Sense.lexicalEntryId"); while(rs.next()) { String lemmaId = rs.getString(1); String lemma; lemma = lemmaIdWrittenForm.get(lemmaId); String POS = rs.getString(2); String senseId = rs.getString(3); if(synset) { senseId = prefix+senseId.split("ynset_")[1]; } else { senseId = prefix+senseId.split("ense_")[1]; } if(lemma == null) { continue; } if(count++%1000==0) { System.out.println(count); } String key = ""; if(pos) { if(POS == null ) { key =lemma.toLowerCase()+"#"+"null"; } else { key =lemma.toLowerCase()+"#"+POS.replace("Common", ""); } } else { key =lemma.toLowerCase(); } if(!senseIdLemma.containsKey(senseId)) { senseIdLemma.put(senseId, new HashSet<String>()); } senseIdLemma.get(senseId).add(key); if(!lemmaPosSenses.containsKey(key)) { lemmaPosSenses.put(key, new HashSet<String>()); } lemmaPosSenses.get(key).add(senseId); } // for(String key : lemmaPosSenses.keySet()) // { // System.out.println(key+" "+lemmaPosSenses.get(key).size()); // } rs.close(); statement.close(); System.out.println("Lexeme-sense map filled for "+this.prefix_string); } } /** * This method creates monosemous links based on the POS-tagged glosses and the frequencies of the lexemes * * * @param phi the maximum frequency of a lexeme to be considered */ public void createMonosemousLinks(int phi) throws ClassNotFoundException, SQLException, IOException { StringBuilder sb = new StringBuilder(); int count = 0; int max_id = 0; FileOutputStream outstream; PrintStream p; FileReader in = new FileReader("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_glosses_tagged.txt"); BufferedReader input_reader = new BufferedReader(in); outstream = new FileOutputStream("target/"+prefix_string+"_"+(synset?"synset":"sense")+"_"+(pos ? "Pos":"noPos")+"_monosemousLinks"+"_"+phi+".txt"); p = new PrintStream( outstream ); int edge_count=0; String line; while((line =input_reader.readLine())!=null) { String id1 = line.split("\t")[0]; System.out.println("id1 :" +id1); if((line.split("\t")).length<2) { //empty gloss continue; } String[] lexemes = line.split("\t")[1].split(" "); for(String lexeme:lexemes) { if(lexemeFreqInGlosses == null || lexemeFreqInGlosses.size() ==0) { System.err.println("Index Tables not initialized"); } if(lexemeFreqInGlosses.get(lexeme) == null || lexemeFreqInGlosses.get(lexeme)>phi) //too frequent { continue; } if(pos) { if(lemmaPosSenses.get(lexeme)!= null && lemmaPosSenses.get(lexeme).size()==1) { String id2= lemmaPosSenses.get(lexeme).iterator().next(); System.out.println("id2 :" +id2); count+=2; int id1_num = Integer.parseInt(id1); System.out.println("parsed id1 :" +id1); int id2_num = Integer.parseInt(id2); System.out.println("parsed id2 :" +id2); if(id1_num > max_id) { max_id = id1_num; } if(id2_num > max_id) { max_id = id2_num; } // sb.append("a "+id1_num+" "+id2_num+" 1\n"); // sb.append("a "+id2_num+" "+id1_num+" 1\n"); sb.append("e"+edge_count+++" "+id1_num+" "+id2_num+Global.LF); } } else { String lemma = lexeme.split("#")[0]; if(lemmaPosSenses.get(lemma)!= null && lemmaPosSenses.get(lemma).size()==1) { String id2= lemmaPosSenses.get(lemma).iterator().next(); count+=2; int id1_num = Integer.parseInt(id1); int id2_num = Integer.parseInt(id2); if(id1_num > max_id) { max_id = id1_num; } if(id2_num > max_id) { max_id = id2_num; } sb.append("e"+edge_count+++" "+id1_num+" "+id2_num+Global.LF); //sb.append("a "+id1_num+" "+id2_num+" 1\n"); // sb.append("a "+id2_num+" "+id1_num+" 1\n"); } } } } input_reader.close(); // String header = "p sp "+max_id+" "+count; String header = "graph class=grph.in_memory.InMemoryGrph"; p.println(header); p.print(sb.toString()); p.flush(); p.close(); } }