/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.alignment.framework;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import de.tudarmstadt.ukp.alignment.framework.graph.OneResourceBuilder;
public class Global
{
public static final int WN_Synset_prefix = 10;
public static final int WN_Sense_prefix = 11;
public static final int WKT_EN_prefix = 12;
public static final int OW_EN_Synset_prefix = 13;
public static final int OW_EN_Sense_prefix = 14;
public static final int WP_EN_prefix = 15;
public static final int FN_prefix = 16;
public static final int VN_prefix = 17;
public static final int WKT_DE_prefix = 18;
public static final int WP_DE_prefix = 19;
public static final int OW_DE_Synset_prefix = 20;
public static final int OW_DE_Sense_prefix = 21;
public static final int GN_Synset_prefix = 22;
public static final int GN_Sense_prefix = 23;
public static final int IMS_prefix = 24;
public static final int OntoWKT_DE_Synset_prefix = 25;
public static final int OntoWKT_DE_Sense_prefix = 26;
public static HashMap<Integer, String> prefixTableLong = new HashMap<Integer, String>();
public static HashMap<Integer, String> prefixTable = new HashMap<Integer, String>();
public static final String LF = System.getProperty("line.separator");
public static void init()
{
prefixTableLong.put(Global.GN_Synset_prefix, "GN_Synset_");
prefixTableLong.put(Global.GN_Sense_prefix, "GN_Sense_");
prefixTableLong.put(Global.WN_Synset_prefix, "WN_Synset_");
prefixTableLong.put(Global.WN_Sense_prefix, "WN_Sense_");
prefixTableLong.put(Global.WKT_EN_prefix, "WktEN_sense_");
prefixTableLong.put(Global.WKT_DE_prefix, "WktDN_sense_");
prefixTableLong.put(Global.WP_EN_prefix, "WikiEn_sense_");
prefixTableLong.put(Global.WP_DE_prefix, "WikiDe_sense_");
prefixTableLong.put(Global.OW_EN_Synset_prefix, "OW_en_Synset_");
prefixTableLong.put(Global.OW_EN_Sense_prefix, "OW_en_Sense_");
prefixTableLong.put(Global.OW_DE_Synset_prefix, "OW_de_Synset_");
prefixTableLong.put(Global.OW_DE_Sense_prefix, "OW_de_Sense_");
prefixTableLong.put(Global.FN_prefix, "FN_Sense_");
prefixTableLong.put(Global.VN_prefix, "VN_Sense_");
prefixTableLong.put(Global.IMS_prefix, "IMSLexSubcat_Sense_");
prefixTableLong.put(Global.OntoWKT_DE_Synset_prefix, "OntoWktDE_synset_");
prefixTableLong.put(Global.OntoWKT_DE_Sense_prefix, "OntoWktDE_sense_");
prefixTable.put(Global.GN_Synset_prefix, "GN");
prefixTable.put(Global.GN_Sense_prefix, "GN");
prefixTable.put(Global.WN_Synset_prefix, "WN");
prefixTable.put(Global.WN_Sense_prefix, "WN");
prefixTable.put(Global.WKT_EN_prefix, "WktEn");
prefixTable.put(Global.WKT_DE_prefix, "WktDe");
prefixTable.put(Global.WP_EN_prefix, "WikiEn");
prefixTable.put(Global.WP_DE_prefix, "WikiDe");
prefixTable.put(Global.OW_EN_Synset_prefix, "OW_en");
prefixTable.put(Global.OW_EN_Sense_prefix, "OW_en");
prefixTable.put(Global.OW_DE_Synset_prefix, "OW_de");
prefixTable.put(Global.OW_DE_Sense_prefix, "OW_de");
prefixTable.put(Global.FN_prefix, "FN");
prefixTable.put(Global.VN_prefix, "VN");
prefixTable.put(Global.IMS_prefix, "IMSLex");
prefixTable.put(Global.OntoWKT_DE_Synset_prefix, "OntoWktDE");
prefixTable.put(Global.OntoWKT_DE_Sense_prefix, "OntoWktDE");
}
public static void mergeTwoGraphs(String infile1,String infile2, String outfile ) throws ClassNotFoundException, IOException
{
StringBuilder sb = new StringBuilder();
FileReader in = new FileReader("target/"+infile1);
BufferedReader input = new BufferedReader(in);
FileOutputStream outstream;
PrintStream p;
outstream = new FileOutputStream("target/"+outfile);
p = new PrintStream( outstream );
int maxId = 0;
int size = 0;
String line;
int i = 0;
int edge_count=0;
while((line =input.readLine())!=null)
{
if(line.startsWith("p"))
{
String[] info = line.split(" ");
maxId = Integer.parseInt(info[2]);
size = Integer.parseInt(info[3]);
}
else if(line.startsWith("e"))
{
String[] temp = line.split(" ");
//sb.append(line+Global.LF);
sb.append("e"+edge_count+++" "+temp[1]+" "+temp[2]+Global.LF);
}
if(i++ % 1000 ==0) {
System.out.println("Lines processed "+i);
}
}
input.close();
in = new FileReader("target/"+infile2);
input = new BufferedReader(in);
while((line =input.readLine())!=null)
{
if(line.startsWith("p"))
{
String[] info = line.split(" ");
int max = Integer.parseInt(info[2]);
if(max>maxId) {
maxId = max;
}
size = size+ Integer.parseInt(info[3]);
}
else if (line.startsWith("e"))
{ String[] temp = line.split(" ");
sb.append("e"+edge_count+++" "+temp[1]+" "+temp[2]+Global.LF);
// sb.append(line+Global.LF);
}
if(i++ % 1000 ==0) {
System.out.println("Lines processed "+i);
}
}
// p.println("p sp "+maxId+" "+size);
p.println("graph class=grph.in_memory.InMemoryGrph");
p.print(sb.toString());
input.close();
p.close();
}
public static double overlap(List<String> o1, List<String> o2)
{
Set<Object> set1 = new HashSet<Object>(o1);
Set<Object> set2 = new HashSet<Object>(o2);
set2.retainAll(set1);
return set2.size();
}
/**
* This method maps an alignment file with numerical IDs to either UBY-IDs or the original IDs
*
* @param extRef states whether the original ids should be used
*
* */
public static void mapAlignmentToUby(OneResourceBuilder gb1, OneResourceBuilder gb2, String alignmentfile, boolean extRef)
{
int i = 0;
FileOutputStream outstream;
PrintStream p;
try
{
String alignment_file= alignmentfile;
outstream = new FileOutputStream(alignment_file.replace(".txt", "")+"_"+(extRef? "extRef": "UbyID")+".txt");
p = new PrintStream( outstream );
FileReader in = new FileReader(alignment_file);
BufferedReader input_reader = new BufferedReader(in);
String line;
HashMap<String,String> extRefs1 = new HashMap<String, String>();
HashMap<String,String> extRefs2 = new HashMap<String, String>();
if(extRef)
{
Statement statement= gb1.connection.createStatement();
ResultSet rs = statement.executeQuery("SELECT externalReference, "+(gb1.synset? "synsetId" : "senseId")+" FROM MonolingualExternalRef where "+(gb1.synset? "synsetId" : "senseId")+" like '"+gb1.prefix_string+"%' ");
while(rs.next())
{
extRefs1.put(rs.getString(2),rs.getString(1));
}
statement.close();
statement= gb2.connection.createStatement();
rs = statement.executeQuery("SELECT externalReference, "+(gb2.synset? "synsetId" : "senseId")+" FROM MonolingualExternalRef where "+(gb2.synset? "synsetId" : "senseId")+" like '"+gb2.prefix_string+"%' ");
while(rs.next())
{
extRefs2.put(rs.getString(2),rs.getString(1));
}
statement.close();
rs.close();
}
while((line =input_reader.readLine())!=null)
{
if(line.startsWith("f"))
{
p.println(line);
continue;
}
String[] temp = line.split("\t");
String id1 = temp[0];
String id2 = temp[1];
String conf = temp[2];
String uby_id1 = Global.prefixTableLong.get(Integer.parseInt(id1.substring(0, 2)))+id1.substring(2);
String uby_id2 = Global.prefixTableLong.get(Integer.parseInt(id2.substring(0, 2)))+id2.substring(2);
if(extRef)
{
p.println(extRefs1.get(uby_id1)+"\t"+extRefs2.get(uby_id2)+"\t"+conf);
}
else
{
p.println(uby_id1+"\t"+uby_id2+"\t"+conf);
}
System.out.println("lines processed "+i++);
}
input_reader.close();
in.close();
p.close();
}
catch(Exception e)
{
e.printStackTrace();
}
/*TODO: Conform to newly defined XML standard*/
/*
*
* */
/*TODO: Create actual SenseAxis instances? NO! Use import class in UBY!*/
}
/**
* This method streamlines proprietary alignment gold standard files
*
* @param graph states whether numerical ids for the graph should be created
*
* */
public static void processExtRefGoldstandardFile(OneResourceBuilder gb1, OneResourceBuilder gb2, String alignmentfile, boolean graph)
{
/*TODO Has to be adapted to new standard - this is a propietary solution for now*/
int i = 0;
FileOutputStream outstream;
PrintStream p;
try
{
String alignment_file= alignmentfile;
outstream = new FileOutputStream(alignment_file.replace(".csv", "")+"_"+(graph? "graph": "UbyID")+".csv");
p = new PrintStream( outstream );
FileReader in = new FileReader(alignment_file);
BufferedReader input_reader = new BufferedReader(in);
String line;
HashMap<String,String> extRefs1 = new HashMap<String, String>();
HashMap<String,String> extRefs2 = new HashMap<String, String>();
Statement statement= gb1.connection.createStatement();
ResultSet rs = statement.executeQuery("SELECT externalReference, "+(gb1.synset? "synsetId" : "senseId")+" FROM MonolingualExternalRef where "+(gb1.synset? "synsetId" : "senseId")+" like '"+gb1.prefix_string+"%' ");
while(rs.next())
{
extRefs1.put(rs.getString(1),rs.getString(2));
}
statement.close();
statement= gb2.connection.createStatement();
rs = statement.executeQuery("SELECT externalReference, "+(gb2.synset? "synsetId" : "senseId")+" FROM MonolingualExternalRef where "+(gb2.synset? "synsetId" : "senseId")+" like '"+gb2.prefix_string+"%' ");
while(rs.next())
{
extRefs2.put(rs.getString(1),rs.getString(2));
}
statement.close();
rs.close();
while((line =input_reader.readLine())!=null)
{
if(line.startsWith("f"))
{
p.println(line);
continue;
}
line = line.replace("\"","");
line = line.replace(":","\t");
String[] temp = line.split("\t");
String id1 = temp[0];
if (id1.endsWith("#n"))
{
id1 = "[POS: noun] "+id1.split("#")[0];
}
else if (id1.endsWith("#v"))
{
id1 = "[POS: verb] "+id1.split("#")[0];
}
else if (id1.endsWith("#a"))
{
id1 = "[POS: adjective] "+id1.split("#")[0];
}
else if (id1.endsWith("#r"))
{
id1 = "[POS: adverb] "+id1.split("#")[0];
}
String id2 = temp[1];
String conf = temp[2];
String uby_id1 = extRefs1.get(id1);//Global.prefixTableLong.get(Integer.parseInt(id1.substring(0, 2)))+id1.substring(2);
String uby_id2 = extRefs2.get(id2);//Global.prefixTableLong.get(Integer.parseInt(id2.substring(0, 2)))+id2.substring(2);
if(uby_id1==null || uby_id2 == null) {
continue;
}
if(graph)
{
p.println(gb1.prefix+uby_id1.split(Global.prefixTableLong.get(gb1.prefix))[1] + "\t" +gb2.prefix+uby_id2.split(Global.prefixTableLong.get(gb2.prefix))[1]+"\t"+conf);
}
else
{
p.println(uby_id1+"\t"+uby_id2+"\t"+conf);
}
System.out.println("lines processed "+i++);
}
input_reader.close();
in.close();
p.close();
}
catch(Exception e)
{
e.printStackTrace();
}
//CONTINUE HERE
/*TODO: Conform to newly defined standard*/
/*
*
* */
/*TODO: Create actual SenseAxis instances? NO! Use import class in UBY!*/
}
/**
* This method streamlines proprieatary alignment gold standard files as provided by EW and ChM
*
* @param graph states whether numerical ids for the graph should be created
*
* */
public static void processExtRefGoldstandardFileWKTWP(OneResourceBuilder gb1, OneResourceBuilder gb2, String alignmentfile, boolean graph)
{
/*TODO Has to be adapted to new standard - this is a propietary solution for now*/
int i = 0;
FileOutputStream outstream;
PrintStream p;
try
{
String alignment_file= alignmentfile;
outstream = new FileOutputStream(alignment_file.replace(".txt", "")+"_"+(graph? "graph": "UbyID")+".csv");
p = new PrintStream( outstream );
FileReader in = new FileReader(alignment_file);
BufferedReader input_reader = new BufferedReader(in);
String line;
HashMap<String,String> extRefs1 = new HashMap<String, String>();
HashMap<String,String> extRefs2 = new HashMap<String, String>();
Statement statement= gb1.connection.createStatement();
ResultSet rs = statement.executeQuery("SELECT externalReference, "+(gb1.synset? "synsetId" : "senseId")+" FROM MonolingualExternalRef where "+(gb1.synset? "synsetId" : "senseId")+" like '"+gb1.prefix_string+"%' ");
while(rs.next())
{
extRefs1.put(rs.getString(1),rs.getString(2));
}
statement.close();
statement= gb2.connection.createStatement();
rs = statement.executeQuery("SELECT externalReference, "+(gb2.synset? "synsetId" : "senseId")+" FROM MonolingualExternalRef where "+(gb2.synset? "synsetId" : "senseId")+" like '"+gb2.prefix_string+"%' ");
while(rs.next())
{
extRefs2.put(rs.getString(1),rs.getString(2));
}
statement.close();
rs.close();
while((line =input_reader.readLine())!=null)
{
if(line.startsWith("#") )
{
// p.println(line);
continue;
}
String[] temp = line.split(" ; ");
String id1 = temp[0];
String pos = temp[1];
if (pos.equals("noun"))
{
id1 = "[POS: noun] "+id1;
}
else if (pos.equals("verb"))
{
id1 = "[POS: verb] "+id1;
}
else if (pos.equals("adjective"))
{
id1 = "[POS: adjective] "+id1;
}
else if (pos.equals("#adverb"))
{
id1 = "[POS: adverb] "+id1;
}
String id2 = temp[3];
String conf = temp[4];
System.out.println(id1);
System.out.println(id2);
String uby_id1 = extRefs1.get(id1);//Global.prefixTableLong.get(Integer.parseInt(id1.substring(0, 2)))+id1.substring(2);
String uby_id2 = extRefs2.get(id2);//Global.prefixTableLong.get(Integer.parseInt(id2.substring(0, 2)))+id2.substring(2);
System.out.println(uby_id1);
System.out.println(uby_id2);
if(uby_id1==null || uby_id2 == null) {
continue;
}
if(graph)
{
System.out.println(uby_id2); System.out.println(Global.prefixTableLong.get(gb2.prefix));
p.println(gb1.prefix+uby_id1.split(Global.prefixTableLong.get(gb1.prefix))[1]
+ "\t" +gb2.prefix+uby_id2.split(Global.prefixTableLong.get(gb2.prefix))[1]+"\t"+conf);
}
else
{
p.println(uby_id1+"\t"+uby_id2+"\t"+conf);
}
System.out.println("lines processed "+i++);
}
input_reader.close();
in.close();
p.close();
}
catch(Exception e)
{
e.printStackTrace();
}
//CONTINUE HERE
/*TODO: Conform to newly defined standard*/
/*
*
* */
/*TODO: Create actual SenseAxis instances? NO! Use import class in UBY!*/
}
}