/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.alignment.framework.candidates;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.HashSet;
import de.tudarmstadt.ukp.alignment.framework.Global;
import de.tudarmstadt.ukp.alignment.framework.graph.OneResourceBuilder;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
/**
 * Utility that writes alignment candidate files for two resources.
 *
 * Every candidate file produced here starts with a header line
 * {@code p aux sp p2p <count>} followed by one {@code q <id1> <id2>} line per
 * candidate pair. All files are written below the relative directory {@code target/}.
 */
public class CandidateExtractor
{
    /** Separator used to join two sense ids into a single lookup key for gold-standard pairs. */
    private static final String PAIR_SEPARATOR = "###";

    /**
     * Initializes the global settings, builds the two resources and fills their index tables.
     * The actual candidate extraction calls are kept as commented-out examples at the end;
     * enable the one that is needed.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException
    {
        /* GLOBAL SETTINGS */
        Global.init();
        final String language = ELanguageIdentifier.ENGLISH;

        // NOTE(review): database name, user and password are hard-coded below;
        // consider moving them to configuration.

        /*RESOURCE 1*/
        boolean synset1 = true;
        boolean usePos1 = true;
        final int prefix1 = Global.WN_Synset_prefix;
        //OneGraphBuilder bg_1 = new OneGraphBuilder("uby_lite_0_4_0","root","fortuna");
        OneResourceBuilder bg_1 = new OneResourceBuilder("uby_release_1_0","root","fortuna", prefix1,language,synset1,usePos1);
        bg_1.fillIndexTables();

        /*RESOURCE 2*/
        boolean synset2 = false;
        boolean usePos2 = true;
        final int prefix2 = Global.WKT_EN_prefix;
        OneResourceBuilder bg_2 = new OneResourceBuilder("uby_release_1_0","root","fortuna", prefix2,language,synset2,usePos2);
        bg_2.fillIndexTables();

        /*Calculate alignment candidates between the two LSRs*/
        /*Index tables must be filled at this point*/
        // Global.processExtRefGoldstandardFileWKTWP(bg_1, bg_2, "target/ijcnlp2011-meyer-dataset.txt", true);
        // createCandidateFileFull(bg_1, bg_2);
        // createCandidateFileGoldStandard(bg_1, bg_2, "target/ijcnlp2011-meyer-dataset_graph.csv",false);
        // createCandidateFileLemmaList(bg_1, bg_2, "target/lemmas.tsv");
    }

    /**
     * Extracts all possible alignment candidates (those with matching lemma and POS) from two
     * resources and writes them to
     * {@code target/<prefix1>_<prefix2>_candidates_<Pos|noPos>.txt}.
     *
     * If {@code gb2.pos} is false, only the lemma part of each key of the first resource
     * (the text before the first {@code #}) is used to look up senses in the second resource.
     *
     * @param gb1 first resource; its index tables must already be filled
     * @param gb2 second resource; its index tables must already be filled
     * @throws IOException if the output file cannot be created or written
     */
    public static void createCandidateFileFull(OneResourceBuilder gb1, OneResourceBuilder gb2) throws ClassNotFoundException, SQLException, IOException
    {
        String fileName = "target/"+gb1.prefix_string+"_"+gb2.prefix_string+"_candidates_"+(gb2.pos ? "Pos": "noPos")+".txt";
        PrintStream p = new PrintStream(new FileOutputStream(fileName));
        try
        {
            StringBuilder sb = new StringBuilder();
            int count = 0;
            for (String lemmaPos : gb1.lemmaPosSenses.keySet())
            {
                count += appendCandidates(gb1, gb2, lemmaPos, null, sb);
            }
            writeCandidates(p, count, sb, fileName);
        }
        finally
        {
            // Close the stream even if candidate generation or writing fails.
            p.close();
        }
    }

    /**
     * Creates alignment candidates (those with matching lemma and POS) restricted to the
     * lemma/POS combinations listed in a file and writes them to
     * {@code target/<prefix1>_<prefix2>_LemmaListCandidates_<Pos|noPos>.txt}.
     *
     * Keys of the first resource are split at {@code #} into lemma and POS. Keys without a POS
     * part cannot match a lemma/POS entry and are skipped.
     *
     * @param gb1 first resource; its index tables must already be filled
     * @param gb2 second resource; its index tables must already be filled
     * @param input path of a tab-separated file with the lemma in the first and the POS in the
     *            second column; blank lines are ignored and a lemma may be listed with several POS
     * @throws IOException if the input cannot be read, contains a malformed line, or the output
     *             file cannot be created or written
     */
    public static void createCandidateFileLemmaList(OneResourceBuilder gb1, OneResourceBuilder gb2,String input) throws ClassNotFoundException, SQLException, IOException
    {
        HashMap<String, HashSet<String>> lemmaPosList = readLemmaPosList(input);

        String fileName = "target/"+gb1.prefix_string+"_"+gb2.prefix_string+"_LemmaListCandidates_"+(gb2.pos ? "Pos": "noPos")+".txt";
        PrintStream p = new PrintStream(new FileOutputStream(fileName));
        try
        {
            StringBuilder sb = new StringBuilder();
            int count = 0;
            for (String lemmaPos : gb1.lemmaPosSenses.keySet())
            {
                String[] parts = lemmaPos.split("#");
                if (parts.length < 2)
                {
                    // No POS part in this key, so it cannot match a lemma/POS pair from the list.
                    continue;
                }
                HashSet<String> requestedPos = lemmaPosList.get(parts[0]);
                if (requestedPos == null || !requestedPos.contains(parts[1]))
                {
                    continue;
                }
                count += appendCandidates(gb1, gb2, lemmaPos, null, sb);
            }
            writeCandidates(p, count, sb, fileName);
        }
        finally
        {
            p.close();
        }
    }

    /**
     * Extracts the possible alignment candidates from a gold standard file and writes them to
     * {@code target/<prefix1>_<prefix2>_GScandidates_<noCheck|Pos|noPos>.txt}.
     *
     * With {@code checkIntegrity} the input is read as tab-separated id pairs and only those
     * pairs that are also lemma/POS candidates of the two resources are written. Without it,
     * each input line is split at a single space and its first two fields are written out
     * directly, separated by a tab. Blank lines are ignored in both modes.
     *
     * @param gb1 first resource; its index tables must already be filled
     * @param gb2 second resource; its index tables must already be filled
     * @param input path of the gold standard file
     * @param checkIntegrity This parameter toggles if the gold standard should be checked for correctness of lemma/POS combinations. If unchecked, the GS is just output in the correct format
     * @throws IOException if the input cannot be read, contains a malformed line, or the output
     *             file cannot be created or written
     */
    public static void createCandidateFileGoldStandard(OneResourceBuilder gb1, OneResourceBuilder gb2,String input, boolean checkIntegrity) throws ClassNotFoundException, SQLException, IOException
    {
        int count = 0;
        HashSet<String> candidates = new HashSet<String>();
        StringBuilder sb = new StringBuilder();

        BufferedReader input_reader = new BufferedReader(new FileReader(input));
        try
        {
            String line;
            while ((line = input_reader.readLine()) != null)
            {
                if (line.trim().isEmpty())
                {
                    continue;
                }
                if (checkIntegrity)
                {
                    String[] ids = splitPair(line, "\t", input);
                    candidates.add(ids[0] + PAIR_SEPARATOR + ids[1]);
                }
                else
                {
                    // NOTE(review): this branch splits at a space and emits a tab, whereas every
                    // other branch reads tabs and emits a space - confirm this is intended.
                    String[] ids = splitPair(line, " ", input);
                    sb.append("q "+ids[0]+"\t"+ids[1]+Global.LF);
                    count++;
                }
            }
        }
        finally
        {
            // Closing the BufferedReader also closes the wrapped FileReader.
            input_reader.close();
        }

        String fileName = "target/"+gb1.prefix_string+"_"+gb2.prefix_string+"_GScandidates_"+(!checkIntegrity? "noCheck":(gb2.pos ? "Pos": "noPos"))+".txt";
        PrintStream p = new PrintStream(new FileOutputStream(fileName));
        try
        {
            if (checkIntegrity)
            {
                for (String lemmaPos : gb1.lemmaPosSenses.keySet())
                {
                    count += appendCandidates(gb1, gb2, lemmaPos, candidates, sb);
                }
            }
            writeCandidates(p, count, sb, fileName);
        }
        finally
        {
            p.close();
        }
    }

    /**
     * Appends one {@code q <id1> <id2>} line for every pair of senses that the two resources
     * list for the given key of the first resource.
     *
     * @param lemmaPos key taken from {@code gb1.lemmaPosSenses}
     * @param allowedPairs if not null, only pairs whose {@code id1###id2} key is contained in
     *            this set are appended
     * @param sb buffer receiving the candidate lines
     * @return number of lines appended
     */
    private static int appendCandidates(OneResourceBuilder gb1, OneResourceBuilder gb2, String lemmaPos, HashSet<String> allowedPairs, StringBuilder sb)
    {
        // Without POS in the second resource, look up by the lemma part of the key only.
        String key2 = gb2.pos ? lemmaPos : lemmaPos.split("#")[0];
        if (gb2.lemmaPosSenses.get(key2) == null)
        {
            return 0;
        }
        int added = 0;
        for (String id1 : gb1.lemmaPosSenses.get(lemmaPos))
        {
            for (String id2 : gb2.lemmaPosSenses.get(key2))
            {
                if (allowedPairs == null || allowedPairs.contains(id1 + PAIR_SEPARATOR + id2))
                {
                    sb.append("q "+id1+" "+id2+Global.LF);
                    added++;
                }
            }
        }
        return added;
    }

    /**
     * Writes the header line with the candidate count followed by the collected candidate lines.
     * PrintStream never throws on write errors, so its error flag is checked explicitly.
     */
    private static void writeCandidates(PrintStream out, int count, StringBuilder candidates, String fileName) throws IOException
    {
        out.println("p aux sp p2p "+count);
        out.print(candidates.toString());
        if (out.checkError())
        {
            throw new IOException("Failed to write candidate file " + fileName);
        }
    }

    /**
     * Reads a tab-separated lemma/POS list into a map from lemma to the set of POS values
     * listed for it. Blank lines are skipped.
     */
    private static HashMap<String, HashSet<String>> readLemmaPosList(String input) throws IOException
    {
        HashMap<String, HashSet<String>> lemmaPosList = new HashMap<String, HashSet<String>>();
        BufferedReader reader = new BufferedReader(new FileReader(input));
        try
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                if (line.trim().isEmpty())
                {
                    continue;
                }
                String[] fields = splitPair(line, "\t", input);
                HashSet<String> posValues = lemmaPosList.get(fields[0]);
                if (posValues == null)
                {
                    posValues = new HashSet<String>();
                    lemmaPosList.put(fields[0], posValues);
                }
                posValues.add(fields[1]);
            }
        }
        finally
        {
            reader.close();
        }
        return lemmaPosList;
    }

    /**
     * Splits a line at the given delimiter and makes sure it has at least two fields.
     *
     * @throws IOException if the line has fewer than two fields
     */
    private static String[] splitPair(String line, String delimiter, String input) throws IOException
    {
        String[] fields = line.split(delimiter);
        if (fields.length < 2)
        {
            throw new IOException("Malformed line in " + input + " (expected at least two fields): " + line);
        }
        return fields;
    }
}