/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform.omegawiki;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.mrd.Equivalent;
import de.tudarmstadt.ukp.omegawiki.api.DefinedMeaning;
import de.tudarmstadt.ukp.omegawiki.api.SynTrans;
import de.tudarmstadt.ukp.omegawiki.exception.OmegaWikiException;
/**
 * An instance of this class attaches {@link Equivalent}s (written forms in
 * languages other than the global language) to already created {@link Sense}s.
 *
 * @author matuschek
 */
public class EquivalentGenerator {

    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(EquivalentGenerator.class.getName());

    /**
     * Longest written form that is stored; longer spellings are cut to this length.
     * NOTE(review): presumably mirrors a column-size limit of the target schema - confirm.
     */
    private static final int MAX_WRITTEN_FORM_LENGTH = 255;

    /**
     * Matches UTF-16 surrogate code units (U+D800 to U+DFFF), which are what
     * characters from the supplementary planes are encoded with. Written forms
     * containing them are skipped, as they cause problems with the XML reader.
     * The neighbouring code points U+D7FF and U+E000 are ordinary BMP characters
     * and are deliberately not part of the range.
     */
    protected static Pattern SUPPLEMENTARY_PLANES = Pattern.compile("[\\uD800-\\uDFFF]");

    private final LexicalEntryGenerator lexicalEntryGenerator;
    private final int globalLanguage;

    /**
     * Creates a generator that works on the senses known to the given
     * {@link LexicalEntryGenerator}.
     *
     * @param lexicalEntryGenerator provides the SenseGenerator whose processed lexemes are updated
     * @param synsetGenerator not used by this class; kept so that existing callers keep compiling
     * @param globalLanguage OmegaWiki language id of the lexicon's own language;
     *        SynTranses in this language are not turned into equivalents
     */
    public EquivalentGenerator(LexicalEntryGenerator lexicalEntryGenerator, SynsetGenerator synsetGenerator, int globalLanguage) {
        this.globalLanguage = globalLanguage;
        this.lexicalEntryGenerator = lexicalEntryGenerator;
    }

    /**
     * Updates the Equivalents of the Senses of all lexemes that the
     * SenseGenerator has processed.
     *
     * @throws OmegaWikiException propagated from the OmegaWiki API
     */
    public void updateEquivalents() throws OmegaWikiException {
        SenseGenerator senseGenerator = lexicalEntryGenerator.getSenseGenerator();
        for (SynTrans lexeme : senseGenerator.getProcessedLexemes()) {
            updateEquivalents(lexeme);
        }
    }

    /**
     * Sets the Equivalents of the Sense that belongs to the given lexeme.
     * <p>
     * Every SynTrans of the lexeme's DefinedMeaning that is in a language other
     * than the global language and has a non-empty spelling yields one
     * Equivalent. Spellings containing surrogate code units are skipped, and
     * spellings longer than {@value #MAX_WRITTEN_FORM_LENGTH} characters are truncated.
     *
     * @param lexeme the lexeme whose Sense is updated
     * @throws OmegaWikiException propagated from the OmegaWiki API
     */
    private void updateEquivalents(SynTrans lexeme) throws OmegaWikiException {
        Sense sense = lexicalEntryGenerator.getSenseGenerator().getSense(lexeme);
        DefinedMeaning dm = lexeme.getDefinedMeaning();
        List<Equivalent> equivalents = new LinkedList<Equivalent>();
        try {
            for (SynTrans st : dm.getSynTranses()) {
                // Check the language first so the spelling of global-language
                // entries is never requested.
                if (st.getSyntrans().getLanguageId() == globalLanguage) {
                    continue;
                }
                String writtenForm = st.getSyntrans().getSpelling();
                if (writtenForm == null || writtenForm.length() == 0) {
                    continue;
                }
                if (SUPPLEMENTARY_PLANES.matcher(writtenForm).find()) {
                    continue;
                }
                // Safe to cut by char index: strings with surrogates were skipped above.
                if (writtenForm.length() > MAX_WRITTEN_FORM_LENGTH) {
                    writtenForm = writtenForm.substring(0, MAX_WRITTEN_FORM_LENGTH);
                }
                Equivalent eq = new Equivalent();
                eq.setLanguageIdentifier(OmegaWikiLMFMap.mapLanguage(st.getSyntrans().getLanguageId()));
                eq.setWrittenForm(writtenForm);
                equivalents.add(eq);
            }
        }
        catch (UnsupportedEncodingException e) {
            // Best effort: report the problem and keep the equivalents gathered so far.
            LOGGER.log(java.util.logging.Level.WARNING,
                    "Unsupported encoding while collecting equivalents; the sense keeps only the equivalents gathered so far",
                    e);
        }
        sense.setEquivalents(equivalents);
    }
}