/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform.ontowiktionary;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import de.tudarmstadt.ukp.jwktl.api.IPronunciation;
import de.tudarmstadt.ukp.jwktl.api.IPronunciation.PronunciationType;
import de.tudarmstadt.ukp.jwktl.api.IQuotation;
import de.tudarmstadt.ukp.jwktl.api.IWikiString;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEdition;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryEntry;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryRelation;
import de.tudarmstadt.ukp.jwktl.api.IWiktionarySense;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryTranslation;
import de.tudarmstadt.ukp.jwktl.api.IWiktionaryWordForm;
import de.tudarmstadt.ukp.jwktl.api.PartOfSpeech;
import de.tudarmstadt.ukp.jwktl.api.entry.WikiString;
import de.tudarmstadt.ukp.jwktl.api.util.ILanguage;
import de.tudarmstadt.ukp.jwktl.api.util.IWiktionaryIterator;
import de.tudarmstadt.ukp.jwktl.api.util.TemplateParser;
import de.tudarmstadt.ukp.jwktl.api.util.TemplateParser.EtymologyTemplateHandler;
import de.tudarmstadt.ukp.lmf.model.core.Definition;
import de.tudarmstadt.ukp.lmf.model.core.GlobalInformation;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.core.Statement;
import de.tudarmstadt.ukp.lmf.model.core.TextRepresentation;
import de.tudarmstadt.ukp.lmf.model.enums.EAuxiliary;
import de.tudarmstadt.ukp.lmf.model.enums.ECase;
import de.tudarmstadt.ukp.lmf.model.enums.EContextType;
import de.tudarmstadt.ukp.lmf.model.enums.EDefinitionType;
import de.tudarmstadt.ukp.lmf.model.enums.EDegree;
import de.tudarmstadt.ukp.lmf.model.enums.EExampleType;
import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalGender;
import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalNumber;
import de.tudarmstadt.ukp.lmf.model.enums.ELabelNameSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.enums.EPerson;
import de.tudarmstadt.ukp.lmf.model.enums.ERelTypeMorphology;
import de.tudarmstadt.ukp.lmf.model.enums.ERelTypeSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.EStatementType;
import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticProperty;
import de.tudarmstadt.ukp.lmf.model.enums.ETense;
import de.tudarmstadt.ukp.lmf.model.enums.EVerbFormMood;
import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel;
import de.tudarmstadt.ukp.lmf.model.miscellaneous.ConstraintSet;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.morphology.Lemma;
import de.tudarmstadt.ukp.lmf.model.morphology.RelatedForm;
import de.tudarmstadt.ukp.lmf.model.morphology.WordForm;
import de.tudarmstadt.ukp.lmf.model.mrd.Context;
import de.tudarmstadt.ukp.lmf.model.mrd.Equivalent;
import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis;
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.semantics.SenseExample;
import de.tudarmstadt.ukp.lmf.model.semantics.SenseRelation;
import de.tudarmstadt.ukp.lmf.model.semantics.SynSemCorrespondence;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.model.semantics.SynsetRelation;
import de.tudarmstadt.ukp.lmf.model.syntax.LexemeProperty;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrameSet;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticBehaviour;
import de.tudarmstadt.ukp.lmf.transform.DBConfig;
import de.tudarmstadt.ukp.lmf.transform.LMFDBTransformer;
import de.tudarmstadt.ukp.lmf.transform.StringUtils;
import de.tudarmstadt.ukp.lmf.transform.ontowiktionary.WiktionaryLabelManager.PragmaticLabel;
/**
* Converts OntoWiktionary into UBY-LMF.
*/
public class OntoWiktionaryTransformer extends LMFDBTransformer {
// The embracing lexicon instance (null until createNextLexicon() has run).
protected Lexicon lexicon;
// JWKTL Wiktionary edition whose entries are transformed.
protected final IWiktionaryEdition wkt;
// Language of Wiktionary edition that should be transformed; entries with
// a different word language are skipped.
protected final ILanguage wktLang;
// A string representation (YYYY-MM-DD) of the dump date.
protected final String wktDate;
// JWKTL entry iterator over all entries of the edition.
protected final IWiktionaryIterator<IWiktionaryEntry> entryIterator;
// Number of lexical entries returned so far (used for progress output).
protected int currentEntryNr = 0;
// Handler for Wiktionary's pragmatic labels.
protected WiktionaryLabelManager labelManager;
// Cache of unsaved word forms defined by Wiktionary word form labels.
protected final Map<String, List<WordForm>> wordForms;
// Cache of unsaved subcategorization frames.
protected final SortedMap<String, SubcategorizationFrame> subcatFrames;
// OntoWiktionary data source; queried for the semantic relations of a sense.
protected OntoWiktionary ontoWiktionary;
// Iterator over OntoWiktionary concepts.
// NOTE(review): presumably drives the synset conversion - confirm against its users.
protected Iterator<OntoWiktionaryConcept> synsetIter;
// JWKTL version string embedded in the resource label and in the external
// system name of monolingual external references.
protected final String jwktlVersion;
// DTD version that is set on the created lexical resource.
protected final String dtd_version;
// Running number used to build SenseExample identifiers.
static int exampleIdx = 1;
// NOTE(review): presumably running numbers for subcategorization frame and
// syntactic behaviour identifiers - confirm against their users.
static int subcatFrameIdx = 1;
static int syntacticBehaviourIdx = 1;
/**
 * Creates a transformer that converts the given Wiktionary edition,
 * enriched with OntoWiktionary data, into UBY-LMF.
 *
 * @param dbConfig database configuration of the LMF database (handed to
 *        the superclass constructor)
 * @param ontoWiktionary the OntoWiktionary data source
 * @param wkt the JWKTL Wiktionary edition; an iterator over all of its
 *        entries is opened here
 * @param wktLang language of the entries that should be transformed
 * @param wktDate string representation (YYYY-MM-DD) of the dump date
 * @param dtd DTD version to be set on the lexical resource
 * @throws IOException declared by the signature
 *         (NOTE(review): presumably raised by the superclass constructor - confirm)
 */
public OntoWiktionaryTransformer(final DBConfig dbConfig,
        final OntoWiktionary ontoWiktionary,
        final IWiktionaryEdition wkt, final ILanguage wktLang,
        final String wktDate, final String dtd) throws IOException {
    super(dbConfig);

    // State handed in by the caller.
    this.ontoWiktionary = ontoWiktionary;
    this.wkt = wkt;
    this.wktLang = wktLang;
    this.wktDate = wktDate;
    this.dtd_version = dtd;
    // The version is hard-coded because JWKTL.getVersion() causes a version clash.
    this.jwktlVersion = "1.0.0";

    // State derived or created by the transformer itself.
    this.entryIterator = wkt.getAllEntries();
    this.wordForms = new TreeMap<String, List<WordForm>>();
    this.subcatFrames = new TreeMap<String, SubcategorizationFrame>();
    this.labelManager = WiktionaryLMFMap.createLabelManager();
}
/**
 * Returns the alias of this resource: "OntoWkt" followed by the
 * upper-cased ISO 639-1 code of the transformed language.
 *
 * @return the resource alias
 */
@Override
protected String getResourceAlias() {
    // Locale.ROOT keeps the casing independent of the JVM's default locale
    // (a Turkish default locale, for instance, upper-cases "i" to a dotted "İ").
    return "OntoWkt" + wktLang.getISO639_1().toUpperCase(java.util.Locale.ROOT);
}
/**
 * Creates the UBY-LMF lexical resource for this OntoWiktionary edition,
 * including its global information label, name and DTD version.
 *
 * @return the newly created lexical resource
 */
@Override
protected LexicalResource createLexicalResource() {
    LexicalResource resource = new LexicalResource();

    GlobalInformation glInformation = new GlobalInformation();
    // NOTE(review): the dump date in the label is hard-coded and does not
    // reflect wktDate - confirm that this is intended.
    glInformation.setLabel("OntoWiktionary " + wktLang.getName()
            + " edition, dump of 2013/02, JWKTL "
            + jwktlVersion);
    resource.setGlobalInformation(glInformation);

    // Locale.ROOT keeps the casing independent of the JVM's default locale
    // (a Turkish default locale, for instance, upper-cases "i" to a dotted "İ").
    resource.setName("OntoWiktionary"
            + wktLang.getISO639_1().toUpperCase(java.util.Locale.ROOT));
    resource.setDtdVersion(dtd_version);
    return resource;
}
/**
 * Creates the single lexicon of this resource on the first call.
 *
 * @return the newly created lexicon, or {@code null} if the lexicon has
 *         already been created by an earlier call
 */
@Override
protected Lexicon createNextLexicon() {
    // There is exactly one lexicon per transformed Wiktionary edition.
    if (lexicon != null) {
        return null;
    }

    lexicon = new Lexicon();
    String lmfLang = WiktionaryLMFMap.mapLanguage(wktLang);
    lexicon.setId(getLmfId(Lexicon.class, "lexiconWkt" + lmfLang));
    lexicon.setLanguageIdentifier(lmfLang);
    // Locale.ROOT keeps the casing independent of the JVM's default locale
    // (a Turkish default locale, for instance, upper-cases "i" to a dotted "İ").
    lexicon.setName("OntoWiktionary"
            + wktLang.getISO639_1().toUpperCase(java.util.Locale.ROOT));
    return lexicon;
}
/**
 * Converts the next Wiktionary entry of the target language into a
 * UBY-LMF lexical entry consisting of lemma, senses, related forms and
 * word forms. Entries of other languages are skipped.
 *
 * Once no further entry of the target language is available, the semantic
 * relations are converted and {@code null} is returned.
 *
 * @return the converted lexical entry, or {@code null} if all entries
 *         have been processed
 */
@Override
protected LexicalEntry getNextLexicalEntry() {
    // Advance to the next entry of the target language. The entry is only
    // accepted if its language matches; running out of entries while
    // skipping foreign-language entries must not leave a foreign entry
    // behind to be converted.
    IWiktionaryEntry wktEntry = null;
    while (entryIterator.hasNext()) {
        IWiktionaryEntry candidate = entryIterator.next();
        if (wktLang.equals(candidate.getWordLanguage())) {
            wktEntry = candidate;
            break;
        }
    }

    // If we're finished, convert the semantic relations and free resources.
    if (wktEntry == null) {
        System.out.println("PROCESS SENSE RELATIONS");
        convertSemanticRelations();
        return null;
    }

    if (currentEntryNr % 1000 == 0) {
        System.out.println("PROCESSED " + currentEntryNr + " ENTRIES");
    }

    // Lexical entry.
    LexicalEntry entry = new LexicalEntry();
    entry.setId(getLmfId(LexicalEntry.class, getEntryId(wktEntry)));
    EPartOfSpeech pos = WiktionaryLMFMap.mapPos(wktEntry);
    entry.setPartOfSpeech(pos);

    // Lemma
    String word = convert(wktEntry.getWord(), 1000);
    Lemma lemma = new Lemma();
    lemma.setFormRepresentations(createFormRepresentationList(word, wktEntry.getWordLanguage()));
    entry.setLemma(lemma);

    // Senses (only those accepted by considerSense).
    List<Sense> senses = new ArrayList<Sense>();
    for (IWiktionarySense wktSense : wktEntry.getSenses()) {
        if (considerSense(wktSense)) {
            senses.add(wktSenseToLMFSense(wktSense, wktEntry, entry));
        }
    }
    entry.setSenses(senses);

    // Related forms; relations without a morphological mapping are skipped.
    List<IWiktionaryRelation> relations = wktEntry.getRelations();
    if (relations != null) {
        List<RelatedForm> relatedForms = new ArrayList<RelatedForm>();
        for (IWiktionaryRelation relation : relations) {
            ERelTypeMorphology relType = WiktionaryLMFMap.mapMorphologicalRelation(relation.getRelationType());
            if (relType == null) {
                continue;
            }

            RelatedForm form = new RelatedForm();
            form.setRelType(relType);
            form.setFormRepresentations(createFormRepresentationList(relation.getTarget(), wktEntry.getWordLanguage()));
            relatedForms.add(form);
        }
        entry.setRelatedForms(relatedForms);
    }

    // Word forms.
    convertWordForms(wktEntry, entry);

    currentEntryNr++;
    return entry;
}
// Pronunciation notes that could not be interpreted. Within convertWordForms
// it is only referenced by commented-out diagnostic code.
protected Set<String> unknownPronunciationNotes;
/**
 * Converts the inflected forms and the IPA pronunciations of a Wiktionary
 * entry into UBY-LMF word forms and attaches them to the lexical entry.
 *
 * A lemma word form is created for noun-like, verb and adjective/adverb
 * entries. Every inflected form and every pronunciation is merged into an
 * already collected word form with the same grammatical features, or it
 * becomes a new word form. The word forms are only set on the entry if at
 * least one has been collected.
 *
 * @param wktEntry the Wiktionary entry providing word forms and pronunciations
 * @param entry the lexical entry that receives the word forms
 */
protected void convertWordForms(final IWiktionaryEntry wktEntry,
        final LexicalEntry entry) {
    // Determine the coarse word class from the entry's part of speech tags.
    boolean isNoun = false;
    boolean isVerb = false;
    boolean isAdjAd = false;
    for (PartOfSpeech pos : wktEntry.getPartsOfSpeech()) {
        if (isNounLikePos(pos)) {
            isNoun = true;
        } else if (pos == PartOfSpeech.VERB || pos == PartOfSpeech.AUXILIARY_VERB) {
            isVerb = true;
        } else if (pos == PartOfSpeech.ADJECTIVE || pos == PartOfSpeech.ADVERB) {
            isAdjAd = true;
        }
    }

    // Word forms collected for this entry (not the class-level cache).
    List<WordForm> entryWordForms = new ArrayList<WordForm>();

    // Add lemma form (for inflectable word forms).
    WordForm lemmaForm = new WordForm();
    lemmaForm.setFormRepresentations(createFormRepresentationList(
            wktEntry.getWord(), wktEntry.getWordLanguage()));
    if (isNoun) {
        lemmaForm.setCase(ECase.nominative);
        lemmaForm.setGrammaticalNumber(EGrammaticalNumber.singular);
    } else if (isVerb) {
        lemmaForm.setVerbFormMood(EVerbFormMood.infinitive);
    } else if (isAdjAd) {
        lemmaForm.setDegree(EDegree.positive);
    } else {
        lemmaForm = null;
    }
    if (lemmaForm != null) {
        entryWordForms.add(lemmaForm);
    }

    // Add inflected word forms.
    List<IWiktionaryWordForm> wktWordForms = wktEntry.getWordForms();
    if (wktWordForms != null) {
        for (IWiktionaryWordForm wktWordForm : wktWordForms) {
            String writtenForm = convert(wktWordForm.getWordForm(), 255);
            if (writtenForm == null || writtenForm.isEmpty()) {
                continue;
            }
            // Skip word forms that still contain an opening square bracket.
            if (writtenForm.contains("[")) {
                continue;
            }

            // The grammatical gender is not mapped here; it is only derived
            // from the determiner of noun forms further below.
            WordForm newWordForm = new WordForm();
            newWordForm.setCase(WiktionaryLMFMap.mapCase(wktWordForm));
            newWordForm.setDegree(WiktionaryLMFMap.mapDegree(wktWordForm));
            newWordForm.setPerson(WiktionaryLMFMap.mapPerson(wktWordForm));
            newWordForm.setGrammaticalNumber(WiktionaryLMFMap.mapGrammaticalNumber(wktWordForm));
            newWordForm.setVerbFormMood(WiktionaryLMFMap.mapVerbFormMood(wktWordForm));
            newWordForm.setTense(WiktionaryLMFMap.mapTense(wktWordForm));
            if (newWordForm.getVerbFormMood() == null && isVerb) {
                newWordForm.setVerbFormMood(EVerbFormMood.indicative);
            }
            if (newWordForm.getVerbFormMood() == EVerbFormMood.subjunctive) {
                newWordForm.setTense(null);
            }

            // Reuse a similar word form if one exists.
            WordForm wordForm = findCompatibleWordForm(entryWordForms, newWordForm);
            if (wordForm == null) {
                wordForm = newWordForm;
                wordForm.setFormRepresentations(new ArrayList<FormRepresentation>());
                entryWordForms.add(wordForm);
            }

            // If this is a noun, remove the determiner and identify the gender.
            if (isNoun) {
                int idx = writtenForm.indexOf(' ');
                if (idx >= 0) {
                    String determiner = writtenForm.substring(0, idx);
                    EGrammaticalGender gender = null;
                    boolean knownDeterminer = true;
                    if ("der".equals(determiner) || "(der)".equals(determiner)) {
                        gender = EGrammaticalGender.masculine;
                    } else if ("die".equals(determiner) || "(die)".equals(determiner)) {
                        gender = EGrammaticalGender.feminine;
                    } else if ("das".equals(determiner) || "(das)".equals(determiner)) {
                        gender = EGrammaticalGender.neuter;
                    } else if (!"des".equals(determiner) && !"(des)".equals(determiner)
                            && !"dem".equals(determiner) && !"(dem)".equals(determiner)
                            && !"den".equals(determiner) && !"(den)".equals(determiner)) {
                        // The first token is not a determiner; keep the form as it is.
                        knownDeterminer = false;
                    }

                    if (knownDeterminer) {
                        writtenForm = writtenForm.substring(idx + 1);
                        // The gender is only recorded on the lemma form.
                        if (wordForm == lemmaForm && gender != null) {
                            wordForm.setGrammaticalGender(gender);
                        }
                    }
                }
            }

            // Add a new form representation if the written form does not yet exist.
            boolean found = false;
            for (FormRepresentation fp : wordForm.getFormRepresentations()) {
                if (fp.getWrittenForm().equals(writtenForm)) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                FormRepresentation fp = new FormRepresentation();
                fp.setWrittenForm(writtenForm);
                fp.setLanguageIdentifier(lexicon.getLanguageIdentifier());
                wordForm.getFormRepresentations().add(fp);
            }
        }
    }

    // Add phonetic forms.
    List<IPronunciation> pronunciations = wktEntry.getPronunciations();
    if (pronunciations != null) {
        for (IPronunciation pronunciation : pronunciations) {
            // Only save IPA pronunciations.
            if (pronunciation.getType() != PronunciationType.IPA) {
                continue;
            }

            // Don't save empty pronunciations.
            String phoneticForm = pronunciation.getText();
            if (phoneticForm == null || phoneticForm.isEmpty()
                    || "...".equals(phoneticForm) || "…".equals(phoneticForm)) {
                continue;
            }

            // Strip enclosing [...] and /.../ delimiters.
            if (phoneticForm.startsWith("[")) {
                phoneticForm = phoneticForm.substring(1);
                int idx = phoneticForm.indexOf(']');
                if (idx >= 0) {
                    phoneticForm = phoneticForm.substring(0, idx);
                }
            }
            if (phoneticForm.startsWith("/")) {
                phoneticForm = phoneticForm.substring(1);
                int idx = phoneticForm.indexOf('/');
                if (idx >= 0) {
                    phoneticForm = phoneticForm.substring(0, idx);
                }
            }

            // Don't save pronunciations containing a dash or a wiki link.
            if (phoneticForm.contains("–") || phoneticForm.contains("|")
                    || phoneticForm.contains("[") || phoneticForm.contains("]")) {
                continue;
            }

            // Start from the grammatical features of the lemma form (if any).
            WordForm newWordForm = new WordForm();
            if (lemmaForm != null) {
                newWordForm.setCase(lemmaForm.getCase());
                newWordForm.setDegree(lemmaForm.getDegree());
                newWordForm.setPerson(lemmaForm.getPerson());
                newWordForm.setGrammaticalGender(lemmaForm.getGrammaticalGender());
                newWordForm.setGrammaticalNumber(lemmaForm.getGrammaticalNumber());
                newWordForm.setVerbFormMood(lemmaForm.getVerbFormMood());
                newWordForm.setTense(lemmaForm.getTense());
            }

            // Interpret the pronunciation note (grammatical or regional).
            String note = pronunciation.getNote();
            String geographicalVariant = null;
            if (note != null && !note.isEmpty()) {
                geographicalVariant = applyPronunciationNote(note, newWordForm);
            }

            // Reuse a similar word form if one exists.
            WordForm wordForm = findCompatibleWordForm(entryWordForms, newWordForm);
            if (wordForm == null) {
                wordForm = newWordForm;
                wordForm.setFormRepresentations(new ArrayList<FormRepresentation>());
                entryWordForms.add(wordForm);
            }

            List<FormRepresentation> fps = wordForm.getFormRepresentations();
            if (fps.size() == 1 && fps.get(0).getPhoneticForm() == null) {
                // A single representation without pronunciation: complete it.
                FormRepresentation fp = fps.get(0);
                fp.setPhoneticForm(phoneticForm);
                fp.setGeographicalVariant(geographicalVariant);
            } else {
                // Otherwise add a further representation. It carries the
                // written form only if the existing representations do not
                // disagree on it.
                String writtenForm = null;
                for (FormRepresentation fp : fps) {
                    if (writtenForm == null) {
                        writtenForm = fp.getWrittenForm();
                    } else if (fp.getWrittenForm() != null
                            && !fp.getWrittenForm().equals(writtenForm)) {
                        writtenForm = null;
                        break;
                    }
                }

                FormRepresentation fp = new FormRepresentation();
                fp.setWrittenForm(writtenForm);
                fp.setPhoneticForm(phoneticForm);
                fp.setGeographicalVariant(geographicalVariant);
                fp.setLanguageIdentifier(lexicon.getLanguageIdentifier());
                fps.add(fp);
            }
        }
    }

    if (!entryWordForms.isEmpty()) {
        entry.setWordForms(entryWordForms);
    }
}

/** Returns true if the part of speech is treated like a noun for inflection purposes. */
private static boolean isNounLikePos(final PartOfSpeech pos) {
    return pos == PartOfSpeech.NOUN || pos == PartOfSpeech.PROPER_NOUN
            || pos == PartOfSpeech.FIRST_NAME || pos == PartOfSpeech.LAST_NAME
            || pos == PartOfSpeech.TOPONYM
            || pos == PartOfSpeech.SINGULARE_TANTUM
            || pos == PartOfSpeech.PLURALE_TANTUM
            || pos == PartOfSpeech.PRONOUN
            || pos == PartOfSpeech.PERSONAL_PRONOUN
            || pos == PartOfSpeech.REFLEXIVE_PRONOUN
            || pos == PartOfSpeech.DEMONSTRATIVE_PRONOUN
            || pos == PartOfSpeech.INDEFINITE_PRONOUN
            || pos == PartOfSpeech.POSSESSIVE_PRONOUN
            || pos == PartOfSpeech.RELATIVE_PRONOUN
            || pos == PartOfSpeech.INTERROGATIVE_ADVERB
            || pos == PartOfSpeech.INTERROGATIVE_PRONOUN;
}

/**
 * Returns the first collected word form whose grammatical features do not
 * conflict with the probe and whose feature key equals the probe's key,
 * or null if there is none.
 */
private static WordForm findCompatibleWordForm(final List<WordForm> candidates,
        final WordForm probe) {
    for (WordForm candidate : candidates) {
        if (featuresConflict(probe.getCase(), candidate.getCase())
                || featuresConflict(probe.getDegree(), candidate.getDegree())
                || featuresConflict(probe.getPerson(), candidate.getPerson())
                || featuresConflict(probe.getGrammaticalGender(), candidate.getGrammaticalGender())
                || featuresConflict(probe.getGrammaticalNumber(), candidate.getGrammaticalNumber())
                || featuresConflict(probe.getVerbFormMood(), candidate.getVerbFormMood())
                || featuresConflict(probe.getTense(), candidate.getTense())) {
            continue;
        }
        if (wordFormFeatureKey(probe).equals(wordFormFeatureKey(candidate))) {
            return candidate;
        }
    }
    return null;
}

/** Returns true if both feature values are set and differ from each other. */
private static boolean featuresConflict(final Object first, final Object second) {
    return first != null && second != null && first != second;
}

/** Builds the comparison key of a word form (all features except the gender). */
private static String wordFormFeatureKey(final WordForm wordForm) {
    return wordForm.getCase() + " " + wordForm.getDegree()
            + " " + wordForm.getPerson() + " " + wordForm.getGrammaticalNumber()
            + " " + wordForm.getVerbFormMood() + " " + wordForm.getTense();
}

/**
 * Interprets a non-empty pronunciation note. Grammatical notes update the
 * features of the given word form; regional notes yield the geographical
 * variant. Any other note (except "letter name" and "phoneme") clears all
 * grammatical features of the word form.
 *
 * @return the geographical variant, or null if the note is not regional
 */
private static String applyPronunciationNote(final String note, final WordForm wordForm) {
    // Grammatical notes.
    if (note.contains("Sg.")) {
        wordForm.setGrammaticalNumber(EGrammaticalNumber.singular);
        return null;
    }
    if (note.contains("Pl.")) {
        wordForm.setGrammaticalNumber(EGrammaticalNumber.plural);
        return null;
    }
    if ("Gen.".equals(note)) {
        wordForm.setCase(ECase.genitive);
        return null;
    }
    if ("Dat.".equals(note)) {
        wordForm.setCase(ECase.dative);
        return null;
    }
    if ("Akk.".equals(note)) {
        wordForm.setCase(ECase.accusative);
        return null;
    }
    if ("Prät.".equals(note)) {
        wordForm.setPerson(EPerson.first);
        wordForm.setGrammaticalNumber(EGrammaticalNumber.singular);
        wordForm.setTense(ETense.past);
        wordForm.setVerbFormMood(EVerbFormMood.indicative);
        return null;
    }
    if ("Komp.".equals(note)) {
        wordForm.setDegree(EDegree.comparative);
        return null;
    }
    if ("Sup.".equals(note)) {
        wordForm.setDegree(EDegree.superlative);
        return null;
    }
    if ("Part.".equals(note)) { // Partizip II
        wordForm.setVerbFormMood(EVerbFormMood.participle);
        return null;
    }

    // Regional notes.
    if ("UK".equals(note) || "RP".equals(note)
            || note.startsWith("RP ") || "Received Pronunciation".equals(note)
            || note.contains("British") || note.contains("England")
            || note.contains("English") || note.contains("Scotland")
            || note.contains("Scots") || "GB".equals(note)) {
        return "UK";
    }
    if ("US".equals(note) || note.startsWith("US ") || "U.S.".equals(note)
            || note.endsWith(" US") || note.toUpperCase().contains("GENAM")
            || note.equals("GAm")
            || note.contains("Southern US") || note.contains("Northern US")
            || note.contains("New York") || "NYC".equals(note) || "NY".equals(note)
            || note.contains("St. Louis") || "STL".equals(note)) {
        return "US";
    }
    if ("CA".equals(note) || "Canada".equals(note)
            || "Canadian".equals(note) || "CanE".equals(note)
            || "CaE".equals(note)) {
        return "CA";
    }
    if ("AU".equals(note) || "AUSE".equals(note.toUpperCase())
            || "AUSEN".equals(note.toUpperCase())
            || "Australia".equals(note)) {
        return "AU";
    }
    if ("NZ".equals(note) || "New Zealand".equals(note)) {
        return "NZ";
    }
    if ("IE".equals(note) || "Ireland".equals(note) || "Irish".equals(note)) {
        return "IE";
    }
    if ("Deutschland".equals(note)) {
        return "DE";
    }
    if ("Österreich".equals(note) || note.contains("österr.")) {
        return "AT";
    }
    if ("Schweiz".equals(note)) {
        return "CH";
    }
    if (note.contains("South Africa") || "S Africa".equals(note)
            || "SAE".equals(note)) {
        return "RSA";
    }
    if (note.contains("North America") || "Puerto Rican".equals(note)
            || note.contains("American")) {
        // These notes are stored verbatim as the variant.
        return note;
    }

    // Unknown note: save a new empty word form.
    if (!"letter name".equals(note) && !"phoneme".equals(note)) {
        wordForm.setCase(null);
        wordForm.setDegree(null);
        wordForm.setPerson(null);
        wordForm.setGrammaticalGender(null);
        wordForm.setGrammaticalNumber(null);
        wordForm.setVerbFormMood(null);
        wordForm.setTense(null);
    }
    return null;
}
/**
 * Runs a second pass over all Wiktionary entries of the target language
 * and converts the semantic relations of every considered sense whose
 * LMF counterpart can be looked up.
 *
 * After every 500 considered senses the current transaction is committed
 * and a fresh session and transaction are opened. Finally, the semantic
 * relations held by OntoWiktionary are freed.
 */
protected void convertSemanticRelations() {
    int processedSenses = 0;
    for (IWiktionaryEntry candidateEntry : wkt.getAllEntries()) {
        if (wktLang.equals(candidateEntry.getWordLanguage())) {
            for (IWiktionarySense candidateSense : candidateEntry.getSenses()) {
                if (!considerSense(candidateSense)) {
                    continue;
                }

                // Look up the LMF sense that was created for this Wiktionary sense.
                String lmfSenseId = getLmfId(Sense.class, getSenseId(candidateSense.getKey()));
                Sense lmfSense = (Sense) getLmfObjectById(Sense.class, lmfSenseId);
                if (lmfSense != null) {
                    convertSemanticRelations(lmfSense, candidateSense, candidateEntry);
                }

                processedSenses++;
                if (processedSenses % 500 == 0) {
                    System.out.println("SAVING RELATIONS / PROCESSED " + processedSenses + " SENSES");
                    tx.commit();
                    session.close();
                    session = sessionFactory.openSession();
                    tx = session.beginTransaction();
                }
            }
        }
    }
    ontoWiktionary.freeSemanticRelations();
}
/**
 * Converts the semantic relations of one sense and saves them.
 *
 * Relations encoded in Wiktionary are converted first; each of them is
 * matched against the OntoWiktionary relations of the sense (same relation
 * type and target word form) to obtain the target sense. A matched
 * OntoWiktionary relation is removed from the list. The remaining
 * OntoWiktionary relations are saved afterwards with the suffix "-AUTO"
 * appended to their relation name.
 *
 * @param source the LMF sense that receives the relations
 * @param wktSense the Wiktionary sense corresponding to the source
 * @param wktEntry the Wiktionary entry of the sense
 * @throws RuntimeException wrapping an IOException raised while fetching
 *         the OntoWiktionary relations
 */
protected void convertSemanticRelations(final Sense source,
        final IWiktionarySense wktSense,
        final IWiktionaryEntry wktEntry) {
    // Sense relations (SenseRelation class).
    final List<OntoWiktionarySemanticRelation> pendingOwktRelations;
    try {
        pendingOwktRelations = ontoWiktionary.getSemanticRelations(wktSense.getKey());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    final List<SenseRelation> collectedRelations = new ArrayList<SenseRelation>();

    // Relations encoded in Wiktionary.
    final List<IWiktionaryRelation> encodedRelations = wktSense.getRelations();
    if (encodedRelations != null) {
        for (IWiktionaryRelation wktRelation : encodedRelations) {
            boolean usable = wktRelation.getRelationType() != null
                    && !wktRelation.getTarget().isEmpty();
            if (!usable) {
                continue;
            }

            SenseRelation senseRelation = new SenseRelation();
            senseRelation.setRelType(WiktionaryLMFMap.mapRelationType(wktRelation.getRelationType()));
            senseRelation.setRelName(WiktionaryLMFMap.mapRelationName(wktRelation.getRelationType()));
            if (senseRelation.getRelType() == null) {
                continue;
            }

            // Find a suitable relation in OntoWiktionary.
            if (pendingOwktRelations != null) {
                for (Iterator<OntoWiktionarySemanticRelation> it = pendingOwktRelations.iterator(); it.hasNext();) {
                    OntoWiktionarySemanticRelation candidate = it.next();
                    boolean matches = wktRelation.getRelationType().equals(candidate.getRelationType())
                            && wktRelation.getTarget().equals(candidate.getTargetWordForm());
                    if (matches) {
                        // Consume the match so it is not saved again as inferred relation.
                        it.remove();
                        attachOwktTargetSense(senseRelation, candidate);
                        break;
                    }
                }
            }

            // Save target word as targetFormRepresentation.
            senseRelation.setFormRepresentation(buildRelationTargetForm(
                    convert(wktRelation.getTarget(), 255), wktEntry));
            collectedRelations.add(senseRelation);
        }
    }

    // Save inferred relations.
    if (pendingOwktRelations != null) {
        for (OntoWiktionarySemanticRelation inferred : pendingOwktRelations) {
            SenseRelation senseRelation = new SenseRelation();
            senseRelation.setRelType(WiktionaryLMFMap.mapRelationType(inferred.getRelationType()));
            senseRelation.setRelName(WiktionaryLMFMap.mapRelationName(inferred.getRelationType()) + "-AUTO");
            if (senseRelation.getRelType() != null) {
                attachOwktTargetSense(senseRelation, inferred);

                // Save target word as targetFormRepresentation.
                senseRelation.setFormRepresentation(buildRelationTargetForm(
                        convert(inferred.getTargetWordForm(), 255), wktEntry));
                collectedRelations.add(senseRelation);
            }
        }
    }

    source.setSenseRelations(collectedRelations);
    saveList(source, collectedRelations);
}

/**
 * Sets the target sense of the relation if the target sense identifier of
 * the OntoWiktionary relation resolves to a known LMF sense.
 */
private void attachOwktTargetSense(final SenseRelation senseRelation,
        final OntoWiktionarySemanticRelation owktRelation) {
    String targetId = getLmfId(Sense.class, getSenseId(owktRelation.getTargetSenseId()));
    if ("???".equals(targetId)) {
        return;
    }
    Sense target = (Sense) getLmfObjectById(Sense.class, targetId);
    if (target != null) {
        senseRelation.setTarget(target);
    }
}

/**
 * Creates the form representation of a relation target from its written
 * form and the word language of the given entry.
 */
private FormRepresentation buildRelationTargetForm(final String writtenForm,
        final IWiktionaryEntry wktEntry) {
    FormRepresentation targetForm = new FormRepresentation();
    targetForm.setWrittenForm(writtenForm);
    targetForm.setLanguageIdentifier(WiktionaryLMFMap.mapLanguage(wktEntry.getWordLanguage()));
    return targetForm;
}
/**
 * Decides whether a Wiktionary sense is used for the UBY database.
 * Only senses that have a gloss are considered.
 *
 * @param wktSense the Wiktionary sense to check
 * @return true if the sense has a gloss
 */
protected boolean considerSense(final IWiktionarySense wktSense) {
    final boolean hasGloss = (null != wktSense.getGloss());
    return hasGloss;
}
/**
 * Converts a Wiktionary sense into a UBY-LMF sense.
 *
 * The created sense carries a monolingual external reference to the
 * Wiktionary sense key, the gloss as intensional definition (with the
 * entry's etymology as statement, if any), semantic labels, sense
 * examples, quotations (as citation contexts) and translations (as
 * equivalents). Sense relations are not converted here; they are handled
 * in a separate step.
 *
 * @param wktSense the Wiktionary sense to convert; its gloss must not be null
 * @param wktEntry the Wiktionary entry of the sense
 * @param entry the LMF lexical entry that will contain the sense
 * @return the newly created LMF sense
 */
protected Sense wktSenseToLMFSense(IWiktionarySense wktSense, IWiktionaryEntry wktEntry, LexicalEntry entry){
    // Sense and identifier.
    Sense sense = new Sense();
    sense.setId(getLmfId(Sense.class, getSenseId(wktSense)));
    sense.setIndex(wktSense.getIndex());

    // Monolingual external reference.
    MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef();
    monolingualExternalRef.setExternalSystem("Wiktionary_"
            + jwktlVersion + "_" + wktDate + "_"
            + wktLang.getISO639_2T() + "_sense");
    monolingualExternalRef.setExternalReference(wktSense.getKey());
    List<MonolingualExternalRef> monolingualExternalRefs = new LinkedList<MonolingualExternalRef>();
    monolingualExternalRefs.add(monolingualExternalRef);
    sense.setMonolingualExternalRefs(monolingualExternalRefs);

    // Sense Definition (Definition class; type intensionalDefinition).
    List<Definition> definitions = new ArrayList<Definition>();
    Definition definition = new Definition();
    definition.setDefinitionType(EDefinitionType.intensionalDefinition);
    definition.setTextRepresentations(createTextRepresentationList(
            wktSense.getGloss().getPlainText(), wktEntry.getPage().getEntryLanguage()
    ));
    definitions.add(definition);
    sense.setDefinitions(definitions);

    // Semantic Labels.
    List<SemanticLabel> semanticLabels = createSemanticLabels(entry, sense, wktSense);
    if (semanticLabels != null && semanticLabels.size() > 0) {
        sense.setSemanticLabels(semanticLabels);
    }

    // Etymology (Statement class; type etymology). The etymology is
    // converted once and the resulting text is reused for the statement.
    IWikiString etymology = wktEntry.getWordEtymology();
    String etymologyText = convertEtymology(etymology);
    if (etymologyText != null && !etymologyText.isEmpty()) {
        List<Statement> statements = new LinkedList<Statement>();
        Statement statement = new Statement();
        statement.setStatementType(EStatementType.etymology);
        statement.setTextRepresentations(createTextRepresentationList(
                etymologyText, wktEntry.getPage().getEntryLanguage()));
        statements.add(statement);
        definition.setStatements(statements);
    }

    // Sense examples (SenseExample class; type senseInstance).
    List<SenseExample> examples = new ArrayList<SenseExample>();
    if (wktSense.getExamples() != null) {
        for (IWikiString example : wktSense.getExamples()) {
            SenseExample senseExample = new SenseExample();
            senseExample.setId(getResourceAlias() + "_SenseExample_" + (exampleIdx++));
            senseExample.setExampleType(EExampleType.senseInstance);
            senseExample.setTextRepresentations(createTextRepresentationList(
                    example.getPlainText(), wktEntry.getWordLanguage()
            ));
            examples.add(senseExample);
        }
    }
    sense.setSenseExamples(examples);

    // Quotations (Context class; type citation).
    List<Context> contexts = new ArrayList<Context>();
    if (wktSense.getQuotations() != null) {
        for (IQuotation quotation : wktSense.getQuotations()) {
            Context context = new Context();
            context.setContextType(EContextType.citation);

            // Join the lines of the quotation with single spaces.
            StringBuilder quotationText = new StringBuilder();
            for (IWikiString line : quotation.getLines()) {
                quotationText.append(quotationText.length() == 0 ? "" : " ")
                        .append(line.getPlainText());
            }
            context.setTextRepresentations(createTextRepresentationList(
                    quotationText.toString(), wktEntry.getWordLanguage()
            ));

            if (quotation.getSource() != null) {
                // The source is truncated to at most 255 characters.
                String source = quotation.getSource().getPlainText();
                if (source.length() > 255) {
                    source = source.substring(0, 255);
                }
                context.setSource(source);
            }
            contexts.add(context);
        }
    }
    sense.setContexts(contexts);

    // Sense relations (SenseRelation class)
    // -- skipped here (will be done in a separate step).

    // Translations (Equivalent class).
    if (wktSense.getTranslations() != null) {
        List<Equivalent> equivalents = new ArrayList<Equivalent>();
        for (IWiktionaryTranslation trans : wktSense.getTranslations()) {
            String targetForm = convert(trans.getTranslation(), 255);
            if (targetForm == null || targetForm.isEmpty()) {
                continue; // Do not save empty translations.
            }
            String language = WiktionaryLMFMap.mapLanguage(trans.getLanguage());
            if (language == null) {
                continue; // Do not save translations to unknown languages.
            }

            Equivalent equivalent = new Equivalent();
            equivalent.setWrittenForm(targetForm);
            equivalent.setLanguageIdentifier(language);

            String transliteration = trans.getTransliteration();
            if (transliteration != null && !transliteration.isEmpty()) {
                transliteration = convert(transliteration, 255);
                equivalent.setTransliteration(transliteration);
            }

            String additionalInformation = trans.getAdditionalInformation();
            if (additionalInformation != null && !additionalInformation.isEmpty()) {
                // Spell out the gender templates before saving the usage note.
                additionalInformation = additionalInformation.replace("{{m}}", "masculine");
                additionalInformation = additionalInformation.replace("{{f}}", "feminine");
                additionalInformation = additionalInformation.replace("{{n}}", "neuter");
                additionalInformation = convert(additionalInformation, 255);
                equivalent.setUsage(additionalInformation);
            }
            equivalents.add(equivalent);
        }
        sense.setEquivalents(equivalents);
    }

    return sense;
}
/**
 * Creates the semantic labels of a word sense. Labels are derived from the
 * parts of speech of the sense's Wiktionary entry and from the labels that
 * the label manager parses out of the sense definition. Labels of the
 * groups auxiliary, syntactic property and subcategorization do not yield
 * semantic labels; they are converted into subcategorization frames
 * (collected in {@code subcatFrames}) and syntactic behaviours, which are
 * added to the given lexical entry.
 *
 * @param entry the lexical entry that receives the syntactic behaviours
 * @param sense the sense the syntactic behaviours are linked to
 * @param wktSense the Wiktionary sense to read parts of speech and
 *        sense definition from
 * @return the list of semantic labels created (may be empty)
 */
protected List<SemanticLabel> createSemanticLabels(
        final LexicalEntry entry, final Sense sense,
        final IWiktionarySense wktSense) {
    final List<SemanticLabel> semanticLabels = new ArrayList<SemanticLabel>();

    // Step 1: semantic labels implied by the part of speech tags.
    for (PartOfSpeech partOfSpeech : wktSense.getEntry().getPartsOfSpeech()) {
        if (partOfSpeech == null) {
            continue;
        }

        String posLabelName;
        ELabelTypeSemantics posLabelType;
        switch (partOfSpeech) {
            case TOPONYM:
                posLabelName = ELabelNameSemantics.SEMANTIC_NOUN_CLASS_TOPONYM;
                posLabelType = ELabelTypeSemantics.semanticNounClass;
                break;
            case SINGULARE_TANTUM:
                posLabelName = ELabelNameSemantics.SEMANTIC_NOUN_CLASS_ONLY_SINGULAR;
                posLabelType = ELabelTypeSemantics.semanticNounClass;
                break;
            case PLURALE_TANTUM:
                posLabelName = ELabelNameSemantics.SEMANTIC_NOUN_CLASS_ONLY_PLURAL;
                posLabelType = ELabelTypeSemantics.semanticNounClass;
                break;
            case SALUTATION:
                posLabelName = ELabelNameSemantics.INTERJECTION_SALUTATION;
                posLabelType = ELabelTypeSemantics.interjectionClass;
                break;
            case ONOMATOPOEIA:
                posLabelName = ELabelNameSemantics.INTERJECTION_ONOMATOPOEIA;
                posLabelType = ELabelTypeSemantics.interjectionClass;
                break;
            case IDIOM:
                posLabelName = ELabelNameSemantics.PHRASEME_CLASS_IDIOM;
                posLabelType = ELabelTypeSemantics.phrasemeClass;
                break;
            case COLLOCATION:
                posLabelName = ELabelNameSemantics.PHRASEME_CLASS_COLLOCATION;
                posLabelType = ELabelTypeSemantics.phrasemeClass;
                break;
            case PROVERB:
                posLabelName = ELabelNameSemantics.PHRASEME_CLASS_PROVERB;
                posLabelType = ELabelTypeSemantics.phrasemeClass;
                break;
            case MNEMONIC:
                posLabelName = ELabelNameSemantics.PHRASEME_CLASS_MNEMONIC;
                posLabelType = ELabelTypeSemantics.phrasemeClass;
                break;
            case MODAL_PARTICLE:
                posLabelName = ELabelNameSemantics.DISCOURSE_FUNCTION_MODAL_PARTICLE;
                posLabelType = ELabelTypeSemantics.discourseFunction;
                break;
            case FOCUS_PARTICLE:
                posLabelName = ELabelNameSemantics.DISCOURSE_FUNCTION_FOCUS_PARTICLE;
                posLabelType = ELabelTypeSemantics.discourseFunction;
                break;
            case INTENSIFYING_PARTICLE:
                posLabelName = ELabelNameSemantics.DISCOURSE_FUNCTION_INTENSIFYING_PARTICLE;
                posLabelType = ELabelTypeSemantics.discourseFunction;
                break;
            default:
                // All other parts of speech carry no semantic label.
                continue;
        }

        SemanticLabel posLabel = new SemanticLabel();
        posLabel.setType(posLabelType);
        posLabel.setLabel(posLabelName);
        semanticLabels.add(posLabel);
    }

    // Step 2: labels encoded in the sense definition.
    final List<PragmaticLabel> parsedLabels = labelManager.parseLabels(
            wktSense.getGloss().getText(), wktSense.getEntry().getWord());
    if (parsedLabels == null) {
        return semanticLabels;
    }

    EAuxiliary auxiliary = null;
    ESyntacticProperty syntacticProperty = null;
    final List<String> subcatLabels = new LinkedList<String>();
    for (PragmaticLabel parsedLabel : parsedLabels) {
        final String group = parsedLabel.getLabelGroup();
        if (group == null || group.isEmpty()) {
            continue;
        }

        if (labelManager.getWordFormLabel(parsedLabel) != null) {
            // Word form label. The conversion of such labels into WordForm
            // objects is disabled, so the label is skipped without effect.
            continue;
        }

        if (group.equals("syntax:gram:auxiliary")) {
            // LexemeProperty:auxiliary.
            // TODO: Add an enum value for "habenSein"; it is ignored for now.
            if (!"habenSein".equals(parsedLabel.getStandardizedLabel())) {
                auxiliary = EAuxiliary.valueOf(parsedLabel.getStandardizedLabel());
            }
        } else if (group.equals("syntax:gram:synprop")) {
            // LexemeProperty:syntacticProperty.
            syntacticProperty = ESyntacticProperty.valueOf(parsedLabel.getStandardizedLabel());
        } else if (group.equals("syntax:gram:subcat")) {
            // SubcategorizationFrame.
            subcatLabels.add(parsedLabel.getStandardizedLabel());
        } else if (group.equals("syntax:gram:nounClass")) {
            // SemanticLabel:semanticNounClass.
            SemanticLabel nounClassLabel = new SemanticLabel();
            nounClassLabel.setLabel(StringUtils.replaceNonUtf8(parsedLabel.getStandardizedLabel()));
            nounClassLabel.setType(ELabelTypeSemantics.semanticNounClass);
            semanticLabels.add(nounClassLabel);
        } else if (group.equals("syntax:gram:usage")) {
            // SemanticLabel:usage.
            SemanticLabel usageLabel = new SemanticLabel();
            usageLabel.setLabel(StringUtils.replaceNonUtf8(parsedLabel.getStandardizedLabel()));
            usageLabel.setType(ELabelTypeSemantics.usage);
            semanticLabels.add(usageLabel);
        } else {
            // Remaining groups: the label type follows from the group prefix.
            SemanticLabel groupLabel = new SemanticLabel();
            groupLabel.setLabel(StringUtils.replaceNonUtf8(parsedLabel.getLabel()));

            final ELabelTypeSemantics groupType;
            if (group.startsWith("dom")) {
                groupType = ELabelTypeSemantics.domain;
            } else if (group.startsWith("reg") || group.startsWith("dia")) {
                groupType = ELabelTypeSemantics.regionOfUsage;
            } else if (group.startsWith("phas") || group.startsWith("strat")
                    || group.startsWith("eval")) {
                groupType = ELabelTypeSemantics.register;
            } else if (group.startsWith("temp")) {
                groupType = ELabelTypeSemantics.timePeriodOfUsage;
            } else if (group.startsWith("freq") || group.startsWith("norm")) {
                groupType = ELabelTypeSemantics.usage;
            } else {
                // TODO: Additional labels: etym request syntax:form syntax:pos syntax:gram
                continue;
            }
            groupLabel.setType(groupType);
            semanticLabels.add(groupLabel); //TODO: standardize
        }
    }

    // Step 3: subcategorization frames and syntactic behaviours. If a lexeme
    // property was found but no subcategorization label, a frame with an
    // empty label is created so that the property is not lost.
    final String auxiliaryKey =
            (auxiliary == null) ? "" : Integer.toString(auxiliary.ordinal());
    final String syntacticPropertyKey =
            (syntacticProperty == null) ? "" : Integer.toString(syntacticProperty.ordinal());
    final String lexemePropertyKey = auxiliaryKey + "_" + syntacticPropertyKey;
    final boolean hasLexemeProperty = (auxiliary != null || syntacticProperty != null);
    if (subcatLabels.isEmpty() && hasLexemeProperty) {
        subcatLabels.add("");
    }

    for (String subcatLabel : subcatLabels) {
        // Reuse the frame registered for this label/property combination,
        // or register a new one.
        final String frameKey = subcatLabel + ":" + lexemePropertyKey;
        SubcategorizationFrame frame = subcatFrames.get(frameKey);
        if (frame == null) {
            LexemeProperty lexemeProperty = new LexemeProperty();
            lexemeProperty.setAuxiliary(auxiliary);
            lexemeProperty.setSyntacticProperty(syntacticProperty);

            frame = new SubcategorizationFrame();
            frame.setId(getResourceAlias() + "_SubcatFrame_" + (subcatFrameIdx++));
            frame.setSubcatLabel(subcatLabel);
            frame.setLexemeProperty(lexemeProperty);
            subcatFrames.put(frameKey, frame);
        }

        // Link sense and frame by means of a new syntactic behaviour.
        SyntacticBehaviour behaviour = new SyntacticBehaviour();
        behaviour.setSubcategorizationFrame(frame);
        behaviour.setId(getResourceAlias() + "_SyntacticBehaviour_" + (syntacticBehaviourIdx++));
        behaviour.setSense(sense);
        if (entry.getSyntacticBehaviours() == null) {
            entry.setSyntacticBehaviours(new LinkedList<SyntacticBehaviour>());
        }
        entry.getSyntacticBehaviours().add(behaviour);
    }
    return semanticLabels;
}
/**
 * Wraps a text into a single-element list of text representations.
 *
 * @param writtenText the text; it is passed through {@code convert}
 *        before being stored
 * @param language the language of the text; it is mapped to a language
 *        identifier by {@code WiktionaryLMFMap.mapLanguage}
 * @return a new list holding exactly one text representation
 */
protected List<TextRepresentation> createTextRepresentationList(
        final String writtenText, final ILanguage language) {
    final TextRepresentation representation = new TextRepresentation();
    representation.setWrittenText(convert(writtenText));
    representation.setLanguageIdentifier(WiktionaryLMFMap.mapLanguage(language));

    final List<TextRepresentation> representations = new ArrayList<TextRepresentation>();
    representations.add(representation);
    return representations;
}
/**
 * Wraps a written form into a single-element list of form representations.
 *
 * @param writtenForm the written form; it is passed through
 *        {@code convert} with a maximum length of 255 before being stored
 * @param language the language of the form; it is mapped to a language
 *        identifier by {@code WiktionaryLMFMap.mapLanguage}
 * @return a new list holding exactly one form representation
 */
protected List<FormRepresentation> createFormRepresentationList(
        final String writtenForm, final ILanguage language) {
    final FormRepresentation representation = new FormRepresentation();
    representation.setWrittenForm(convert(writtenForm, 255));
    representation.setLanguageIdentifier(WiktionaryLMFMap.mapLanguage(language));

    final List<FormRepresentation> representations = new ArrayList<FormRepresentation>();
    representations.add(representation);
    return representations;
}
/**
 * Removes and returns the subcategorization frame stored under the first
 * key of {@code subcatFrames}.
 *
 * @return the removed frame, or {@code null} if no frame is left
 */
@Override
protected SubcategorizationFrame getNextSubcategorizationFrame() {
    if (subcatFrames.isEmpty()) {
        return null;
    }
    return subcatFrames.remove(subcatFrames.firstKey());
}
/**
 * Returns the next synset built from the streamed OntoWiktionary concepts.
 * <p>
 * On the first call the concept iterator is created. Concepts for which no
 * lexicalization can be resolved to an existing {@code Sense} object are
 * skipped. Skipping is done in a loop rather than by recursion, so that a
 * long run of such concepts cannot exhaust the call stack. Once the
 * iterator is exhausted, the synset relations are converted, the concepts
 * are freed and {@code null} is returned.
 *
 * @return the next synset, or {@code null} if no concept is left
 * @throws RuntimeException if the concept iterator cannot be created
 */
@Override
protected Synset getNextSynset() {
    // If we haven't started yet, initialize the iterator.
    if (synsetIter == null) {
        try {
            synsetIter = ontoWiktionary.getStreamedConcepts().iterator();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    while (synsetIter.hasNext()) {
        OntoWiktionaryConcept owktSynset = synsetIter.next();

        // Collect the senses of all lexicalizations that can be resolved.
        List<Sense> senses = new LinkedList<Sense>();
        for (String lexicalization : owktSynset.getLexicalizations()) {
            String senseId = getLmfId(Sense.class, getSenseId(lexicalization));
            Sense sense = (Sense) getLmfObjectById(Sense.class, senseId);
            if (sense == null) {
                // Caused by different sense selection (e.g., inflected forms)
                continue;
            }
            if (sense.getSynset() != null) {
                System.err.println("Inconsistent synset structure for " + lexicalization);
            }
            senses.add(sense);
        }

        // A synset needs at least one sense; otherwise try the next concept.
        if (senses.isEmpty()) {
            continue;
        }

        // Synset.
        Synset synset = new Synset();
        synset.setId(getLmfId(Synset.class, getSynsetId(owktSynset.getConceptId())));

        // MonolingualExternalRef. The language code is upper-cased with the
        // root locale so that the system name does not depend on the default
        // locale of the machine (e.g., the Turkish dotted capital I).
        List<MonolingualExternalRef> monolingualExternalRefs = new LinkedList<MonolingualExternalRef>();
        MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef();
        monolingualExternalRef.setExternalSystem("OntoWiktionary"
                + wktLang.getISO639_1().toUpperCase(java.util.Locale.ROOT)
                + "_ConceptID");
        monolingualExternalRef.setExternalReference(owktSynset.getConceptId());
        monolingualExternalRefs.add(monolingualExternalRef);
        synset.setMonolingualExternalRefs(monolingualExternalRefs);

        // Senses.
        for (Sense sense : senses) {
            sense.setSynset(synset);
        }
        synset.setSenses(senses);
        return synset;
    }

    // We're finished: convert the synset relations and free resources.
    synsetIter = null;
    convertSynsetRelations();
    ontoWiktionary.freeConcepts();
    return null;
}
/**
 * Creates and saves the synset relations of all OntoWiktionary concepts
 * for which a {@code Synset} object can be looked up. For each such synset
 * the "subsumes", "subsumedBy" and "related" relations are collected and
 * the synset is saved. After every 1000 saved synsets the transaction is
 * committed and a fresh session and transaction are opened.
 *
 * @throws RuntimeException if the concept iterator cannot be created
 */
protected void convertSynsetRelations() {
    try {
        synsetIter = ontoWiktionary.getConcepts().iterator();
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    int processedSynsets = 0;
    while (synsetIter.hasNext()) {
        final OntoWiktionaryConcept concept = synsetIter.next();
        final String sourceId = getLmfId(Synset.class, getSynsetId(concept.getConceptId()));
        final Synset source = (Synset) getLmfObjectById(Synset.class, sourceId);

        // Concepts without a synset object are silently ignored.
        if (source != null) {
            // SynsetRelation.
            final List<SynsetRelation> relations = new LinkedList<SynsetRelation>();
            addSynsetRelations(relations, concept.getSubsumesRelations(),
                    ERelTypeSemantics.taxonomic, "subsumes", source);
            addSynsetRelations(relations, concept.getSubsumedByRelations(),
                    ERelTypeSemantics.taxonomic, "subsumedBy", source);
            addSynsetRelations(relations, concept.getRelatedConcepts(),
                    ERelTypeSemantics.association, "related", source);
            source.setSynsetRelations(relations);
            saveCascade(source);

            processedSynsets++;
            if (processedSynsets % 1000 == 0) {
                System.out.println("SAVING RELATIONS / PROCESSED " + processedSynsets + " SYNSETS");
                // Commit the batch and continue with a new session.
                tx.commit();
                session.close();
                session = sessionFactory.openSession();
                tx = session.beginTransaction();
            }
        }
    }
    synsetIter = null;
}
/**
 * Appends one synset relation per resolvable relation target to the given
 * list. Targets for which no {@code Synset} object can be looked up are
 * ignored.
 *
 * @param synsetRelations the list the new relations are appended to
 * @param relationTargets the concept IDs of the relation targets
 * @param relType the relation type to set on each new relation
 * @param relName the relation name to set on each new relation
 * @param source the synset used as the source of each new relation
 */
protected void addSynsetRelations(final List<SynsetRelation> synsetRelations,
        final Iterable<String> relationTargets,
        final ERelTypeSemantics relType, final String relName,
        final Synset source) {
    for (String targetConceptId : relationTargets) {
        final String targetSynsetId = getLmfId(Synset.class, getSynsetId(targetConceptId));
        final Synset target = (Synset) getLmfObjectById(Synset.class, targetSynsetId);
        if (target != null) {
            final SynsetRelation relation = new SynsetRelation();
            relation.setRelType(relType);
            relation.setRelName(relName);
            relation.setSource(source);
            relation.setTarget(target);
            synsetRelations.add(relation);
        }
    }
}
/**
 * This implementation supplies no constraint sets.
 *
 * @return always {@code null}
 */
@Override
protected ConstraintSet getNextConstraintSet() {
    return null;
}
/**
 * This implementation supplies no semantic predicates.
 *
 * @return always {@code null}
 */
@Override
protected SemanticPredicate getNextSemanticPredicate() {
    return null;
}
/**
 * This implementation supplies no sense axes.
 *
 * @return always {@code null}
 */
@Override
protected SenseAxis getNextSenseAxis() {
    return null;
}
/**
 * This implementation supplies no subcategorization frame sets.
 *
 * @return always {@code null}
 */
@Override
protected SubcategorizationFrameSet getNextSubcategorizationFrameSet() {
    return null;
}
/**
 * This implementation supplies no syntax-semantics correspondences.
 *
 * @return always {@code null}
 */
@Override
protected SynSemCorrespondence getNextSynSemCorrespondence() {
    return null;
}
/**
 * Commits pending work and then attaches the word forms held in the
 * {@code wordForms} cache to their lexical entries. Cache entries whose
 * lexical entry cannot be looked up are left out. Progress is printed to
 * standard output.
 */
@Override
protected void finish() {
    commit();

    // Save all unsaved word forms from the cache.
    int remaining = wordForms.size();
    System.out.println("Finishing WORD FORMS... " + remaining);
    for (Entry<String, List<WordForm>> cached : wordForms.entrySet()) {
        if (remaining % 1000 == 0) {
            System.out.println("SAVING WORD FORMS: " + remaining + " LEFT");
        }

        final LexicalEntry lexEntry =
                (LexicalEntry) getLmfObjectById(LexicalEntry.class, cached.getKey());
        if (lexEntry != null) {
            final List<WordForm> cachedForms = cached.getValue();
            if (lexEntry.getWordForms() != null) {
                lexEntry.getWordForms().addAll(cachedForms);
            } else {
                lexEntry.setWordForms(cachedForms);
            }
            // Save word forms and update lexEntry.
            saveList(lexEntry, lexEntry.getWordForms());
        }
        remaining--;
    }
}
/**
 * Builds the entry ID of a Wiktionary entry: the letter "e" followed by
 * the key of the entry.
 *
 * @param entry the Wiktionary entry
 * @return the entry ID
 */
protected String getEntryId(IWiktionaryEntry entry) {
    final String entryPrefix = "e";
    return entryPrefix + entry.getKey();
}
/**
 * Builds the sense ID of a Wiktionary sense by delegating to the
 * key-based variant of this method with the key of the sense.
 *
 * @param sense the Wiktionary sense
 * @return the sense ID
 */
protected String getSenseId(IWiktionarySense sense) {
    final String senseKey = sense.getKey();
    return getSenseId(senseKey);
}
/**
 * Builds a sense ID from a sense key: the letter "s" followed by the key.
 *
 * @param senseKey the key of the sense
 * @return the sense ID
 */
protected String getSenseId(final String senseKey) {
    final String sensePrefix = "s";
    return sensePrefix + senseKey;
}
/**
 * Builds a synset ID from a concept ID: the letter "c" followed by the
 * concept ID.
 *
 * @param conceptId the ID of the concept
 * @return the synset ID
 */
protected String getSynsetId(final String conceptId) {
    final String synsetPrefix = "c";
    return synsetPrefix + conceptId;
}
/**
 * Cleans a text by applying {@code StringUtils.replaceHtmlEntities}
 * followed by {@code StringUtils.replaceNonUtf8}. A {@code null} text is
 * returned as {@code null} without calling the helpers, which matches the
 * behavior of the length-limited overload of this method.
 *
 * @param text the text to clean; may be {@code null}
 * @return the cleaned text, or {@code null} if {@code text} is {@code null}
 */
private static String convert(final String text) {
    if (text == null) {
        return null;
    }
    return StringUtils.replaceNonUtf8(
            StringUtils.replaceHtmlEntities(text));
}
/**
 * Cleans a text by applying {@code StringUtils.replaceHtmlEntities}
 * followed by {@code StringUtils.replaceNonUtf8} with the given maximum
 * length.
 *
 * @param text the text to clean; may be {@code null}
 * @param maxLength the length limit handed to
 *        {@code StringUtils.replaceNonUtf8}
 * @return the cleaned text, or {@code null} if {@code text} is {@code null}
 */
private static String convert(final String text, int maxLength) {
    if (text == null) {
        return null;
    }
    final String entitiesReplaced = StringUtils.replaceHtmlEntities(text);
    return StringUtils.replaceNonUtf8(entitiesReplaced, maxLength);
}
/**
 * Converts an etymology into plain text. The wiki text is first run
 * through the template parser with an {@code EtymologyTemplateHandler};
 * if that processing fails with any exception, the unprocessed wiki text
 * is converted to plain text instead.
 *
 * @param etymology the etymology; may be {@code null}
 * @return the plain text, or {@code null} if {@code etymology} is
 *         {@code null}
 */
protected String convertEtymology(final IWikiString etymology) {
    if (etymology == null) {
        return null;
    }

    try {
        final String parsedText = TemplateParser.parse(
                etymology.getText(), new EtymologyTemplateHandler());
        return WikiString.makePlainText(parsedText);
    } catch (Exception ignored) {
        // Best effort: fall through to the unprocessed wiki text below.
    }
    return WikiString.makePlainText(etymology.getText());
}
}