/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform.wikipedia;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import de.tudarmstadt.ukp.lmf.model.core.Definition;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.core.TextRepresentation;
import de.tudarmstadt.ukp.lmf.model.enums.EDefinitionType;
import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.enums.ERelNameSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.ERelTypeSemantics;
import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel;
import de.tudarmstadt.ukp.lmf.model.miscellaneous.ConstraintSet;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.morphology.Lemma;
import de.tudarmstadt.ukp.lmf.model.mrd.Equivalent;
import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis;
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.semantics.SenseRelation;
import de.tudarmstadt.ukp.lmf.model.semantics.SynSemCorrespondence;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrameSet;
import de.tudarmstadt.ukp.lmf.transform.DBConfig;
import de.tudarmstadt.ukp.lmf.transform.LMFDBTransformer;
import de.tudarmstadt.ukp.lmf.transform.StringUtils;
import de.tudarmstadt.ukp.wikipedia.api.Category;
import de.tudarmstadt.ukp.wikipedia.api.Page;
import de.tudarmstadt.ukp.wikipedia.api.PageIterator;
import de.tudarmstadt.ukp.wikipedia.api.Wikipedia;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.FlushTemplates;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParserFactory;
/**
* Converts Wikipedia to the LMF model and saves it to the database. Creation of Equivalents is optional, as it requires considerably more time.
* @author Yevgen Chebotar
* @author Christian M. Meyer
* @param createEquivalents Decides if TranslationEquivalents are created or not
*/
public abstract class WikipediaLMFTransformer extends LMFDBTransformer {
public final static String ARTICLE_TITLE = "articleTitle";
protected final Wikipedia wiki; // JWPL Wikipedia object
protected final Iterator<Page> pageIterator; // Page iterator
protected int currentEntryNr; // Number of the entries(pages) that were already transformed
protected final Set<Integer> categoryBlackList; // List of categories that should not be saved as SubjectFields
protected final MediaWikiParser mediaWikiParser; // Parser needed for parsing of Wikipedia pages
protected final String dtd_version;
protected final boolean createEquivalents; //Decision if Translation Equivalents should be generated. This takes a lot more time!!
protected String resourceVersion; //Description of external system
/**
* @param dbConfig
* @param wiki
* @param resourceVersion Version of the resource
* @param dtd Version of the dtd
* @param createEquivalents
* @throws WikiApiException
* @throws FileNotFoundException
*/
public WikipediaLMFTransformer(DBConfig dbConfig, Wikipedia wiki, String resourceVersion,
String dtd, boolean createEquivalents) throws WikiApiException, FileNotFoundException {
super(dbConfig);
this.wiki = wiki;
this.pageIterator = new PageIterator(wiki, true, 7000);
this.categoryBlackList = wiki.getCategory(getHiddenCategoryName()).getChildrenIDs();
this.currentEntryNr = 0;
this.createEquivalents = createEquivalents;
MediaWikiParserFactory pf = new MediaWikiParserFactory(); // Parse with MediaWikiParser
pf.setCalculateSrcSpans(false);
pf.setTemplateParserClass(FlushTemplates.class);
mediaWikiParser = pf.createParser();
dtd_version = dtd;
this.resourceVersion = resourceVersion;
}
protected abstract String getHiddenCategoryName();
@Override
protected abstract String getResourceAlias();
@Override
protected abstract LexicalResource createLexicalResource();
@Override
protected Lexicon createNextLexicon() {
if(!pageIterator.hasNext() /*|| currentEntryNr > 100*/) {
return null;
}
Lexicon lexicon = new Lexicon();
String lmfLang = WikipediaLMFMap.mapLanguage(wiki.getLanguage());
lexicon.setId(getLmfId(Lexicon.class, "lexiconWiki" + lmfLang));
lexicon.setLanguageIdentifier(lmfLang);
lexicon.setName("Wikipedia");
return lexicon;
}
@Override
protected LexicalEntry getNextLexicalEntry() {
if(currentEntryNr % 1000 == 0) {
System.out.println("PROCESSED "+currentEntryNr +" ENTRIES");
}
/* if(currentEntryNr > 100)
return null;*/
while(pageIterator.hasNext()){ // Iterate over pages, skip pages that should not be saved
String pageTitle = "";
Page page = null;
try{
page = pageIterator.next();
if(page.isDisambiguation() || page.isRedirect() // Skip redirect, disambiguation and discussion pages
|| isDiscussionPage(page.getTitle().getPlainTitle())) {
continue;
}
boolean alreadyExists = true; // If true there was already an entry created for this page
String wikiLmfLang = WikipediaLMFMap.mapLanguage(wiki.getLanguage());
pageTitle = page.getTitle().getPlainTitle();
String name = page.getTitle().getEntity();
String id = "Art"+name;
LexicalEntry lexEntry = null;
String entryId;
if(idMapping.containsKey(id)){ // If the entry(page without the disambiguation tag)
// for this page has been already created,
entryId = getLmfId(LexicalEntry.class, id); // get it from the database
lexEntry = (LexicalEntry)getLmfObjectById(LexicalEntry.class, entryId);
}
else {
entryId = getLmfId(LexicalEntry.class, id);
}
if(lexEntry == null){ // No entry was created-->create new one
alreadyExists = false;
lexEntry = new LexicalEntry();
lexEntry.setId(entryId);
Lemma lemma = new Lemma();
lemma.setFormRepresentations(new ArrayList<FormRepresentation>());
FormRepresentation formRep = new FormRepresentation();
formRep.setLanguageIdentifier(wikiLmfLang);
formRep.setWrittenForm(name);
lemma.getFormRepresentations().add(formRep);
lexEntry.setLemma(lemma);
lexEntry.setSenses(new ArrayList<Sense>());
lexEntry.setPartOfSpeech(EPartOfSpeech.noun);
}
Sense sense = new Sense();
sense.setId(getLmfId(Sense.class, String.valueOf(page.getPageId())));
MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef();
monolingualExternalRef.setExternalSystem(resourceVersion + "_" + ARTICLE_TITLE);
monolingualExternalRef.setExternalReference(pageTitle);
List<MonolingualExternalRef> monolingualExternalRefs = new LinkedList<MonolingualExternalRef>();
monolingualExternalRefs.add(monolingualExternalRef);
sense.setMonolingualExternalRefs(monolingualExternalRefs);
sense.setDefinitions(new ArrayList<Definition>());
String pageText = page.getText(); // Parse the page, take only first 10000 characters to improve performance
ParsedPage ppage;
if(!createEquivalents)
{
ppage = mediaWikiParser.parse(pageText.substring(0, Math.min(10000, pageText.length())));
}
else
{
ppage = mediaWikiParser.parse(pageText);
if (ppage.getLanguagesElement()!=null)
{
List<Equivalent> equivalents = new ArrayList<Equivalent>();
List<Link> languageLinks = ppage.getLanguages();
for(Link link : languageLinks)
{
String languageCode = link.getTarget().split(":")[0];
String targetForm = convert(link.getTarget().split(":")[1], 255);
if (targetForm == null || targetForm.isEmpty()) {
continue; // Do not save empty translations.
}
ILanguage language = WikipediaLanguageMapper.findByCode(languageCode);
if (language == null) {
continue; // Do not save translations to unknown languages.
}
Equivalent equivalent = new Equivalent();
equivalent.setWrittenForm(targetForm);
equivalent.setLanguageIdentifier(language.getISO639_3());
equivalents.add(equivalent);
}
sense.setEquivalents(equivalents);
}
}
if(ppage.getFirstParagraph() == null) {
continue;
}
String text = ppage.getFirstParagraph().getText();
Definition definition = new Definition(); // Save first paragraph text as definition text
definition.setDefinitionType(EDefinitionType.intensionalDefinition);
definition.setTextRepresentations(new ArrayList<TextRepresentation>());
TextRepresentation textRep = new TextRepresentation();
textRep.setLanguageIdentifier(wikiLmfLang);
textRep.setWrittenText(text);
definition.getTextRepresentations().add(textRep);
sense.getDefinitions().add(definition);
sense.setSenseRelations(new ArrayList<SenseRelation>());
for(String redirect : page.getRedirects()){ // Save redirects as SenseRelations, relType=association
if (isDiscussionPage(redirect)) {
continue;
}
SenseRelation senseRelation = new SenseRelation();
FormRepresentation targetForm = new FormRepresentation();
targetForm.setLanguageIdentifier(wikiLmfLang);
targetForm.setWrittenForm(redirect);
senseRelation.setFormRepresentation(targetForm);
senseRelation.setRelName(ERelNameSemantics.RELATED);
senseRelation.setRelType(ERelTypeSemantics.association);
sense.getSenseRelations().add(senseRelation);
}
sense.setSemanticLabels(new ArrayList<SemanticLabel>()); // Save categories as SubjectFields
for(Category category : page.getCategories()){
if(categoryBlackList.contains(category.getPageId())) {
continue;
}
SemanticLabel semanticLabel = new SemanticLabel();
semanticLabel.setType(ELabelTypeSemantics.category);
//subjectField.setRegisterType(ERegisterType.usage);
//subjectField.setSubjectField(category.getTitle().getPlainTitle());
semanticLabel.setLabel(category.getTitle().getPlainTitle());
sense.getSemanticLabels().add(semanticLabel);
}
lexEntry.getSenses().add(sense);
currentEntryNr++;
if(alreadyExists){ // If the entry already exists, then only add sense to it, and continue to the next one
saveList(lexEntry, lexEntry.getSenses());
}
else {
return lexEntry; // If the entry does not exist, return it
}
}catch (Exception ex){
System.err.println("Error while transforming '"+pageTitle+"': "
+ ex.getMessage());
ex.printStackTrace();
}
}
return null;
}
private static String convert(final String text, int maxLength) {
String returnString =StringUtils.replaceNonUtf8(StringUtils.replaceHtmlEntities(text), maxLength);
returnString = returnString.split("_\\(")[0].replace("_", " ");
return returnString;
}
protected abstract boolean isDiscussionPage(final String pageTitle);
@Override
protected void finish() {}
@Override
protected ConstraintSet getNextConstraintSet() {return null;}
@Override
protected SemanticPredicate getNextSemanticPredicate() {return null;}
@Override
protected SenseAxis getNextSenseAxis() {return null;}
@Override
protected SubcategorizationFrame getNextSubcategorizationFrame() {return null;}
@Override
protected SubcategorizationFrameSet getNextSubcategorizationFrameSet() {return null;}
@Override
protected SynSemCorrespondence getNextSynSemCorrespondence() {return null;}
@Override
protected Synset getNextSynset() {return null;}
}