/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.wikipedia; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.WriteTextFile; public class WikiOntology { public static String wikiFile = "/media/OS/home/data/Wikipedia/enwiki-latest-pages-articles.xml"; public static String targetOntology = "/media/OS/home/public/thesauri/Wikipedia.psf"; private Map<String, Data> title2data = new HashMap<String, Data>(); private Pattern pattern = Pattern.compile("\\* ?\\[\\["); public static void main(String[] args) { new WikiOntology(); } public WikiOntology(){ boolean firstTitle = true; boolean addEntry = true; boolean disambiguation = true; int cid = 0; String definition = ""; List<String> terms = new ArrayList<String>(); String title = ""; int count = 0; for (String line : new ReadTextFile(wikiFile)){ String trimLine = line.trim(); String lcLine = trimLine.toLowerCase(); if (lcLine.equals("<page>")){ count++; if (count % 10000 == 0) System.out.println(count); //if (count == 250000) // break; firstTitle = true; addEntry = true; disambiguation = false; } else if (firstTitle && lcLine.startsWith("<title>")){ firstTitle = false; title = StringUtilities.findBetween(trimLine, "<title>", "</title>"); if (title.endsWith("(disambiguation)")){ disambiguation = true; addEntry = false; title = title.replace("(disambiguation)", "").trim(); } else if (title.toLowerCase().startsWith("file:") || title.toLowerCase().startsWith("category:") || title.toLowerCase().startsWith("list of") || title.toLowerCase().startsWith("wikipedia:") || title.toLowerCase().startsWith("template:") || title.toLowerCase().startsWith("portal:")) addEntry = false; } else if (lcLine.contains("#redirect [[")){ String redirect = StringUtilities.findBetween(trimLine, "[[", "]]"); addTerm(redirect, title); addEntry = false; } else if (disambiguation && lcLine.startsWith("*") && pattern.matcher(lcLine).find()){ String redirect = StringUtilities.findBetween(trimLine, "[[", "]]"); if (redirect.length() != 0) addTerm(redirect, title); } else if (disambiguation && lcLine.contains("{{disambig}}")){ disambiguation = false; } else if (addEntry && trimLine.equals("</page>")){ addTerm(title, title); for (String term : terms) addTerm(title, term); } } dumpToFile(); } private void dumpToFile() { WriteTextFile out = new WriteTextFile(targetOntology); int cid = 0; for (Map.Entry<String, Data> entry : title2data.entrySet()){ out.writeln("0|"+StringUtilities.join(entry.getValue(), ";") + "|" + cid++); } out.close(); } private void addTerm(String title, String term){ Data data = title2data.get(title); if (data == null){ data = new Data(1); title2data.put(title, data); } if (!data.contains(term)) data.add(term); } private Concept newConcept(int cid, String title, List<String> terms, String definition) { List<TermStore> termStores = new ArrayList<TermStore>(terms.size()); Set<String> seenTerms = new HashSet<String>(terms.size()); for (String term : terms) if (seenTerms.add(term)){ TermStore termStore = new TermStore(term); termStores.add(termStore); } Concept concept = new Concept(cid); concept.setTerms(termStores); concept.setDefinition(definition); return concept; } private class Data extends ArrayList<String>{ public Data(int size){ super(size); } String definition; } }