package org.wikibrain.mapper.algorithms; import com.google.common.collect.HashMultimap; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import com.typesafe.config.Config; import org.wikibrain.conf.Configuration; import org.wikibrain.conf.ConfigurationException; import org.wikibrain.conf.Configurator; import org.wikibrain.core.cmd.Env; import org.wikibrain.core.cmd.FileMatcher; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.core.lang.LocalId; import org.wikibrain.core.model.NameSpace; import org.wikibrain.core.model.Title; import org.wikibrain.core.model.UniversalPage; import org.wikibrain.mapper.ConceptMapper; import org.wikibrain.mapper.MapperIterator; import org.wikibrain.parser.sql.MySqlDumpParser; import java.io.*; import java.util.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * User: bjhecht * Date: 6/25/13 * Time: 1:59 PM */ public class PureWikidataConceptMapper extends ConceptMapper { private static Logger LOG = LoggerFactory.getLogger(PureWikidataConceptMapper.class); private final File wikiDataPath; public PureWikidataConceptMapper(File wikiDataPath, int id, LocalPageDao localPageDao) { super(id, localPageDao); this.wikiDataPath = wikiDataPath; } @Override public int getId() { return super.getId(); } @Override public Iterator<UniversalPage> getConceptMap(LanguageSet ls) throws DaoException { final Map<Integer, Multimap<Language, LocalId>> backend = Maps.newHashMap(); final Map<Integer, NameSpace> nsBackend = Maps.newHashMap(); // loop through sql dump MySqlDumpParser dumpParser = new MySqlDumpParser(); Iterable<Object[]> lines = dumpParser.parse(wikiDataPath); int lineCounter = 0; int validLineCounter = 0; int[] numLangsCount = new int[ls.size()]; Set<String> unknownLangs = new HashSet<String>(); int unknownPages = 0; for (Object[] line : lines){ lineCounter++; if (lineCounter % 1000000 == 0){ LOG.info(String.format("Done with %d total lines of Wikidata dump file", lineCounter)); } String langCode = ((String)line[2]).replaceAll("wiki",""); if (!Language.hasLangCode(langCode)) { unknownLangs.add(langCode); continue; } Language lang = Language.getByLangCode(langCode); if (!ls.containsLanguage(lang)){ continue; } Integer univId = (Integer)line[1]; String strTitle = (String)line[3]; Title title = new Title(strTitle, lang); int localId = localPageDao.getIdByTitle(title); if (localId <= 0){ unknownPages++; continue; } if (!backend.containsKey(univId)){ Multimap<Language, LocalId> mmap = HashMultimap.create(); backend.put(univId, mmap); nsBackend.put(univId, title.getNamespace()); // defines the universal page as having the namespace of the first LocalPage encountered numLangsCount[0]++; }else{ numLangsCount[backend.get(univId).size()-1]--; numLangsCount[backend.get(univId).size()]++; } backend.get(univId).put(lang, new LocalId(lang, localId)); validLineCounter++; if (validLineCounter % 10000 == 0){ // do some reporting in the log, necessary for such a large operation (both for debugging and for providing the user with something to watch :-)) LOG.info("Found " + validLineCounter + " local pages in input language set"); StringBuilder langDistLine = new StringBuilder(); langDistLine.append("distribution of pages per # languages: "); for(int i = 0; i < numLangsCount.length; i++){ langDistLine.append(numLangsCount[i]); langDistLine.append("\t"); } LOG.info(langDistLine.toString()); } } LOG.warn("encountered unknown languages: " + unknownLangs); LOG.warn("encountered " + unknownPages + " local pages not in the database"); return new MapperIterator<UniversalPage>(backend.keySet()) { @Override public UniversalPage transform(Object obj) { Integer univId = (Integer)obj; return new UniversalPage(univId, getId(), nsBackend.get(univId), backend.get(univId)); } }; } public static class Provider extends org.wikibrain.conf.Provider<ConceptMapper> { public Provider(Configurator configurator, Configuration config) throws ConfigurationException { super(configurator, config); } @Override public Class getType() { return ConceptMapper.class; } @Override public String getPath() { return "mapper"; } @Override public ConceptMapper get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException { if (!config.getString("type").equals("purewikidata")) { return null; } List<File> paths = Env.getFiles(Language.WIKIDATA, FileMatcher.WIKIDATA_ITEMS, getConfig()); if (paths.isEmpty()) { throw new ConfigurationException("No wikidata file available for PurWikidataConceptMapper"); } return new PureWikidataConceptMapper( paths.get(0), config.getInt("algorithmId"), getConfigurator().get( LocalPageDao.class, config.getString("localPageDao")) ); } } }