package org.wikibrain.loader; import gnu.trove.set.TIntSet; import org.apache.commons.lang.StringUtils; import org.wikibrain.core.cmd.FileMatcher; import org.wikibrain.core.dao.*; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageInfo; import org.wikibrain.core.model.LocalLink; import org.wikibrain.core.model.NameSpace; import org.wikibrain.core.model.Title; import org.wikibrain.parser.sql.MySqlDumpParser; import org.wikibrain.utils.ParallelForEach; import org.wikibrain.utils.Procedure; import org.wikibrain.utils.WpThreadUtils; import java.io.File; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Loads links that are in the SQL dump but not the parsed wiki text. */ public class SqlLinksLoader { private static final Logger LOG = LoggerFactory.getLogger(SqlLinksLoader.class); private final AtomicInteger counter = new AtomicInteger(); private final File sqlDump; private final Language language; private TIntSet validIds; private final LocalLinkDao dao; private final LocalPageDao pageDao; private final LocalLinkSet existing; private final MetaInfoDao metaDao; private AtomicLong totalLinks = new AtomicLong(); private AtomicLong interestingLinks = new AtomicLong(); private AtomicLong newLinks = new AtomicLong(); public SqlLinksLoader(LocalLinkDao dao, LocalPageDao pageDao, MetaInfoDao metaDao, File file, LocalLinkSet existing) throws DaoException { this.dao = dao; this.metaDao = metaDao; this.pageDao = pageDao; this.sqlDump = file; this.language = FileMatcher.LINK_SQL.getLanguage(file.getAbsolutePath()); int n = dao.getCount(new DaoFilter().setLanguages(language)); n = Math.max(10000, n); n *= 2 * 3; // guess that there will be twice as many links as there are now, to be safe, array size should be 3 times as big. LOG.info("guessing at size of array at " + n); this.existing = existing; } public void load() throws DaoException { totalLinks.set(0); newLinks.set(0); interestingLinks.set(0); ParallelForEach.iterate( new MySqlDumpParser().parse(sqlDump).iterator(), WpThreadUtils.getMaxThreads(), 1000, new Procedure<Object[]>() { @Override public void call(Object[] row) throws Exception { processOneLink(row); } }, 1000000 ); } private void processOneLink(Object[] row) throws DaoException { if (totalLinks.incrementAndGet() % 100000 == 0) { LOG.info("Processed link " + totalLinks + ", found " + interestingLinks + " interesting and " + newLinks + " new"); } Integer srcPageId = (Integer) row[0]; Integer destNamespace = (Integer) row[1]; String destTitle = (String) row[2]; NameSpace ns = NameSpace.getNameSpaceByValue(destNamespace); // TODO: make this configurable if (ns == null || (ns != NameSpace.ARTICLE && ns != NameSpace.CATEGORY)) { return; } if (srcPageId < 0 || StringUtils.isEmpty(destTitle)) { return; } interestingLinks.incrementAndGet(); Title title = new Title(destTitle, LanguageInfo.getByLanguage(language)); int destId = pageDao.getIdByTitle(title.getTitleStringWithoutNamespace(), language, ns); if (destId < 0) { // Handle red link } else if (validIds != null && (!validIds.contains(srcPageId) || !validIds.contains(destId))) { // Skip } else { LocalLink ll = new LocalLink(language, "", srcPageId, destId, true, -1, false, LocalLink.LocationType.NONE); if (!existing.contains(ll)) { newLinks.incrementAndGet(); dao.save(ll); } } } public void setValidIds(TIntSet validIds) { this.validIds = validIds; } }