package tv.dyndns.kishibe.qmaclone.server.relevance; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.net.URL; import java.util.List; import java.util.Scanner; import java.util.logging.Level; import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import com.google.common.base.Preconditions; import com.google.common.base.Throwables; import com.google.common.collect.Lists; import com.google.inject.Inject; import tv.dyndns.kishibe.qmaclone.client.constant.Constant; import tv.dyndns.kishibe.qmaclone.server.util.Downloader; import tv.dyndns.kishibe.qmaclone.server.util.DownloaderException; import tv.dyndns.kishibe.qmaclone.server.util.Normalizer; public class WikipediaAllTitlesDictionary implements Dictionary { private static final Logger logger = Logger .getLogger(WikipediaAllTitlesDictionary.class.getName()); private static final String WIKIPEDIA_ALL_TITLE_URL = "https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-all-titles-in-ns0.gz"; private static final File WIKIPEDIA_ALL_TITLE_FILE = new File( Constant.FILE_PATH_BASE + "qmaclone/jawiki-latest-all-titles-in-ns0.gz"); private final Downloader downloader; @Inject public WikipediaAllTitlesDictionary(Downloader downloader) { this.downloader = Preconditions.checkNotNull(downloader); } @Override public List<String> getWords() { try { ensureFile(); return readFile(); } catch (IOException e) { logger.log(Level.WARNING, "Wikipediaのタイトル一覧の取得に失敗しました", e); return Lists.newArrayList(); } } private void ensureFile() throws IOException { if (WIKIPEDIA_ALL_TITLE_FILE.isFile() && System .currentTimeMillis() < WIKIPEDIA_ALL_TITLE_FILE.lastModified() + 7L * 24 * 60 * 60 * 1000) { return; } try { downloader.downloadToFile(new URL(WIKIPEDIA_ALL_TITLE_URL), WIKIPEDIA_ALL_TITLE_FILE); } catch (DownloaderException e) { logger.log(Level.SEVERE, "Wikipedia全記事タイトル一覧のダウンロードに失敗しました", e); throw Throwables.propagate(e); } } private List<String> readFile() throws IOException { List<String> words = Lists.newArrayList(); try ( Scanner scanner = new Scanner( new BufferedInputStream(new GZIPInputStream( new BufferedInputStream(new FileInputStream(WIKIPEDIA_ALL_TITLE_FILE)))), "utf-8")) { while (scanner.hasNextLine()) { String line = scanner.nextLine().trim(); if (line.isEmpty()) { continue; } if (line.contains("(")) { line = line.substring(0, line.indexOf("(")); } line = line.replaceAll("_", ""); words.add(Normalizer.normalize(line)); if (words.size() % 100000 == 0) { logger.log(Level.INFO, "Wikipedia: " + words.size()); } } } return words; } }