/* * Copyright 2015 Ben Manes. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.benmanes.caffeine.cache.simulator.parser.wikipedia; import java.io.IOException; import java.util.List; import java.util.Objects; import java.util.stream.LongStream; import javax.annotation.Nullable; import org.apache.commons.lang3.StringUtils; import com.github.benmanes.caffeine.cache.simulator.parser.TextTraceReader; import com.google.common.hash.Hashing; /** * A reader for the trace files provided by the <a href="http://www.wikibench.eu">wikibench</a> * project. The requests are sanitized and filtered using the <tt>TraceBench</tt> optimizations. * * @author ben.manes@gmail.com (Ben Manes) */ public final class WikipediaTraceReader extends TextTraceReader { private static final String[] CONTAINS_FILTER = {"?search=", "&search=", "User+talk", "User_talk", "User:", "Talk:", "&diff=", "&action=rollback", "Special:Watchlist"}; private static final String[] STARTS_WITH_FILTER = {"wiki/Special:Search", "w/query.php", "wiki/Talk:", "wiki/Special:AutoLogin", "Special:UserLogin", "w/api.php", "error:"}; private static final String[] SEARCH_LIST = { "%2F", "%20", "&", "%3A" }; private static final String[] REPLACEMENT_LIST = { "/", " ", "&", ":" }; public WikipediaTraceReader(List<String> filePaths) { super(filePaths); } @Override public LongStream events() throws IOException { return lines() .map(this::parseRequest) .filter(Objects::nonNull) .mapToLong(path -> Hashing.murmur3_128().hashUnencodedChars(path).asLong()); } /** * Returns the request's path or {@code null} if this request should be ignored. The input is * space deliminated with the following format, * <ul> * <li>A monotonically increasing counter (useful for sorting the trace in chronological order) * <li>The timestamp of the request in Unix notation with millisecond precision * <li>The requested URL * <li>A flag to indicate if the request resulted in a database update or not ('-' or 'save') * </ul> */ private @Nullable String parseRequest(String line) { if (!isRead(line)) { return null; } String url = getRequestUrl(line); if (url.length() > 12) { String path = getPath(url); if (isAllowed(path)) { return path; } } return null; } /** Returns whether the request resulted in a write to the database. */ private boolean isRead(String line) { return line.charAt(line.length() - 1) == '-'; } /** Returns the request URL. */ private String getRequestUrl(String line) { int end = line.length() - 2; while (line.charAt(end) != ' ') { end--; } int start = end - 1; while (line.charAt(start) != ' ') { start--; } return line.substring(start + 1, end); } /** Returns the path segment of the URL. */ private String getPath(String url) { int index = url.indexOf('/', 7); if (index == -1) { return url; } // Replace the html entities that we want to search for inside paths String cleansed = url.substring(index + 1); for (int i = 0; i < SEARCH_LIST.length; i++) { cleansed = StringUtils.replace(cleansed, SEARCH_LIST[i], REPLACEMENT_LIST[i]); } return cleansed; } /** * Returns if the path should be included. The request is ignored if it is a search query, a * page revision, related to users or user management, or talk pages. */ public boolean isAllowed(String path) { for (String filter : STARTS_WITH_FILTER) { if (path.startsWith(filter)) { return false; } } for (String filter : CONTAINS_FILTER) { if (path.contains(filter)) { return false; } } return true; } }