/** * File ./src/main/java/de/lemo/dms/connectors/chemgapedia/fizHelper/BotFinder.java * Lemo-Data-Management-Server for learning analytics. * Copyright (C) 2013 * Leonard Kappe, Andreas Pursian, Sebastian Schwarzrock, Boris Wenzlaff * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. **/ /** * File ./main/java/de/lemo/dms/connectors/chemgapedia/fizHelper/BotFinder.java * Date 2013-01-24 * Project Lemo Learning Analytics */ package de.lemo.dms.connectors.chemgapedia.fizHelper; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; /** * This class offers some functions for the detection of web-crawlers. * * @author s.schwarzrock */ public class BotFinder { /** * Takes all requests of a user and divides the sequence into several sessions according to web-usyge heuristics. * * @param log * @return */ public List<ArrayList<LogObject>> sessionize(final List<LogObject> log) { final ArrayList<ArrayList<LogObject>> sessions = new ArrayList<ArrayList<LogObject>>(); final ArrayList<LogObject> currentSession = new ArrayList<LogObject>(); final HashSet<String> previousPages = new HashSet<String>(); for (int i = 0; i < log.size(); i++) { final LogObject l = log.get(i); if ((l.getReferrer().equals("-")) || !previousPages.contains(l.getReferrer())) { if (currentSession.size() > 0) { sessions.add(currentSession); } currentSession.clear(); previousPages.clear(); } currentSession.add(l); previousPages.add(l.getUrl()); } return sessions; } /** * @param log * Logfile containing all views the user did. * @param chunkLimit * Maximal number of views within a second. * @return List with all suspicious chunks. */ public List<Integer> checkFastOnes(final List<LogObject> log, final int chunkLimit) { final ArrayList<Integer> chunks = new ArrayList<Integer>(); Collections.sort(log); long lastTime = 0; int chunksize = 0; for (int i = 0; i < log.size(); i++) { if (i > 0) { if (log.get(i).getTime() == lastTime) { chunksize++; } else { if (chunksize > chunkLimit) { chunks.add(chunksize); } chunksize = 0; } } lastTime = log.get(i).getTime(); } return chunks; } /** * @param log * Logfile containing all views the user did. * @param chunkLimit * Maximal number of occurrences of a specific time span in relation to the total number of views by this * user. * @return List containing all suspicious chunks. */ public int checkPeriods(final List<LogObject> log, final int chunkLimit) { Collections.sort(log); // Skip if there was just one view if (log.size() < chunkLimit) { return 0; } // Final List int chunks = 0; // List of time spans that already occurred final HashMap<Integer, Integer> span = new HashMap<Integer, Integer>(); // List of the number of occurrences of known time spans long lastT = 0; for (int i = 0; i < log.size(); i++) { if (i > 0) { final Integer l = (int) (log.get(i).getTime() - lastT); if (span.get(l) != null) { final int c = span.get(l) + 1; span.put(l, c); } else { span.put(l, 1); } lastT = log.get(i).getTime(); } } final ArrayList<Integer> counts = new ArrayList<Integer>(span.values()); for (int i = 0; i < counts.size(); i++) { if (counts.get(i) > (log.size() / chunkLimit)) { chunks++; } } return chunks; } /** * @param log * Logfile containing all views the user did. * @param chunkLimit * Maximal number of views performed on a single page. * @return number of requests of the most frequent requested page */ public int checkForRepetitions(final List<LogObject> log, final int chunkLimit) { final Map<String, Integer> temp = new Hashtable<String, Integer>(); for (int i = 0; i < log.size(); i++) { if (temp.containsKey(log.get(i).getUrl())) { final int d = temp.get(log.get(i).getUrl()) + 1; temp.put(log.get(i).getUrl(), d); } else { temp.put(log.get(i).getUrl(), 1); } } final Set<String> set = temp.keySet(); final Iterator<String> it = set.iterator(); String ke; int max = 0; while (it.hasNext()) { ke = it.next(); if ((temp.get(ke) > chunkLimit) && (temp.get(ke) > max)) { max = temp.get(ke); } } return max; } }