Crawler.java example

Explorer
WotCrawler-master
- src
  - main
    - java
      - de
        nx42
        wotcrawler
        Launcher.java
        db
        BaseProperties.java
        Modules.java
        TanksDB.java
        module
        Engine.java
        Gun.java
        Module.java
        Radio.java
        Suspension.java
        Turret.java
        package-info.java
        package-info.java
        tank
        Equipment.java
        Tank.java
        TankRef.java
        package-info.java
        ext
        Evaluator.java
        Field.java
        FieldDef.java
        ModuleMap.java
        TankRating.java
        package-info.java
        package-info.java
        util
        Conversion.java
        Download.java
        Tuple.java
        package-info.java
        xml
        Crawler.java
        Parser.java
        Serializer.java
        Transformer.java
        package-info.java
  - test
    - java
      - de
        nx42
        wotcrawler
        AppTest.java
/*
 * Copyright (C) 2012 Sebastian Straub <sebastian-straub@gmx.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.nx42.wotcrawler.xml;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import de.nx42.wotcrawler.db.BaseProperties;
import de.nx42.wotcrawler.db.BaseProperties.Currency;
import de.nx42.wotcrawler.db.BaseProperties.Nation;
import de.nx42.wotcrawler.db.Modules;
import de.nx42.wotcrawler.db.TanksDB;
import de.nx42.wotcrawler.db.module.Engine;
import de.nx42.wotcrawler.db.module.Engine.Gas;
import de.nx42.wotcrawler.db.module.Gun;
import de.nx42.wotcrawler.db.module.Module;
import de.nx42.wotcrawler.db.module.Module.ModuleType;
import de.nx42.wotcrawler.db.module.Radio;
import de.nx42.wotcrawler.db.module.Suspension;
import de.nx42.wotcrawler.db.module.Turret;
import de.nx42.wotcrawler.db.tank.Equipment;
import de.nx42.wotcrawler.db.tank.Tank;
import de.nx42.wotcrawler.db.tank.Tank.TankType;
import de.nx42.wotcrawler.db.tank.TankRef;
import de.nx42.wotcrawler.util.Conversion;
import de.nx42.wotcrawler.util.Download;
import de.nx42.wotcrawler.util.Tuple;

/**
 * This is the database crawler. It retrieves Information from the WoT Wiki
 * pages and stores them in the TanksDB.
 * 
 * @author Sebastian Straub <sebastian-straub@gmx.net>
 */
public class Crawler {
    
    private static final Logger log = LoggerFactory.getLogger(Crawler.class);
    
    /** base URL of the site to retrieve details from */
    public final static String baseURL = "http://wiki.worldoftanks.eu";
    /** US number format (to recognize stuff like "100,000.00" correctly) */
    protected final static NumberFormat format = NumberFormat.getInstance(Locale.US);
    
    /** Decides whether this crawler receives files from the local filesystem or directly from the web */
    protected Source src;
    /** If the local file system is used as source, this is the folder where all the pages are */
    protected String localFolder;
    /** Common subfolders that might be used for different types of data */
    protected File[] localParents;
    
    /** the tanks database that is filled by this crawler */
    protected TanksDB db = new TanksDB();
    /** a mapping from tank name (actually: wiki URL) to the internal tank objects
     * (used to create parent-child relations) */
    protected Map<String,Tank> tankMapping = new HashMap<String, Tank>();
    
    
    /**
     * Creates a crawler that reads the wiki pages directly from the web
     * (source type {@code Source.URL}).
     */
    public Crawler() {
        src = Source.URL;
    }
    
    /**
     * Crawls a previously created local copy of the relevant wiki pages
     * (using the Download class)
     * @param localFolder
     */
    public Crawler(String localFolder) {
        this.src = Source.FILE;
        this.localFolder = localFolder;
        this.localParents = new File[]{
            new File(localFolder, Download.folderTanks),
            new File(localFolder, Download.folderModules),
            new File(localFolder, Download.folderLists),
            new File(localFolder)
        };
    }
    
    /**
     * Builds the complete tank database with the current crawler settings:
     * collects the tank URLs, crawls every tank detail page, links parents
     * and children, and finally crawls all modules. Progress is reported on
     * standard output.
     *
     * @return the filled tank database (the crawler's own instance)
     */
    public TanksDB buildTankDB() {

        // --- tanks ---

        System.out.print("Retrieving tank URLs... ");
        List<URL> tankPages = getTankURLs();
        System.out.println("done.");

        String announcement = String.format("\nCrawling detail pages for %s tanks...", tankPages.size());
        System.out.println(announcement);
        List<Tank> crawled = crawlAllTankDetails(tankPages);

        System.out.print("\nCreating parent and child relations... ");
        linkTankRelations(crawled);
        System.out.println("done.");

        db.tanks = crawled;

        // --- modules ---

        System.out.println("\nCrawling Modules...");
        Modules allModules = crawlModules();
        db.modules = allModules;

        return db;
    }
    
    
    // -------------------- URLs (file/http) --------------------
    
    
    /**
     * Builds the URL (http or file, depending on the crawler's source) of the
     * overview list for every tank type. Types whose URL cannot be built are
     * logged and left out.
     *
     * @return the overview URLs, in the declaration order of the tank types
     */
    public List<URL> getTankOverviewURLs() {
        List<URL> overviewUrls = new LinkedList<URL>();
        TankType[] allTypes = TankType.values();
        for (int i = 0; i < allTypes.length; i++) {
            try {
                URL overview = buildURL(allTypes[i].getOverviewPage());
                overviewUrls.add(overview);
            } catch (IOException ex) {
                log.error("Error building URL", ex);
            }
        }
        return overviewUrls;
    }
    
    /**
     * Collects the overview-list URLs (http or file) of every module type by
     * concatenating the per-type results.
     *
     * @return the overview URLs of all module types
     */
    public List<URL> getModuleOverviewURLs() {
        List<URL> collected = new LinkedList<URL>();
        ModuleType[] types = ModuleType.values();
        for (int i = 0; i < types.length; i++) {
            List<URL> perType = getModuleOverviewURLs(types[i]);
            collected.addAll(perType);
        }
        return collected;
    }
    
    /**
     * Builds the overview-page URLs (http or file) for one module type. The
     * module list may be split into one wiki page per nation; this is probed
     * with the German subpage. If it exists, one URL per nation is returned,
     * otherwise the single overview page.
     *
     * @param m the module type to crawl
     * @return the overview URLs for this module type (one per nation or a
     * single one); empty or partial if building a URL failed
     */
    protected List<URL> getModuleOverviewURLs(ModuleType m) {
        List<URL> result = new LinkedList<URL>();
        try {
            String probe = m.getOverviewPage() + "/" + Nation.Germany;

            boolean perNationPages;
            if (src == Source.FILE) {
                perNationPages = existsLocalFile(probe);
            } else if (src == Source.URL) {
                perNationPages = !is404(buildURL(probe));
            } else {
                perNationPages = false;
            }

            if (!perNationPages) {
                // one overview page covers all nations
                result.add(buildURL(m.getOverviewPage()));
                return result;
            }

            // every nation has its own subpage for this module type
            for (Nation n : Nation.values()) {
                result.add(buildURL(m.getOverviewPage() + "/" + n));
            }
        } catch (IOException ex) {
            log.error("Error building URL", ex);
        }
        return result;
    }

    /**
     * Checks whether a HTTP request for the given URL is answered with status
     * 404. Redirects are not followed, so empty wiki pages that would
     * redirect also count by their own status code.
     * <p>
     * The request is a plain GET without a request body. Output is
     * deliberately not enabled on the connection: asking for the output
     * stream makes the JDK's HTTP implementation send the request as POST.
     *
     * @param httpUrl the URL to check (must be a http/https URL)
     * @return true if the web server answers with 404; false for any other
     * status and also when the connection fails (the failure is logged)
     */
    protected static boolean is404(URL httpUrl) {
        HttpURLConnection huc = null;
        try {
            huc = (HttpURLConnection) httpUrl.openConnection();
            huc.setRequestMethod("GET");
            huc.setInstanceFollowRedirects(false);
            // getResponseCode() connects implicitly and reads the status line
            return huc.getResponseCode() == 404;
        } catch (IOException ex) {
            log.error("Error while connecting to server", ex);
            return false;
        } finally {
            // release the connection, also on the error path
            if (huc != null) {
                huc.disconnect();
            }
        }
    }
    
    /**
     * Collects the URLs (http or file) of the detail pages of all tanks by
     * parsing every tank overview page and extracting the links found in its
     * content area. Overview pages that cannot be read or parsed are logged
     * with their URL and skipped.
     *
     * @return URLs of the tank detail pages found on all overview pages
     */
    public List<URL> getTankURLs() {
        List<URL> urls = new LinkedList<URL>();

        for (URL overviewPage : getTankOverviewURLs()) {
            try {
                Document overview = Parser.parseHTML(overviewPage);
                Node context = firstXPathResult(overview, "//div[@class=\"mw-content-ltr\"]");
                urls.addAll(crawlTankURLs(context));
            } catch (IOException ex) {
                log.error("Failed to read tank overview page " + overviewPage, ex);
            } catch (ParserConfigurationException ex) {
                log.error("Failed to set up parser for tank overview page " + overviewPage, ex);
            } catch (SAXException ex) {
                log.error("Failed to parse tank overview page " + overviewPage, ex);
            }
        }

        return urls;
    }
    
    
    // -------------------- Crawlers --------------------
    
    
    /**
     * Creates a list of URLs for relevant tanks from a tank overview site.
     * Every link inside a list item below the context node is considered;
     * links to images (".png") and empty links are ignored.
     *
     * @param context the relevant context node that holds all lists (and not
     * more, if possible). Currently: div.mw-content-ltr
     * @return list of URLs found on this site
     * @throws MalformedURLException if a link cannot be turned into a URL
     */
    protected List<URL> crawlTankURLs(Node context) throws MalformedURLException {
        List<URL> urls = new LinkedList<URL>();

        List<Node> nodes = evaluateXPath(context, ".//li/a/@href");
        for (Node node : nodes) {
            String href = node.getTextContent();
            if (href == null || href.length() == 0) {
                // an empty href would make substring(1) throw and abort the whole crawl
                continue;
            }
            // wiki-internal links start with '/'; strip it only when it is really there
            String tankUrl = href.charAt(0) == '/' ? href.substring(1) : href;
            if (!tankUrl.contains(".png")) {
                urls.add(buildURL(tankUrl));
            }
        }

        return urls;
    }
    
    /**
     * Crawls the detail pages of all given tanks and registers every
     * successfully parsed tank in the tank mapping (keyed by its wiki URL).
     * Crawling is best-effort per tank: a page that cannot be read or parsed
     * is logged and skipped, the remaining tanks are still processed.
     *
     * @param urls the urls of all tanks
     * @return list of Tank objects that could be crawled
     */
    public List<Tank> crawlAllTankDetails(List<URL> urls) {
        List<Tank> tanks = new ArrayList<Tank>(urls.size());

        int counter = 0;
        int max = urls.size();

        for (URL tankURL : urls) {
            String name = tankURL.getPath();
            if (src == Source.FILE) {
                // local copies: show the bare file name, without folder and the
                // 5-character extension (presumably ".html"); names too short
                // for that are shown unchanged instead of throwing
                int start = name.lastIndexOf('/') + 1;
                int end = name.length() - 5;
                name = (end >= start) ? name.substring(start, end) : name.substring(start);
            }

            try {
                counter++;
                System.out.println(String.format("- Crawling details for tank %s/%s: %s", counter, max, name));

                Document tankHtml = Parser.parseHTML(tankURL);
                if (isValidTank(tankHtml)) {
                    Node context = firstXPathResult(tankHtml, "//div[@id=\"Panel\" and @class=\"Tank\"]");
                    Tank tank = crawlTankDetails(context);

                    tanks.add(tank);
                    tankMapping.put(tank.wikiURL, tank);
                }

            } catch (ParseException ex) {
                log.error("Crawling of details failed for Tank " + name, ex);
            } catch (IllegalAccessException ex) {
                log.error("Crawling of details failed for Tank " + name, ex);
            } catch (IOException ex) {
                log.error("Crawling of details failed for Tank " + name, ex);
            } catch (ParserConfigurationException ex) {
                log.error("Crawling of details failed for Tank " + name, ex);
            } catch (SAXException ex) {
                log.error("Crawling of details failed for Tank " + name, ex);
            } catch (NumberFormatException ex) {
                log.error("Crawling of details failed for Tank " + name, ex);
            } catch (IndexOutOfBoundsException ex) {
                // a page with missing cells or an unexpected value format makes the
                // detail parser index past an array/list/string; skip only this tank
                log.error("Crawling of details failed for Tank " + name, ex);
            }
        }

        return tanks;
    }
    
    /**
     * Tells whether the given wiki page is the detail page of a tank that is
     * still part of the game. A page without the tank panel, or one whose
     * panel says the tank was removed, is rejected; the reason is printed to
     * standard output.
     *
     * @param wikiPage the detail page of a tank from the wot wiki
     * @return true, iff this page describes a valid tank
     */
    protected boolean isValidTank(Document wikiPage) {

        // no tank panel at all -> the wiki page does not exist
        boolean hasPanel = !evaluateXPath(wikiPage, "//div[@id=\"Panel\" and @class=\"Tank\"]").isEmpty();
        if (!hasPanel) {
            System.out.println("  -> the page about this tank is empty");
            return false;
        }

        // paragraphs directly inside the panel carry the "removed" notice
        if (isDeprecated(evaluateXPath(wikiPage, "//div[@id=\"Panel\"]/p"))) {
            System.out.println("  -> this tank was removed from the game");
            return false;
        }

        return true;
    }
    
    /**
     * Scans the given nodes for the notice that a tank was removed from the
     * game.
     *
     * @param candidates the context nodes that might contain the text
     * @return true, iff the text of at least one node contains
     * "removed from the game"
     */
    private boolean isDeprecated(List<Node> candidates) {
        boolean removed = false;
        for (Node candidate : candidates) {
            String text = candidate.getTextContent();
            if (text.contains("removed from the game")) {
                removed = true;
                break;
            }
        }
        return removed;
    }
    
    // ----------- tank details -----------
    
    /**
     * Reads all details from the specified context node to create a tank object.
     * The basic properties come from the heading and the first table of the
     * panel, everything else from the second table. Parent and child tanks
     * are only recorded by name here; the object references are linked later
     * (see linkTankRelations).
     *
     * @param context the div that contains all the tank data (class="Tank")
     * @return a complete tank object
     * @throws ParseException thrown if a number cannot be correctly parsed
     * (watch out for additional stuff like units)
     * @throws IllegalAccessException thrown if an enum was not recognized
     * correctly. Watch out for typing and additional characters around the word
     * @throws NumberFormatException thrown if one of the values read with
     * Byte.parseByte / Double.parseDouble is not a plain number
     */
    protected Tank crawlTankDetails(Node context) throws ParseException, IllegalAccessException, NumberFormatException {
        
        Tank tank = new Tank();
        
        // basics
        
        // the page heading is looked up with an absolute path ("//h1"), i.e. in the
        // whole document; its text is the key used for the tank mapping
        tank.wikiURL = firstXPathTextResult(context, "//h1[@id=\"firstHeading\"]");
        tank.name = firstXPathTextResult(context, "./h3").replace("\u00a0","");     // remove non-breaking spaces
        tank.id = generateTankID(tank.name);
        tank.nation = BaseProperties.Nation.parse(firstXPathTextResult(context, "./table[1]//td[1]"));
        tank.type = TankType.parse(firstXPathTextResult(context, "./table[1]//td[2]"));
        // third cell: the second space-separated token is the tier as a roman numeral
        tank.tier = (byte) Conversion.romanToDecimal(firstXPathTextResult(context, "./table[1]//td[3]").split(" ")[1]);
        
        // prepare for details: all following values are read from the second table
        Node context2 = evaluateXPath(context, "./table[2]/tbody").get(0);
        
        
        // battle tier: the text of the first highlighted cell is the minimum tier,
        // the number of highlighted cells gives the width of the range
        tank.battleTierMin = Byte.parseByte(firstXPathTextResult(context2, ".//td[@style=\"background-color:#A29C84;\"]"));
        tank.battleTierMax = (byte) (evaluateXPath(context2, ".//td[@style=\"background-color:#A29C84;\"]").size() + tank.battleTierMin - 1);
        
        // money
        String cost = firstCellAfterHeader(context2, "Cost");
        String costTest = cost.toLowerCase();
        if (costTest.length() < 3 || costTest.contains("error") || costTest.contains("not available")) {
            // no currency value -> regular for free tanks or tanks that are not available for the masses
            tank.cost = 0;
            tank.currency = Currency.Credits;
        } else {
            // gift tank? then the price is the part between the parentheses
            if (cost.toLowerCase().contains("gift")) {
                tank.gift = true;
                tank.cost = format.parse(cost.substring(cost.indexOf('(') + 1, cost.indexOf(')'))).intValue();
            } else {
                tank.cost = format.parse(cost).intValue();
            }
            // currency: taken from the alt text of the image in the cost cell
            tank.currency = BaseProperties.Currency.parse(evaluateXPath(context2, ".//th[text() = \"Cost\"]/following-sibling::td/img/@alt").get(0).getTextContent());
        }
        
        // crew: counted as the number of <br> elements in the row after the "Crew" header, plus one
        tank.crewMembers = (byte) (evaluateXPath(context2, ".//th[text() = \"Crew\"]/../following-sibling::tr[1]//br").size() + 1);
        
        // speed (new format, e.g.: 58/20 km/h) -> only the value before the '/' is used
        String speed = firstCellAfterHeader(context2, "Speed Limit").split(" ")[0];
        tank.speed = Double.parseDouble(speed.contains("/") ? speed.substring(0, speed.indexOf('/')) : speed);
        
        // hull: front/side/rear, the last value may be followed by more text after a space
        String[] hull = firstCellAfterHeader(context2, "Hull Armor").split("/");
        tank.hullFront = Double.parseDouble(hull[0]);
        tank.hullSide = Double.parseDouble(hull[1]);
        tank.hullRear = Double.parseDouble(hull[2].split(" ")[0]);
        
        // gunarc: either a single value or left/right; a '?' leads to 0
        String[] gunarc = firstCellAfterHeader(context2, "Gun Arc").split("/");
        if(gunarc.length == 1) {
            // usually 360 here, so make it 0-360
            tank.gunArcLeft  = 0;
            tank.gunArcRight = gunarc[0].contains("?") ? 0 : Double.parseDouble(gunarc[0]);
        } else {
            if(gunarc[0].contains("?") || gunarc[1].contains("?")) {
                tank.gunArcLeft = tank.gunArcRight = 0;
            } else {
                tank.gunArcLeft  = Double.parseDouble(gunarc[0]);
                tank.gunArcRight = Double.parseDouble(gunarc[1]);
            }
        }
        
        // equipment that depends on the development state (stock / top)
        tank.equipmentStock = crawlTankEquipment(context2, BaseProperties.Development.Stock);
        tank.equipmentTop =   crawlTankEquipment(context2, BaseProperties.Development.Top);
        
        // read parent and children names (link titles), link them later
        String base = ".//tr/th[text() = \"%s\"]/following-sibling::td[1]//a/@title";
        
        List<Node> parents = evaluateXPath(context, String.format(base, "Parent"));
        for (Node node : parents) {
            tank.addParentName(node.getTextContent());
        }
        List<Node> children = evaluateXPath(context, String.format(base, "Child"));
        for (Node node : children) {
            tank.addChildName(node.getTextContent());
        }
        
        return tank;
    }
    
    /**
     * Reads the additional equipment values that exist once per development
     * state (stock or top): hit points, weight and weight limit, gun
     * elevation and view range.
     *
     * @param context the context from where the details are read
     * @param dev the development (stock or top)
     * @return all the details of the given development
     * @throws NumberFormatException if a weight or elevation value is not a
     * plain number
     * @throws ParseException if hit points or view range cannot be parsed
     */
    protected Equipment crawlTankEquipment(Node context, BaseProperties.Development dev) throws NumberFormatException, ParseException {
        Equipment equip = new Equipment();
        equip.development = dev;

        // hitpoints
        equip.hitpoints = format.parse(firstCellAfterHeader(context, "Hit Points", dev)).intValue();

        // weight and limit
        String[] weightLoad = firstCellAfterHeader(context, "Weight Limit", dev).split("/");
        equip.weight = Double.parseDouble(weightLoad[0]);
        if (weightLoad.length > 1) {
            equip.weightLimit = Double.parseDouble(weightLoad[1]);
        } else {
            // if no second value is given, set equal...
            equip.weightLimit = equip.weight;
        }

        // elevation: only taken over when both values are present and known;
        // a cell without '/' (e.g. a single "?") leaves the fields untouched
        // instead of failing on the missing second value
        String[] elevation = firstCellAfterHeader(context, "Elevation Arc", dev).split("/");
        if (elevation.length > 1 && !elevation[0].contains("?") && !elevation[1].contains("?")) {
            equip.gunElevationLow = Double.parseDouble(elevation[0].replace("--", "-"));       // fix for bug in wiki...
            equip.gunElevationHigh = Double.parseDouble(elevation[1]);
        }

        // view
        String view = firstXPathTextResult(context, String.format(".//tr/th[text() = \"View Range\"]/following-sibling::td/span[@class=\"%s\"]/div/text()", dev), "0");
        equip.viewRange = format.parse(view).doubleValue();

        if (equip.viewRange < 1) {   // use old method for unrealistic values
            equip.viewRange = format.parse(firstCellAfterHeader(context, "View Range", dev)).doubleValue();
        }

        return equip;
    }
    
    /**
     * Adds the parent and child relations to the given tanks by resolving the
     * previously collected parent/child names through the tank mapping.
     * The tanks are modified in place, so nothing is returned.
     * <p>
     * Names that are not in the mapping (e.g. tanks that were skipped or
     * whose page could not be crawled) are logged and ignored, so that no
     * null reference ends up as a relation.
     *
     * @param tanks the tanks whose relations shall be linked
     */
    protected void linkTankRelations(List<Tank> tanks) {

        for (Tank tank : tanks) {
            for (String parentName : tank.parentNames) {
                Tank parent = tankMapping.get(parentName);
                if (parent != null) {
                    tank.addParent(parent);
                } else {
                    log.warn("Unknown parent '{}' referenced by tank '{}'", parentName, tank.name);
                }
            }
            for (String childName : tank.childrenNames) {
                Tank child = tankMapping.get(childName);
                if (child != null) {
                    tank.addChild(child);
                } else {
                    log.warn("Unknown child '{}' referenced by tank '{}'", childName, tank.name);
                }
            }
        }

    }
    
    // ----------- module details -----------
    
    /**
     * Crawls the modules of every type and bundles them in one Modules object.
     *
     * @return the Modules object holding the crawled engines, guns, radios,
     * suspensions and turrets
     */
    @SuppressWarnings("unchecked")
    public Modules crawlModules() {
        Modules collected = new Modules();

        // one crawl per module type; the order also fixes the console output
        collected.engines = crawlModuleType(ModuleType.Engine);
        collected.guns = crawlModuleType(ModuleType.Gun);
        collected.radios = crawlModuleType(ModuleType.Radio);
        collected.suspensions = crawlModuleType(ModuleType.Suspension);
        collected.turrets = crawlModuleType(ModuleType.Turret);

        return collected;
    }
    
    /**
     * Returns a list of all Modules of the given type. As only Modules of the
     * specified type are crawled, the resulting list could be as well assigned
     * to variables that are typed in the corresponding subclass.
     * Due to restrictions in java generics, this problem cannot be resolved
     * on compile-time...
     * <p>
     * If the module type has a single overview page, all module lists on that
     * page are crawled. Otherwise every page (one per nation) is crawled on
     * its own: a page that cannot be read or parsed is logged and skipped,
     * the remaining pages are still processed.
     *
     * @param type the type of Module to crawl
     * @return list of all modules of the specified type that could be crawled
     */
    @SuppressWarnings("rawtypes")
    protected List crawlModuleType(ModuleType type) {
        System.out.println("- Crawling Modules: " + type.getOverviewPage());

        List<Module> modules = new ArrayList<Module>(100);
        List<URL> source = getModuleOverviewURLs(type);

        if (source.size() == 1) {

            try {
                System.out.println("  * Retrieving... ");

                Document moduleSite = Parser.parseHTML(buildURL(type.getOverviewPage()));
                Node context = firstXPathResult(moduleSite, "//div[@class=\"mw-content-ltr\"]");

                System.out.print("  * Parsing... ");

                // the single page holds one module list per nation
                List<Node> perNation = evaluateXPath(context, ".//div[@class = \"ModuleList\"]");
                for (Node node : perNation) {
                    crawlModuleNation(type, modules, node);
                }
            } catch (IOException ex) {
                log.error("Crawling of modules failed for " + type.getOverviewPage(), ex);
            } catch (ParserConfigurationException ex) {
                log.error("Crawling of modules failed for " + type.getOverviewPage(), ex);
            } catch (SAXException ex) {
                log.error("Crawling of modules failed for " + type.getOverviewPage(), ex);
            }

        } else {

            System.out.print("  * Processing... ");
            for (URL url : source) {
                // handle failures per page, so that one missing or broken
                // nation page does not discard the pages after it
                try {
                    Document moduleSite = Parser.parseHTML(url);
                    Node context = firstXPathResult(moduleSite, "//div[@class = \"ModuleList\"]");
                    crawlModuleNation(type, modules, context);
                } catch (IOException ex) {
                    log.error("Crawling of modules failed for page " + url, ex);
                } catch (ParserConfigurationException ex) {
                    log.error("Crawling of modules failed for page " + url, ex);
                } catch (SAXException ex) {
                    log.error("Crawling of modules failed for page " + url, ex);
                }
            }

        }

        System.out.println("done.");
        return modules;
    }
    
    /**
     * Crawls all Modules from the given context node and associates them with the
     * nation that is found at the context node.
     * Adds all modules to the given (mutable) list, so no return value is required.
     * <p>
     * Crawling is best-effort per table row: a row whose values cannot be
     * parsed, or that has fewer cells than expected, is logged and skipped.
     *
     * @param type the type of module to crawl
     * @param modules mutable list of modules. results will be added to this
     * @param context from this context node the search is starting
     */
    protected void crawlModuleNation(ModuleType type, List<Module> modules, Node context) {
        String nationString = "";
        try {
            nationString = firstXPathResult(context, "./h3/span/@id").getTextContent();
        } catch (NullPointerException ex) {
            // ignore, for some entries there is no nation defined...
            // NOTE(review): presumably firstXPathResult yields null when nothing
            // matches; an explicit null check would be cleaner - confirm against the helper
        }

        Nation nation = Nation.parseAdvanced(nationString);
        System.out.print(nation + ".. ");

        // only plain data rows: rows carrying any attribute are excluded
        List<Node> rows = evaluateXPath(context, "./table/tbody/tr[not(@*)]");
        for (Node row : rows) {
            try {
                List<Node> cells = evaluateXPath(row, "./td");

                // switching on each row sucks ass, but doing it before makes
                // code massively redundant...
                switch(type) {
                    case Engine: modules.add(crawlSingleEngine(cells, nation)); break;
                    case Gun: modules.add(crawlSingleGun(cells, nation)); break;
                    case Radio: modules.add(crawlSingleRadio(cells, nation)); break;
                    case Suspension: modules.add(crawlSingleSuspension(cells, nation)); break;
                    case Turret: modules.add(crawlSingleTurret(cells, nation)); break;
                    default: log.warn("Unrecognized Module Type: {}", type.toString());
                }
            } catch (ParseException ex) {
                log.error("Crawling of module details failed", ex);
            } catch (IllegalAccessException ex) {
                log.error("Crawling of module details failed", ex);
            } catch (NumberFormatException ex) {
                // raised by Integer.parseInt / Double.parseDouble in the single-module parsers
                log.error("Crawling of module details failed", ex);
            } catch (IndexOutOfBoundsException ex) {
                // row has fewer cells than the single-module parser reads
                log.error("Crawling of module details failed", ex);
            }
        }

    }
    
    // single modules
    
    /**
     * Builds an Engine from one row of an engine overview table.
     *
     * @param cells the cells of the row containing the details of this engine
     * @param nation the nation this engine belongs to
     * @return the engine object filled from the row
     * @throws ParseException if a number cannot be parsed
     * @throws IllegalAccessException if an enum cannot be parsed
     */
    protected Engine crawlSingleEngine(List<Node> cells, Nation nation) throws ParseException, IllegalAccessException {
        Engine engine = new Engine();

        engine.nation = nation;
        engine.tier = parseTier(cells, 1);
        engine.wikiURL = parseModuleUrl(ModuleType.Engine, cells);
        engine.name = parseStringBold(cells, 2);
        engine.power = parseInt(cells, 3);
        engine.firechance = parseInt(cells, 4);

        // fuel type: "--" means no value, which is stored as null
        String gasText = parseString(cells, 5);
        if (gasText.equals("--")) {
            engine.gas = null;
        } else {
            engine.gas = Gas.parse(gasText);
        }

        // cost is written into the engine by the helper
        parseCost(cells, 6, engine);

        // weight: "--" means no value, which is stored as 0
        String weightText = parseString(cells, 7);
        double weightValue = weightText.equals("--") ? 0 : format.parse(weightText).doubleValue();
        engine.weight = weightValue;

        engine.compatibility = parseCompatibility(cells, 8);

        return engine;
    }
    
    /**
     * Builds a Gun from one row of a gun overview table. Values given as a
     * range ("min-max") are stored as separate minimum and maximum; damage and
     * penetration are read per ammunition type and default to 0 when absent.
     *
     * @param cells the cells of the row containing the details of this gun
     * @param nation the nation this gun belongs to
     * @return the gun object filled from the row
     * @throws ParseException if a number cannot be parsed
     */
    protected Gun crawlSingleGun(List<Node> cells, Nation nation) throws ParseException {
        Gun gun = new Gun();

        gun.nation = nation;
        gun.tier = parseTier(cells, 1);
        gun.wikiURL = parseModuleUrl(ModuleType.Gun, cells);
        gun.name = parseStringBold(cells, 2);

        // ammo capacity: "?" marks an unknown bound, stored as -1
        String ammoText = parseStringBold(cells, 3);
        Tuple<String,String> ammoRange = resolveStringTuple(ammoText, "-");
        int ammoMin = ammoRange.fst().equals("?") ? -1 : Integer.parseInt(ammoRange.fst());
        gun.ammoCapacityMin = ammoMin;
        int ammoMax = ammoRange.snd().equals("?") ? -1 : Integer.parseInt(ammoRange.snd());
        gun.ammoCapacityMax = ammoMax;

        // damage per ammunition type
        Node damageCell = cells.get(3);
        gun.dmgAP = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoAP\"]", "0")).intValue();
        gun.dmgAPCR = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoAPCR\"]", "0")).intValue();
        gun.dmgHE = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoHE\"]", "0")).intValue();
        gun.dmgHEAT = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoHEAT\"]", "0")).intValue();

        // penetration per ammunition type
        Node penetrationCell = cells.get(4);
        gun.penAP = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoAP\"]", "0")).intValue();
        gun.penAPCR = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoAPCR\"]", "0")).intValue();
        gun.penHE = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoHE\"]", "0")).intValue();
        gun.penHEAT = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoHEAT\"]", "0")).intValue();

        // fire rate
        String fireRateText = parseString(cells, 6);
        Tuple<String,String> fireRateRange = resolveStringTuple(fireRateText, "-");
        gun.fireRateMin = Double.parseDouble(fireRateRange.fst());
        gun.fireRateMax = Double.parseDouble(fireRateRange.snd());

        // accuracy
        String accuracyText = parseString(cells, 7);
        Tuple<String,String> accuracyRange = resolveStringTuple(accuracyText, "-");
        gun.accuracyMin = Double.parseDouble(accuracyRange.fst());
        gun.accuracyMax = Double.parseDouble(accuracyRange.snd());

        // aim time
        String aimTimeText = parseString(cells, 8);
        Tuple<String,String> aimTimeRange = resolveStringTuple(aimTimeText, "-");
        gun.aimTimeMin = Double.parseDouble(aimTimeRange.fst());
        gun.aimTimeMax = Double.parseDouble(aimTimeRange.snd());

        // cost is written into the gun by the helper
        parseCost(cells, 9, gun);

        // weight: "--" means no value, which is stored as 0
        String weightText = parseString(cells, 10);
        double weightValue = weightText.equals("--") ? 0 : format.parse(weightText).doubleValue();
        gun.weight = weightValue;

        gun.compatibility = parseCompatibility(cells, 11);

        return gun;
    }
    
    /**
     * Builds a Radio from one row of a radio overview table.
     *
     * @param cells the cells of the row containing the details of this radio
     * @param nation the nation this radio belongs to
     * @return the radio object filled from the row
     * @throws ParseException if a number cannot be parsed
     */
    protected Radio crawlSingleRadio(List<Node> cells, Nation nation) throws ParseException {
        Radio radio = new Radio();

        radio.nation = nation;
        radio.tier = parseTier(cells, 1);
        radio.wikiURL = parseModuleUrl(ModuleType.Radio, cells);
        radio.name = parseStringBold(cells, 2);
        radio.range = parseInt(cells, 3);

        // cost is written into the radio by the helper
        parseCost(cells, 4, radio);

        // weight: "--" means no value, which is stored as 0
        String weightText = parseString(cells, 5);
        double weightValue = weightText.equals("--") ? 0 : format.parse(weightText).doubleValue();
        radio.weight = weightValue;

        radio.compatibility = parseCompatibility(cells, 6);

        return radio;
    }
    
    /**
     * Builds a {@link Suspension} from one row of a suspension table.
     * Cell positions (starting by 1): tier (1), name (2), load (3),
     * traverse (4), cost (5), weight (6), compatible tanks (7).
     * @param cells the cells of the row containing the details of this suspension
     * @param nation the nation this suspension belongs to
     * @return the Suspension object filled with the values of the row
     * @throws ParseException if a number cannot be parsed
     */
    protected Suspension crawlSingleSuspension(List<Node> cells, Nation nation) throws ParseException {
        final Suspension suspension = new Suspension();
        suspension.nation = nation;

        suspension.tier = parseTier(cells, 1);
        suspension.wikiURL = parseModuleUrl(ModuleType.Suspension, cells);
        suspension.name = parseStringBold(cells, 2);
        suspension.load = parseDouble(cells, 3);
        suspension.traverse = parseInt(cells, 4);
        parseCost(cells, 5, suspension);

        // a weight given as "--" is stored as 0
        final String weightText = parseString(cells, 6);
        if (weightText.equals("--")) {
            suspension.weight = 0.0;
        } else {
            suspension.weight = format.parse(weightText).doubleValue();
        }

        suspension.compatibility = parseCompatibility(cells, 7);
        return suspension;
    }
    
    /**
     * Builds a {@link Turret} from one row of a turret table.
     * Cell positions (starting by 1): tier (1), name (2), armor as
     * "front/side/rear" (3), traverse (4), view range (5), cost (6),
     * weight (7), compatible tanks (8).
     * @param cells the cells of the row containing the details of this turret
     * @param nation the nation this turret belongs to
     * @return the Turret object filled with the values of the row
     * @throws ParseException if a number cannot be parsed
     */
    protected Turret crawlSingleTurret(List<Node> cells, Nation nation) throws ParseException {
        final Turret turret = new Turret();
        turret.nation = nation;

        turret.tier = parseTier(cells, 1);
        turret.wikiURL = parseModuleUrl(ModuleType.Turret, cells);
        turret.name = parseStringBold(cells, 2);

        // armor: exactly three values separated by "/" are required,
        // otherwise the armor fields are left untouched
        final String[] armorParts = parseString(cells, 3).split("/");
        if (armorParts.length != 3) {
            log.warn("Error parsing armor: not enough values");
        } else {
            turret.armorFront = Double.parseDouble(armorParts[0]);
            turret.armorSide = Double.parseDouble(armorParts[1]);
            turret.armorRear = Double.parseDouble(armorParts[2]);
        }

        turret.traverse = parseDouble(cells, 4);
        turret.viewRange = parseDouble(cells, 5);
        parseCost(cells, 6, turret);

        // a weight given as "--" is stored as 0
        final String weightText = parseString(cells, 7);
        if (weightText.equals("--")) {
            turret.weight = 0.0;
        } else {
            turret.weight = format.parse(weightText).doubleValue();
        }

        turret.compatibility = parseCompatibility(cells, 8);
        return turret;
    }
    
    // ----------- generic parsers -----------
    
    /**
     * Parses the tier of an object.
     * This is a generic roman number parser: the text found below
     * {@code ./span/b} of the selected cell is handed to
     * {@link Conversion#romanToDecimal} and the result is narrowed to a byte.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the converted roman number (as byte)
     */
    protected byte parseTier(List<Node> cells, int cell) {
        final Node context = getRelevantContext(cells, cell);
        final String roman = allXPathTextResult(context, "./span/b/text()");
        return (byte) Conversion.romanToDecimal(roman);
    }
    
    /**
     * Reads the direct text content of a given cell.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the combined, trimmed text nodes of the selected cell
     */
    protected String parseString(List<Node> cells, int cell) {
        final Node context = getRelevantContext(cells, cell);
        return allXPathTextResult(context, "./text()");
    }
    
    /**
     * Reads the text enclosed in {@code <b>} tags of a given cell.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the combined, trimmed text (without the {@code <b>} tags) of the selected cell
     */
    protected String parseStringBold(List<Node> cells, int cell) {
        final Node context = getRelevantContext(cells, cell);
        return allXPathTextResult(context, "./b/text()");
    }
    
    /**
     * Reads the text of a given cell and converts it to an integer using the
     * number format of this class.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the integer value of the selected cell
     * @throws ParseException if the text cannot be parsed as a number
     */
    protected int parseInt(List<Node> cells, int cell) throws ParseException {
        final Node context = getRelevantContext(cells, cell);
        final String text = allXPathTextResult(context, "./text()");
        return format.parse(text).intValue();
    }
    
    /**
     * Reads the text of a given cell and converts it to a double using the
     * number format of this class.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the double value of the selected cell
     * @throws ParseException if the text cannot be parsed as a number
     */
    protected double parseDouble(List<Node> cells, int cell) throws ParseException {
        final Node context = getRelevantContext(cells, cell);
        final String text = allXPathTextResult(context, "./text()");
        return format.parse(text).doubleValue();
    }
    
    /**
     * Parses the cost and currency of a module and writes them directly in the
     * given Module object (to avoid tuple return values...).
     * The currency is detected from the alt text of the image below
     * {@code ./span/img} of the cell:
     * <ul>
     * <li>contains "credit": the cell text is parsed as cost ("--" becomes 0)
     * and the currency is set to {@link Currency#Credits}</li>
     * <li>contains "premium": the cost is set to 0 and the currency to
     * {@link Currency#Premium}</li>
     * <li>anything else: a warning is logged and the module is not modified</li>
     * </ul>
     * @param cells the cells of the row containing the required values
     * @param cell the position (starting by 1) of the required values in the list of cells
     * @param mod the module that receives the parsed cost and currency
     * @throws ParseException if a number cannot be parsed
     */
    protected void parseCost(List<Node> cells, int cell, Module mod) throws ParseException {
        Node context = getRelevantContext(cells, cell);

        // Lower-case with a fixed locale: with the JVM default locale (e.g.
        // Turkish, where "I" becomes a dotless "ı") the contains() checks
        // below could fail for otherwise valid alt texts.
        String currency = firstXPathTextResult(context, "./span/img/@alt")
                .toLowerCase(java.util.Locale.ENGLISH);

        if (currency.contains("credit")) {
            String cost = allXPathTextResult(context, "./text()");
            mod.cost = (cost.equals("--")) ? 0 : format.parse(cost).intValue();
            mod.currency = Currency.Credits;
        } else if (currency.contains("premium")) {
            mod.cost = 0;
            mod.currency = Currency.Premium;
        } else {
            log.warn("error parsing cost, currency {} not recognized.", currency);
        }
    }
    
    /**
     * Parses the id of the selected module and generates a wikilink from it.
     * The link has the form {@code <type>#<id>}, where the id is taken from the
     * first {@code div/@id} found in the second cell of the row; if no id is
     * found, the part after the '#' is empty.
     * @param type the type of module that is requested
     * @param cells the cells of the row containing the details of this module
     * @return the wikilink to the module
     */
    protected String parseModuleUrl(ModuleType type, List<Node> cells) {
        final String page = type.toString();
        final String anchor = firstXPathTextResult(cells.get(1), ".//div/@id", "");
        return page + "#" + anchor;
    }
    
    /**
     * Parses the list of tanks that are compatible to a module.
     * Every {@code title} attribute of a link directly inside the selected
     * cell is resolved to a tank reference via {@link #getTankRefByUrl}.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the list of tank references, in the order of the links
     */
    protected List<TankRef> parseCompatibility(List<Node> cells, int cell) {
        final Node context = getRelevantContext(cells, cell);
        final List<Node> titles = evaluateXPath(context, "./a/@title");

        final int count = titles.size();
        final List<TankRef> references = new ArrayList<TankRef>(count);
        for (int i = 0; i < count; i++) {
            final String title = titles.get(i).getTextContent();
            references.add(getTankRefByUrl(title));
        }
        return references;
    }
    
    // parser helper
    
    /**
     * Gets the context Node of a specific cell from a list of cells. If the
     * cell has a direct {@code <center>} child, that child is returned, so
     * that queries run on its contents; otherwise the cell itself is returned.
     * @param cells the cells of the row containing the required value
     * @param cell the position (starting by 1) of the required value in the list of cells
     * @return the relevant context node at the specified position
     */
    private Node getRelevantContext(List<Node> cells, int cell) {
        final Node rawCell = cells.get(cell - 1);
        // try the <center> child first, fall back to the cell itself
        return firstXPathResult(rawCell, "./center", ".");
    }
    
    /**
     * Separates an input string using the specified delimiter and returns
     * a tuple of the first two results of this split. If there is no second
     * value, the first value will be copied to the second position instead.
     * The delimiter is always treated as literal text, never as a regular
     * expression. If the input contains no delimiter, or consists of
     * delimiters only (e.g. "-"), the unmodified input is returned in both
     * positions.
     * @param input the input string
     * @param delimiter the delimiter to split the string (literal text)
     * @return a tuple containing the first two values of the split
     */
    protected Tuple<String, String> resolveStringTuple(String input, String delimiter) {
        if (!input.contains(delimiter)) {
            return Tuple.of(input, input);
        }

        // String.split() expects a regex while contains() works on literal
        // text; quoting keeps both in agreement for delimiters like "." or "|"
        String[] parts = input.split(java.util.regex.Pattern.quote(delimiter));

        if (parts.length == 0) {
            // split() drops trailing empty strings, so an input made up of
            // delimiters only results in an empty array
            return Tuple.of(input, input);
        }

        String first = parts[0];
        return Tuple.of(first, parts.length > 1 ? parts[1] : first);
    }
    
    /**
     * Returns a tank reference for the tank that is registered in
     * {@code tankMapping} under the given key.
     * NOTE(review): the result of the lookup is passed to the TankRef
     * constructor without any check; presumably a missing entry yields a
     * reference built from {@code null} - confirm against TankRef.
     * @param wikiURL the key to look up in the tank mapping (the callers in
     * this class pass the title attribute of a tank's wiki link)
     * @return a new tank reference wrapping the mapped value
     */
    protected TankRef getTankRefByUrl(String wikiURL) {
        return new TankRef(tankMapping.get(wikiURL));
    }
    
    
    
    // -------------------- XPath --------------------
    
    
    
    /**
     * The shared XPath factory, created once when the class is loaded.
     * It is used to obtain a new XPath object for every query, so there is
     * no need to initialize it more than once.
     */
    protected final static XPathFactory xpf = XPathFactory.newInstance();
    
    /**
     * Evaluates a given XPath query in the given context (e.g. a Node or
     * Element) and returns a (possibly empty) list of results.
     * If the expression cannot be compiled or evaluated, the error is logged
     * and an empty list is returned, so callers never receive {@code null}.
     * @param context the context where to start the query from. Use . to
     * address the context node.
     * @param expression the XPath expression as String
     * @return list of results (as Node-objects), never {@code null}
     */
    protected List<Node> evaluateXPath(Object context, String expression) {
        try {
            // evaluate xpath
            XPath xpath = xpf.newXPath();
            XPathExpression expr = xpath.compile(expression);
            NodeList result = (NodeList) expr.evaluate(context, XPathConstants.NODESET);

            // copy the NodeList into a regular list
            int count = result.getLength();
            List<Node> nodes = new ArrayList<Node>(count);
            for (int i = 0; i < count; i++) {
                nodes.add(result.item(i));
            }
            return nodes;
        } catch (XPathExpressionException ex) {
            log.error("Evaluating of XPath expression failed", ex);
            // keep the documented contract: no results instead of null, as
            // the callers access the returned list without a null check
            return new ArrayList<Node>(0);
        }
    }
    
    /**
     * Evaluates the given XPath expression and returns the first resulting node.
     * Warning: returns null if no results were found (this includes the case
     * that the evaluation itself did not deliver a result list).
     * @param context the context where to start the query from
     * @param expression the XPath expression as String
     * @return the first resulting Node, or null if there is none
     */
    protected Node firstXPathResult(Object context, String expression) {
        List<Node> results = evaluateXPath(context, expression);
        // an empty result is expected and handled explicitly instead of
        // relying on an IndexOutOfBoundsException
        if (results == null || results.isEmpty()) {
            return null;
        }
        return results.get(0);
    }
    
    /**
     * Evaluates the XPath expressions in their given order and returns the
     * first node of the first expression that yields any result.
     * Expressions without a result (or whose evaluation did not deliver a
     * result list) are skipped.
     * Warning: returns null if no results were found
     * @param context the context where to start the query from
     * @param expressions the XPath expressions as Strings, in order of priority
     * @return the first resulting Node, or null if no expression matched
     */
    protected Node firstXPathResult(Object context, String... expressions) {
        for (String expr : expressions) {
            List<Node> results = evaluateXPath(context, expr);
            // a null list must not abort the search for a fallback expression
            if (results != null && !results.isEmpty()) {
                return results.get(0);
            }
        }
        return null;
    }
    
    /**
     * Returns the full textual representation of the first resulting Node,
     * including the text contents of all subnodes.
     * Note: the first result is dereferenced without a check, so a query
     * without results ends in a NullPointerException.
     * @param context the context where to start the query from
     * @param expression the XPath expression as String
     * @return all text contents of the resulting node + subnodes. Trimmed.
     */
    protected String firstXPathTextResult(Object context, String expression) {
        final Node match = firstXPathResult(context, expression);
        final String text = match.getTextContent();
        return text.trim();
    }
    
    /**
     * Returns the full textual representation of the first resulting Node,
     * including the text contents of all subnodes.
     * If the lookup ends in a NullPointerException (which is what happens
     * when there is no result), the value in returnOnError is returned.
     * @param context the context where to start the query from
     * @param expression the XPath expression as String
     * @param returnOnError the value to return if no result is found
     * @return all text contents of the resulting node + subnodes. Trimmed.
     * (or the backup value, in case no result is found)
     */
    protected String firstXPathTextResult(Object context, String expression, String returnOnError) {
        String text;
        try {
            text = firstXPathTextResult(context, expression);
        } catch (NullPointerException ex) {
            // no result available: use the backup value
            text = returnOnError;
        }
        return text;
    }
    
    /**
     * Returns the combined textual representation of all resulting Nodes.
     * Good if you want to append several text() Nodes that are split by
     * other non-text nodes, like
     * {@code <p>Hello, <br/>Mike!</p>}
     * @param context the context where to start the query from
     * @param expression the XPath expression as String
     * @return the text contents of all resulting nodes, concatenated in
     * result order without any separator. Result is trimmed.
     */
    protected String allXPathTextResult(Object context, String expression) {
        final List<Node> matches = evaluateXPath(context, expression);
        final int count = matches.size();

        final StringBuilder combined = new StringBuilder();
        for (int i = 0; i < count; i++) {
            combined.append(matches.get(i).getTextContent());
        }
        return combined.toString().trim();
    }
    
    /**
     * Special construct: Searches a {@code <th>} with text content == head,
     * picks its first following sibling {@code <td>} and returns the text
     * content of that cell.
     * @param context the context where to start the query from
     * @param head the name of the table header to search
     * @return the information associated with this header
     */
    protected String firstCellAfterHeader(Object context, String head) {
        final String template = ".//tr/th[text() = \"%s\"]/following-sibling::td[1]/text()";
        final String query = String.format(template, head);
        return allXPathTextResult(context, query);
    }
    
    /**
     * Special construct: Searches a {@code <th>} with text content == head,
     * looks at its following sibling {@code <td>} elements and returns the
     * text content of their inner spans with class == dev.
     * @param context the context where to start the query from
     * @param head the name of the table header to search
     * @param dev the development to retrieve (stock or top)
     * @return the information associated with this header as stock or top
     */
    protected String firstCellAfterHeader(Object context, String head, BaseProperties.Development dev) {
        final String template = ".//tr/th[text() = \"%s\"]/following-sibling::td/span[@class=\"%s\"]/text()";
        final String query = String.format(template, head, dev);
        return allXPathTextResult(context, query);
    }
    
    
    
    // -------------------- Helpers --------------------
    
    
    /**
     * Builds the URL of the specified wiki page name, according to the source
     * setting of this class: for {@link Source#FILE} a file-URL pointing to
     * the local copy of the page, for {@link Source#URL} a http-URL pointing
     * to the wot wiki. Any other value is logged as a warning.
     * @param siteName the name of the site in the wot wiki
     * @return the url of this site (file or http), or null if the source
     * setting is not recognized
     * @throws MalformedURLException if there is something heavily wrong with the
     * site name (illegal characters and stuff...)
     */
    protected URL buildURL(String siteName) throws MalformedURLException {
        if (src == Source.FILE) {
            final File localCopy = findLocalFile(siteName);
            return localCopy.toURI().toURL();
        }
        if (src == Source.URL) {
            return buildWikiLink(siteName);
        }
        log.warn("Unknown enum value: " + src.toString());
        return null;
    }
    
    /**
     * Searches the configured local parent folders, in their given order, for
     * the file that belongs to the given site name and returns the first hit.
     * @param siteName the name of the site in the wot wiki
     * @return the local copy of the wiki-page
     * @throws RuntimeException if none of the folders contains the file
     */
    protected File findLocalFile(String siteName) {
        final String fileName = siteToFileName(siteName);
        for (File folder : localParents) {
            final File candidate = new File(folder, fileName);
            if (!candidate.exists()) {
                continue;
            }
            return candidate;
        }
        throw new RuntimeException("File for " + siteName + " was not found");
    }
    
    /**
     * Decides if there is a local copy of a wiki page for the given page name
     * in any of the configured local parent folders.
     * @param siteName the name of the site in the wot wiki
     * @return true, if the local copy exists
     */
    protected boolean existsLocalFile(String siteName) {
        final String fileName = siteToFileName(siteName);
        boolean found = false;
        for (File folder : localParents) {
            if (new File(folder, fileName).exists()) {
                found = true;
                break;
            }
        }
        return found;
    }
    
    /**
     * Builds the URL of the specified wiki page name by appending it, separated
     * by a slash, to the base URL of the crawler.
     * @param siteName the name of the site in the wot wiki
     * @return the url of this site
     * @throws MalformedURLException if there is something heavily wrong with the
     * site name (illegal characters and stuff...)
     */
    public static URL buildWikiLink(String siteName) throws MalformedURLException {
        final String address = String.format("%s/%s", Crawler.baseURL, siteName);
        return new URL(address);
    }
    
    /**
     * Generates a tank ID that is meant to conform to the rules of NCName, as
     * defined by the W3C: http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
     * (the NCName restriction is required for ID and IDREF values).
     * The name is cleaned like a file name, then spaces, parentheses and dots
     * are removed and an underscore is put in front.
     * @param name the name of the tank
     * @return the ID generated from this name
     */
    protected static String generateTankID(String name) {
        String id = cleanFileName(name);
        final String[] unwanted = {" ", "(", ")", "."};
        for (int i = 0; i < unwanted.length; i++) {
            id = id.replace(unwanted[i], "");
        }
        return "_" + id;
    }
    
    /**
     * Replaces the characters of the site name that would make it illegal to
     * store in a filesystem (like / or \) and appends the suffix ".html".
     * @param siteName the name of the site
     * @return the filename generated from the site name
     */
    public static String siteToFileName(String siteName) {
        final StringBuilder fileName = new StringBuilder();
        fileName.append(cleanFileName(siteName));
        fileName.append(".html");
        return fileName.toString();
    }
    
    /**
     * A set of illegal chars in filenames (unified list of unix and windows
     * chars), stored as character codes in ascending order.
     */
    final static int[] illegalChars = {
        // control characters 0 - 31
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        // printable characters
        '"', '*', '/', ':', '<', '>', '?', '\\', '|'
    };
    static {
        // binarySearch requires a sorted array; sorting here guarantees that,
        // even if the list above gets edited
        Arrays.sort(illegalChars);
    }
    
    /**
     * Replaces every illegal char of a filename by '_' (at its original
     * index); all other chars are kept as they are.
     * @param badFileName the old (possibly faulty) filename
     * @return a possibly modified version of the file name, with the same
     * length as the input
     */
    public static String cleanFileName(String badFileName) {
        final char[] chars = badFileName.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            final boolean illegal = Arrays.binarySearch(illegalChars, chars[i]) >= 0;
            if (illegal) {
                chars[i] = '_';
            }
        }
        return new String(chars);
    }
    
    /**
     * Decides if a crawler receives its pages from the local filesystem or
     * directly from the web.
     */
    enum Source {
        /** pages are resolved to local files (see findLocalFile) */
        FILE,
        /** pages are resolved to wiki links (see buildWikiLink) */
        URL;
    }
    
}