/*
* Copyright (C) 2012 Sebastian Straub <sebastian-straub@gmx.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.nx42.wotcrawler.xml;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import de.nx42.wotcrawler.db.BaseProperties;
import de.nx42.wotcrawler.db.BaseProperties.Currency;
import de.nx42.wotcrawler.db.BaseProperties.Nation;
import de.nx42.wotcrawler.db.Modules;
import de.nx42.wotcrawler.db.TanksDB;
import de.nx42.wotcrawler.db.module.Engine;
import de.nx42.wotcrawler.db.module.Engine.Gas;
import de.nx42.wotcrawler.db.module.Gun;
import de.nx42.wotcrawler.db.module.Module;
import de.nx42.wotcrawler.db.module.Module.ModuleType;
import de.nx42.wotcrawler.db.module.Radio;
import de.nx42.wotcrawler.db.module.Suspension;
import de.nx42.wotcrawler.db.module.Turret;
import de.nx42.wotcrawler.db.tank.Equipment;
import de.nx42.wotcrawler.db.tank.Tank;
import de.nx42.wotcrawler.db.tank.Tank.TankType;
import de.nx42.wotcrawler.db.tank.TankRef;
import de.nx42.wotcrawler.util.Conversion;
import de.nx42.wotcrawler.util.Download;
import de.nx42.wotcrawler.util.Tuple;
/**
* This is the database crawler. It retrieves Information from the WoT Wiki
* pages and stores them in the TanksDB.
*
* @author Sebastian Straub <sebastian-straub@gmx.net>
*/
public class Crawler {
/** Logger used for all error and warning output of the crawler */
private static final Logger log = LoggerFactory.getLogger(Crawler.class);
/** Base URL of the site to retrieve details from */
public final static String baseURL = "http://wiki.worldoftanks.eu";
/**
* US number format (to recognize values like "100,000.00" correctly).
* NOTE(review): NumberFormat instances are documented as not synchronized;
* sharing this static instance presumably relies on single-threaded
* crawling - confirm.
*/
protected final static NumberFormat format = NumberFormat.getInstance(Locale.US);
/** Decides whether this crawler receives its pages from the local file system or directly from the web */
protected Source src;
/** If the local file system is used as source, this is the folder where all the pages are */
protected String localFolder;
/** Common subfolders that might be used for different types of data (only set by the local-folder constructor) */
protected File[] localParents;
/** The tanks database that is filled by this crawler */
protected TanksDB db = new TanksDB();
/** A mapping from the wiki URL of a tank (as stored in Tank.wikiURL) to the
* internal tank object (used to create parent-child relations) */
protected Map<String,Tank> tankMapping = new HashMap<String, Tank>();
/**
 * Creates a crawler that reads the wiki pages directly from the web
 * (source type {@code Source.URL}).
 */
public Crawler() {
    src = Source.URL;
}
/**
* Crawls a previously created local copy of the relevant wiki pages
* (using the Download class)
* @param localFolder
*/
public Crawler(String localFolder) {
this.src = Source.FILE;
this.localFolder = localFolder;
this.localParents = new File[]{
new File(localFolder, Download.folderTanks),
new File(localFolder, Download.folderModules),
new File(localFolder, Download.folderLists),
new File(localFolder)
};
}
/**
 * Runs the complete crawl with the current crawler settings: collects the
 * tank URLs, crawls every tank detail page, links parents and children and
 * finally crawls all modules. Progress is printed to standard output.
 * @return the tank database of this crawler, filled with tanks and modules
 */
public TanksDB buildTankDB() {
    // step 1: find the detail page of every tank
    System.out.print("Retrieving tank URLs... ");
    final List<URL> tankPages = getTankURLs();
    System.out.println("done.");

    // step 2: read the details of each tank
    final String announcement = String.format("\nCrawling detail pages for %s tanks...", tankPages.size());
    System.out.println(announcement);
    final List<Tank> crawledTanks = crawlAllTankDetails(tankPages);

    // step 3: resolve the parent/child names collected in step 2
    System.out.print("\nCreating parent and child relations... ");
    linkTankRelations(crawledTanks);
    System.out.println("done.");
    db.tanks = crawledTanks;

    // step 4: modules
    System.out.println("\nCrawling Modules...");
    db.modules = crawlModules();

    return db;
}
// -------------------- URLs (file/http) --------------------
/**
 * Builds the URL (http or file, depending on the source) of the overview
 * list of every tank type. A type whose URL cannot be built is logged and
 * left out of the result.
 * @return the overview page URLs, one per tank type
 */
public List<URL> getTankOverviewURLs() {
    final List<URL> overviewPages = new LinkedList<URL>();
    final TankType[] allTypes = TankType.values();
    for (int i = 0; i < allTypes.length; i++) {
        try {
            final URL page = buildURL(allTypes[i].getOverviewPage());
            overviewPages.add(page);
        } catch (IOException e) {
            log.error("Error building URL", e);
        }
    }
    return overviewPages;
}
/**
 * Collects the overview list URLs (http or file) of all module types by
 * concatenating the per-type results.
 * @return the overview page URLs of every module type
 */
public List<URL> getModuleOverviewURLs() {
    final List<URL> collected = new LinkedList<URL>();
    final ModuleType[] types = ModuleType.values();
    for (int idx = 0; idx < types.length; idx++) {
        final List<URL> pagesOfType = getModuleOverviewURLs(types[idx]);
        collected.addAll(pagesOfType);
    }
    return collected;
}
/**
 * Builds the overview page URLs (http or file) for one module type.
 * The list of a module type may be split into one wiki page per nation;
 * this is detected by probing for the German subpage (local file lookup
 * or HTTP 404 check, depending on the source).
 * @param m the module type to crawl
 * @return one URL per nation if nation subpages exist, otherwise the single
 * overview page. An error while building a URL is logged and the URLs
 * collected up to that point are returned.
 */
protected List<URL> getModuleOverviewURLs(ModuleType m) {
    final List<URL> pages = new LinkedList<URL>();
    try {
        // probe the German subpage to find out whether the list is split by nation
        final String probe = m.getOverviewPage() + "/" + Nation.Germany;
        boolean splitByNation = false;
        if (src == Source.FILE) {
            splitByNation = existsLocalFile(probe);
        } else if (src == Source.URL) {
            splitByNation = !is404(buildURL(probe));
        }

        if (!splitByNation) {
            // single overview page for all nations
            pages.add(buildURL(m.getOverviewPage()));
        } else {
            // each nation has its own subpage for this module
            for (Nation nation : Nation.values()) {
                pages.add(buildURL(m.getOverviewPage() + "/" + nation));
            }
        }
    } catch (IOException e) {
        log.error("Error building URL", e);
    }
    return pages;
}
/**
 * Checks if a HTTP GET request for the given URL is answered with status
 * 404. Redirects are not followed, so the status of the requested URL
 * itself is evaluated.
 * <p>
 * The request is sent without a body. Enabling output and opening the
 * output stream (as done previously) makes the JDK's HTTP handler send a
 * POST instead of the configured GET, which is not what this check intends.
 * The connection is released in any case.
 * @param httpUrl the URL to check (must be a http/https URL)
 * @return true, if the webserver answers with 404 for this URL (no
 * redirection); false for any other status or if the connection fails
 * (the failure is logged)
 */
protected static boolean is404(URL httpUrl) {
    HttpURLConnection connection = null;
    try {
        connection = (HttpURLConnection) httpUrl.openConnection();
        connection.setRequestMethod("GET");
        connection.setInstanceFollowRedirects(false);
        // getResponseCode() implicitly connects and sends the request
        return connection.getResponseCode() == HttpURLConnection.HTTP_NOT_FOUND;
    } catch (IOException ex) {
        log.error("Error while connecting to server", ex);
        return false;
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
}
/**
 * Collects the URLs (http or file) of all tanks by parsing every tank
 * overview page and extracting the tank links from its content area.
 * An overview page that cannot be loaded or parsed is logged and skipped.
 * @return the tank URLs found on all overview pages
 */
public List<URL> getTankURLs() {
    final String contentArea = "//div[@class=\"mw-content-ltr\"]";
    final List<URL> tankPages = new LinkedList<URL>();
    for (URL listPage : getTankOverviewURLs()) {
        try {
            final Document listDocument = Parser.parseHTML(listPage);
            final Node content = firstXPathResult(listDocument, contentArea);
            final List<URL> linked = crawlTankURLs(content);
            tankPages.addAll(linked);
        } catch (IOException e) {
            log.error("", e);
        } catch (ParserConfigurationException e) {
            log.error("", e);
        } catch (SAXException e) {
            log.error("", e);
        }
    }
    return tankPages;
}
// -------------------- Crawlers --------------------
/**
 * Extracts the tank links from a tank overview page.
 * Every link found in a list item below the context node is used, except
 * links whose target contains ".png".
 * @param context the node that holds all tank lists (and not more, if
 * possible). Currently: div.mw-content-ltr
 * @return the URLs of the tanks linked on this page
 * @throws MalformedURLException if a link cannot be turned into a URL
 */
protected List<URL> crawlTankURLs(Node context) throws MalformedURLException {
    final List<URL> found = new LinkedList<URL>();
    for (Node href : evaluateXPath(context, ".//li/a/@href")) {
        // the first character of the link (the leading '/') is dropped
        final String relativeLink = href.getTextContent().substring(1);
        if (relativeLink.contains(".png")) {
            continue;
        }
        found.add(buildURL(relativeLink));
    }
    return found;
}
/**
 * Crawls the detail page of every given tank and registers each crawled
 * tank in the tank mapping (keyed by its wiki URL). Progress is printed
 * to standard output. Pages that do not describe a valid tank are
 * skipped; pages that fail to load or parse are logged and skipped.
 * @param urls the detail page URLs of all tanks
 * @return the successfully crawled tanks
 */
public List<Tank> crawlAllTankDetails(List<URL> urls) {
    final int total = urls.size();
    final List<Tank> crawled = new ArrayList<Tank>(total);
    int current = 0;
    for (URL page : urls) {
        final String path = page.getPath();
        // local files: keep only the file name and cut off its last five
        // characters (presumably the ".html" extension)
        final String name = (src == Source.FILE)
                ? path.substring(path.lastIndexOf('/') +1, path.length() -5)
                : path;
        final String failure = "Crawling of details failed for Tank " + name;
        current++;
        try {
            System.out.println(String.format("- Crawling details for tank %s/%s: %s", current, total, name));
            final Document page2 = Parser.parseHTML(page);
            if (!isValidTank(page2)) {
                continue;
            }
            final Node panel = firstXPathResult(page2, "//div[@id=\"Panel\" and @class=\"Tank\"]");
            final Tank tank = crawlTankDetails(panel);
            crawled.add(tank);
            tankMapping.put(tank.wikiURL, tank);
        } catch (ParseException e) {
            log.error(failure, e);
        } catch (IllegalAccessException e) {
            log.error(failure, e);
        } catch (IOException e) {
            log.error(failure, e);
        } catch (ParserConfigurationException e) {
            log.error(failure, e);
        } catch (SAXException e) {
            log.error(failure, e);
        } catch (NumberFormatException e) {
            log.error(failure, e);
        }
    }
    return crawled;
}
/**
 * Decides if the given document is a usable tank detail page. A page is
 * rejected (with a message on standard output) if it has no tank panel or
 * if one of the panel paragraphs says the tank was removed from the game.
 * @param wikiPage the detail page of a tank from the wot wiki
 * @return true, iff this page describes a valid tank
 */
protected boolean isValidTank(Document wikiPage) {
    // without the tank panel the wiki page does not exist / is empty
    final boolean hasPanel = !evaluateXPath(wikiPage, "//div[@id=\"Panel\" and @class=\"Tank\"]").isEmpty();
    if (!hasPanel) {
        System.out.println(" -> the page about this tank is empty");
        return false;
    }

    // the removal notice is searched in the paragraphs of the panel
    final boolean removed = isDeprecated(evaluateXPath(wikiPage, "//div[@id=\"Panel\"]/p"));
    if (removed) {
        System.out.println(" -> this tank was removed from the game");
    }
    return !removed;
}
/**
 * Checks if any of the given nodes carries the notice that the tank was
 * removed from the game.
 * @param candidates the nodes whose text content is searched
 * @return true, iff the text of at least one node contains
 * "removed from the game"
 */
private boolean isDeprecated(List<Node> candidates) {
    boolean removed = false;
    for (Node paragraph : candidates) {
        final String text = paragraph.getTextContent();
        if (text.contains("removed from the game")) {
            removed = true;
            break;
        }
    }
    return removed;
}
// ----------- tank details -----------
/**
 * Builds a tank object from the data panel of a tank's wiki page.
 * <p>
 * Reads, in this order: wiki URL, name, id, nation, type and tier; then,
 * from the second table of the panel, battle tiers, cost and currency,
 * crew size, speed, hull armor and gun arc; then the stock and top
 * equipment; finally the names of the parent and child tanks (these are
 * only stored as names here and linked to tank objects later).
 * @param context the div that contains all the tank data (class="Tank")
 * @return the filled tank object
 * @throws ParseException if a number cannot be parsed by the US number
 * format (watch out for additional text like units)
 * @throws IllegalAccessException declared for the enum parsers used here;
 * NOTE(review): presumably thrown when an enum value is not recognized -
 * confirm against the parse methods
 * @throws NumberFormatException if a value handed to Byte.parseByte or
 * Double.parseDouble is not a plain number
 */
protected Tank crawlTankDetails(Node context) throws ParseException, IllegalAccessException, NumberFormatException {
    final Tank result = new Tank();

    // --- basics ---
    result.wikiURL = firstXPathTextResult(context, "//h1[@id=\"firstHeading\"]");
    final String rawName = firstXPathTextResult(context, "./h3");
    result.name = rawName.replace("\u00a0",""); // strip non-breaking spaces
    result.id = generateTankID(result.name);
    result.nation = BaseProperties.Nation.parse(firstXPathTextResult(context, "./table[1]//td[1]"));
    result.type = TankType.parse(firstXPathTextResult(context, "./table[1]//td[2]"));
    final String tierCell = firstXPathTextResult(context, "./table[1]//td[3]");
    // the roman numeral is the second space-separated token of the cell
    result.tier = (byte) Conversion.romanToDecimal(tierCell.split(" ")[1]);

    // --- all remaining values come from the second table ---
    final Node details = evaluateXPath(context, "./table[2]/tbody").get(0);

    // battle tier: the first highlighted cell holds the minimum, the
    // number of highlighted cells gives the size of the range
    final String highlighted = ".//td[@style=\"background-color:#A29C84;\"]";
    result.battleTierMin = Byte.parseByte(firstXPathTextResult(details, highlighted));
    final int highlightedCells = evaluateXPath(details, highlighted).size();
    result.battleTierMax = (byte) (highlightedCells + result.battleTierMin - 1);

    // --- money ---
    final String cost = firstCellAfterHeader(details, "Cost");
    final String costLower = cost.toLowerCase();
    final boolean hasPrice = costLower.length() >= 3
            && !costLower.contains("error")
            && !costLower.contains("not available");
    if (hasPrice) {
        if (costLower.contains("gift")) {
            // gift tank: the price is the text between the parentheses
            result.gift = true;
            final int open = cost.indexOf('(');
            final int close = cost.indexOf(')');
            result.cost = format.parse(cost.substring(open + 1, close)).intValue();
        } else {
            result.cost = format.parse(cost).intValue();
        }
        // the currency is taken from the alt text of the icon in the cost cell
        final List<Node> currencyIcons = evaluateXPath(details, ".//th[text() = \"Cost\"]/following-sibling::td/img/@alt");
        result.currency = BaseProperties.Currency.parse(currencyIcons.get(0).getTextContent());
    } else {
        // no currency value -> regular for free tanks or tanks that are
        // not available for the masses
        result.cost = 0;
        result.currency = Currency.Credits;
    }

    // --- crew: one member per line, i.e. number of line breaks + 1 ---
    final int lineBreaks = evaluateXPath(details, ".//th[text() = \"Crew\"]/../following-sibling::tr[1]//br").size();
    result.crewMembers = (byte) (lineBreaks + 1);

    // --- speed (new format, e.g.: 58/20 km/h): only the first value is used ---
    final String speedText = firstCellAfterHeader(details, "Speed Limit").split(" ")[0];
    final int slash = speedText.indexOf('/');
    result.speed = Double.parseDouble(slash >= 0 ? speedText.substring(0, slash) : speedText);

    // --- hull armor: front/side/rear, the last value may be followed by text ---
    final String[] hull = firstCellAfterHeader(details, "Hull Armor").split("/");
    result.hullFront = Double.parseDouble(hull[0]);
    result.hullSide = Double.parseDouble(hull[1]);
    result.hullRear = Double.parseDouble(hull[2].split(" ")[0]);

    // --- gun arc: unknown values ("?") are stored as 0 ---
    final String[] gunarc = firstCellAfterHeader(details, "Gun Arc").split("/");
    double arcLeft = 0;
    double arcRight = 0;
    if (gunarc.length == 1) {
        // usually 360 here, so make it 0-360
        if (!gunarc[0].contains("?")) {
            arcRight = Double.parseDouble(gunarc[0]);
        }
    } else if (!gunarc[0].contains("?") && !gunarc[1].contains("?")) {
        arcLeft = Double.parseDouble(gunarc[0]);
        arcRight = Double.parseDouble(gunarc[1]);
    }
    result.gunArcLeft = arcLeft;
    result.gunArcRight = arcRight;

    // --- equipment that depends on the development state ---
    result.equipmentStock = crawlTankEquipment(details, BaseProperties.Development.Stock);
    result.equipmentTop = crawlTankEquipment(details, BaseProperties.Development.Top);

    // --- parent and child names (linked to tank objects later) ---
    final String relationQuery = ".//tr/th[text() = \"%s\"]/following-sibling::td[1]//a/@title";
    for (Node parentTitle : evaluateXPath(context, String.format(relationQuery, "Parent"))) {
        result.addParentName(parentTitle.getTextContent());
    }
    for (Node childTitle : evaluateXPath(context, String.format(relationQuery, "Child"))) {
        result.addChildName(childTitle.getTextContent());
    }

    return result;
}
/**
 * Reads the values of a tank that depend on its development state (stock
 * or top): hit points, weight and weight limit, gun elevation and view
 * range.
 * <p>
 * The gun elevation is only set if the cell has the form "low/high" and
 * neither value contains "?". A cell without "/" used to cause an
 * ArrayIndexOutOfBoundsException; it is now reported with a warning and
 * the elevation is left unset.
 * @param context the context from where the details are read
 * @param dev the development (stock or top)
 * @return all the details of the given development
 * @throws NumberFormatException if a value handed to Double.parseDouble is
 * not a plain number
 * @throws ParseException if a number cannot be parsed by the US number format
 */
protected Equipment crawlTankEquipment(Node context, BaseProperties.Development dev) throws NumberFormatException, ParseException {
    Equipment equip = new Equipment();
    equip.development = dev;

    // hitpoints
    equip.hitpoints = format.parse(firstCellAfterHeader(context, "Hit Points", dev)).intValue();

    // weight and limit ("weight/limit")
    String[] weightLoad = firstCellAfterHeader(context, "Weight Limit", dev).split("/");
    equip.weight = Double.parseDouble(weightLoad[0]);
    if (weightLoad.length > 1) {
        equip.weightLimit = Double.parseDouble(weightLoad[1]);
    } else {
        // if no second value is given, set equal...
        equip.weightLimit = equip.weight;
    }

    // elevation ("low/high")
    String elevationCell = firstCellAfterHeader(context, "Elevation Arc", dev);
    String[] elevation = elevationCell.split("/");
    if (elevation.length < 2) {
        // malformed cell: do not index past the end of the array
        log.warn("Elevation Arc '{}' is not of the form low/high, elevation is left unset", elevationCell);
    } else if (!elevation[0].contains("?") && !elevation[1].contains("?")) {
        equip.gunElevationLow = Double.parseDouble(elevation[0].replace("--", "-")); // fix for bug in wiki...
        equip.gunElevationHigh = Double.parseDouble(elevation[1]);
    }

    // view range: first try the span that is marked with the development, default "0"
    String view = firstXPathTextResult(context, String.format(".//tr/th[text() = \"View Range\"]/following-sibling::td/span[@class=\"%s\"]/div/text()", dev), "0");
    equip.viewRange = format.parse(view).doubleValue();
    if (equip.viewRange < 1) { // use old method for unrealistic values
        equip.viewRange = format.parse(firstCellAfterHeader(context, "View Range", dev)).doubleValue();
    }

    return equip;
}
/**
 * Resolves the parent and child names stored in each tank to tank objects
 * (via the tank mapping) and adds them to the tank. The tanks are modified
 * in place, so nothing is returned.
 * Note: a name without an entry in the tank mapping is looked up as null,
 * and that null is passed on to addParent/addChild.
 * @param tanks the tanks whose relations are to be linked
 */
protected void linkTankRelations(List<Tank> tanks) {
    for (Tank current : tanks) {
        for (String name : current.parentNames) {
            final Tank parent = tankMapping.get(name);
            current.addParent(parent);
        }
        for (String name : current.childrenNames) {
            final Tank child = tankMapping.get(name);
            current.addChild(child);
        }
    }
}
// ----------- module details -----------
/**
 * Crawls the modules of every type (engines, guns, radios, suspensions,
 * turrets - in this order) and collects them in a new Modules object.
 * The untyped lists returned by crawlModuleType are assigned to the typed
 * fields, hence the suppressed unchecked warning.
 * @return the Modules object holding all crawled modules
 */
@SuppressWarnings("unchecked")
public Modules crawlModules() {
    final Modules allModules = new Modules();

    allModules.engines = crawlModuleType(ModuleType.Engine);
    allModules.guns = crawlModuleType(ModuleType.Gun);
    allModules.radios = crawlModuleType(ModuleType.Radio);
    allModules.suspensions = crawlModuleType(ModuleType.Suspension);
    allModules.turrets = crawlModuleType(ModuleType.Turret);

    return allModules;
}
/**
 * Crawls all modules of one type. If there is exactly one overview page,
 * every module list (one per nation) on that page is crawled; otherwise
 * each overview page is expected to hold the module list of one nation.
 * Progress is printed to standard output; a page that cannot be loaded or
 * parsed is logged and ends the crawl of this module type.
 * <p>
 * The list is returned untyped so that callers can assign it to a list of
 * the matching Module subclass, which generics cannot express here.
 * @param type the type of Module to crawl
 * @return list of all crawled modules of the specified type
 */
@SuppressWarnings("rawtypes")
protected List crawlModuleType(ModuleType type) {
    System.out.println("- Crawling Modules: " + type.getOverviewPage());
    final List<Module> collected = new ArrayList<Module>(100);
    final List<URL> pages = getModuleOverviewURLs(type);

    try {
        if (pages.size() != 1) {
            // one page per nation
            System.out.print(" * Processing... ");
            for (URL page : pages) {
                final Document nationPage = Parser.parseHTML(page);
                final Node moduleList = firstXPathResult(nationPage, "//div[@class = \"ModuleList\"]");
                crawlModuleNation(type, collected, moduleList);
            }
        } else {
            // one page with a module list for each nation
            System.out.println(" * Retrieving... ");
            final Document overview = Parser.parseHTML(buildURL(type.getOverviewPage()));
            final Node content = firstXPathResult(overview, "//div[@class=\"mw-content-ltr\"]");
            System.out.print(" * Parsing... ");
            for (Node moduleList : evaluateXPath(content, ".//div[@class = \"ModuleList\"]")) {
                crawlModuleNation(type, collected, moduleList);
            }
        }
    } catch (IOException e) {
        log.error("Crawling of modules failed", e);
    } catch (ParserConfigurationException e) {
        log.error("Crawling of modules failed", e);
    } catch (SAXException e) {
        log.error("Crawling of modules failed", e);
    }

    System.out.println("done.");
    return collected;
}
/**
 * Crawls all modules below the given context node and associates them with
 * the nation that is named in the heading of that node. The modules are
 * added to the given (mutable) list, so nothing is returned.
 * <p>
 * A row that cannot be parsed is logged and skipped. This includes rows
 * that fail with a NumberFormatException (thrown by the Double/Integer
 * parsing in the single-module parsers), which previously aborted the
 * crawl of all remaining rows and modules.
 * @param type the type of module to crawl
 * @param modules mutable list of modules. results will be added to this
 * @param context from this context node the search is starting
 */
protected void crawlModuleNation(ModuleType type, List<Module> modules, Node context) {
    String nationString = "";
    try {
        nationString = firstXPathResult(context, "./h3/span/@id").getTextContent();
    } catch (NullPointerException ignored) {
        // deliberate: for some entries there is no nation defined, the
        // empty string is handed to the nation parser in that case
    }
    Nation nation = Nation.parseAdvanced(nationString);
    System.out.print(nation + ".. ");

    // only table rows without any attribute are module rows
    List<Node> rows = evaluateXPath(context, "./table/tbody/tr[not(@*)]");
    for (Node row : rows) {
        try {
            List<Node> cells = evaluateXPath(row, "./td");
            // the type is switched per row; doing it outside of the loop
            // would duplicate the loop for every module type
            switch(type) {
                case Engine: modules.add(crawlSingleEngine(cells, nation)); break;
                case Gun: modules.add(crawlSingleGun(cells, nation)); break;
                case Radio: modules.add(crawlSingleRadio(cells, nation)); break;
                case Suspension: modules.add(crawlSingleSuspension(cells, nation)); break;
                case Turret: modules.add(crawlSingleTurret(cells, nation)); break;
                default: log.warn("Unrecognized Module Type: {}", type.toString());
            }
        } catch (ParseException ex) {
            log.error("Crawling of module details failed", ex);
        } catch (IllegalAccessException ex) {
            log.error("Crawling of module details failed", ex);
        } catch (NumberFormatException ex) {
            // same handling as for tanks: one malformed number must not
            // stop the crawl of the remaining modules
            log.error("Crawling of module details failed", ex);
        }
    }
}
// single modules
/**
 * Builds an Engine from one row of an engine list. The cells are read as:
 * 1 tier, 2 name, 3 power, 4 fire chance, 5 gas, 6 cost, 7 weight,
 * 8 compatible tanks. A gas value of "--" is stored as null, a weight of
 * "--" as 0.
 * @param cells the cells of the row containing the details of this engine
 * @param nation the nation this engine belongs to
 * @return the filled engine object
 * @throws ParseException if a number cannot be parsed
 * @throws IllegalAccessException if an enum cannot be parsed
 */
protected Engine crawlSingleEngine(List<Node> cells, Nation nation) throws ParseException, IllegalAccessException {
    final Engine engine = new Engine();
    engine.nation = nation;
    engine.tier = parseTier(cells, 1);
    engine.wikiURL = parseModuleUrl(ModuleType.Engine, cells);
    engine.name = parseStringBold(cells, 2);
    engine.power = parseInt(cells, 3);
    engine.firechance = parseInt(cells, 4);

    final String gasText = parseString(cells, 5);
    if (gasText.equals("--")) {
        engine.gas = null;
    } else {
        engine.gas = Gas.parse(gasText);
    }

    parseCost(cells, 6, engine);

    final String weightText = parseString(cells, 7);
    double weightValue = 0;
    if (!weightText.equals("--")) {
        weightValue = format.parse(weightText).doubleValue();
    }
    engine.weight = weightValue;

    engine.compatibility = parseCompatibility(cells, 8);
    return engine;
}
/**
 * Builds a Gun from one row of a gun list. The cells are read as:
 * 1 tier, 2 name, 3 ammo capacity, 4 damage, 5 penetration, 6 fire rate,
 * 7 accuracy, 8 aim time, 9 cost, 10 weight, 11 compatible tanks.
 * Values given as "a-b" are stored as min/max, single values as min = max.
 * An ammo capacity of "?" is stored as -1, a missing damage or penetration
 * value as 0 and a weight of "--" as 0.
 * @param cells the cells of the row containing the details of this gun
 * @param nation the nation this gun belongs to
 * @return the filled gun object
 * @throws ParseException if a number cannot be parsed
 */
protected Gun crawlSingleGun(List<Node> cells, Nation nation) throws ParseException {
    final Gun gun = new Gun();
    gun.nation = nation;
    gun.tier = parseTier(cells, 1);
    gun.wikiURL = parseModuleUrl(ModuleType.Gun, cells);
    gun.name = parseStringBold(cells, 2);

    // ammo capacity
    final Tuple<String,String> capacity = resolveStringTuple(parseStringBold(cells, 3), "-");
    int capacityMin = -1;
    if (!capacity.fst().equals("?")) {
        capacityMin = Integer.parseInt(capacity.fst());
    }
    gun.ammoCapacityMin = capacityMin;
    int capacityMax = -1;
    if (!capacity.snd().equals("?")) {
        capacityMax = Integer.parseInt(capacity.snd());
    }
    gun.ammoCapacityMax = capacityMax;

    // damage per ammo type (fourth cell, one span per ammo type)
    final Node damageCell = cells.get(3);
    gun.dmgAP = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoAP\"]", "0")).intValue();
    gun.dmgAPCR = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoAPCR\"]", "0")).intValue();
    gun.dmgHE = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoHE\"]", "0")).intValue();
    gun.dmgHEAT = format.parse(firstXPathTextResult(damageCell, "./span[@class = \"ammoHEAT\"]", "0")).intValue();

    // penetration per ammo type (fifth cell, same layout)
    final Node penetrationCell = cells.get(4);
    gun.penAP = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoAP\"]", "0")).intValue();
    gun.penAPCR = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoAPCR\"]", "0")).intValue();
    gun.penHE = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoHE\"]", "0")).intValue();
    gun.penHEAT = format.parse(firstXPathTextResult(penetrationCell, "./span[@class = \"ammoHEAT\"]", "0")).intValue();

    // fire rate
    final Tuple<String,String> rate = resolveStringTuple(parseString(cells, 6), "-");
    gun.fireRateMin = Double.parseDouble(rate.fst());
    gun.fireRateMax = Double.parseDouble(rate.snd());

    // accuracy
    final Tuple<String,String> spread = resolveStringTuple(parseString(cells, 7), "-");
    gun.accuracyMin = Double.parseDouble(spread.fst());
    gun.accuracyMax = Double.parseDouble(spread.snd());

    // aim time
    final Tuple<String,String> aiming = resolveStringTuple(parseString(cells, 8), "-");
    gun.aimTimeMin = Double.parseDouble(aiming.fst());
    gun.aimTimeMax = Double.parseDouble(aiming.snd());

    // cost
    parseCost(cells, 9, gun);

    // weight
    final String weightText = parseString(cells, 10);
    double weightValue = 0;
    if (!weightText.equals("--")) {
        weightValue = format.parse(weightText).doubleValue();
    }
    gun.weight = weightValue;

    // compatibility
    gun.compatibility = parseCompatibility(cells, 11);
    return gun;
}
/**
 * Builds a Radio from one row of a radio list. The cells are read as:
 * 1 tier, 2 name, 3 range, 4 cost, 5 weight, 6 compatible tanks.
 * A weight of "--" is stored as 0.
 * @param cells the cells of the row containing the details of this radio
 * @param nation the nation this radio belongs to
 * @return the filled radio object
 * @throws ParseException if a number cannot be parsed
 */
protected Radio crawlSingleRadio(List<Node> cells, Nation nation) throws ParseException {
    final Radio radio = new Radio();
    radio.nation = nation;
    radio.tier = parseTier(cells, 1);
    radio.wikiURL = parseModuleUrl(ModuleType.Radio, cells);
    radio.name = parseStringBold(cells, 2);
    radio.range = parseInt(cells, 3);
    parseCost(cells, 4, radio);

    final String weightText = parseString(cells, 5);
    double weightValue = 0;
    if (!weightText.equals("--")) {
        weightValue = format.parse(weightText).doubleValue();
    }
    radio.weight = weightValue;

    radio.compatibility = parseCompatibility(cells, 6);
    return radio;
}
/**
 * Builds a Suspension from one row of a suspension list. The cells are
 * read as: 1 tier, 2 name, 3 load, 4 traverse, 5 cost, 6 weight,
 * 7 compatible tanks. A weight of "--" is stored as 0.
 * @param cells the cells of the row containing the details of this suspension
 * @param nation the nation this suspension belongs to
 * @return the filled Suspension object
 * @throws ParseException if a number cannot be parsed
 */
protected Suspension crawlSingleSuspension(List<Node> cells, Nation nation) throws ParseException {
    final Suspension suspension = new Suspension();
    suspension.nation = nation;
    suspension.tier = parseTier(cells, 1);
    suspension.wikiURL = parseModuleUrl(ModuleType.Suspension, cells);
    suspension.name = parseStringBold(cells, 2);
    suspension.load = parseDouble(cells, 3);
    suspension.traverse = parseInt(cells, 4);
    parseCost(cells, 5, suspension);

    final String weightText = parseString(cells, 6);
    double weightValue = 0;
    if (!weightText.equals("--")) {
        weightValue = format.parse(weightText).doubleValue();
    }
    suspension.weight = weightValue;

    suspension.compatibility = parseCompatibility(cells, 7);
    return suspension;
}
/**
 * Builds a Turret from one row of a turret list. The cells are read as:
 * 1 tier, 2 name, 3 armor (front/side/rear), 4 traverse, 5 view range,
 * 6 cost, 7 weight, 8 compatible tanks. If the armor cell does not hold
 * exactly three values, a warning is logged and the armor is left unset.
 * A weight of "--" is stored as 0.
 * @param cells the cells of the row containing the details of this turret
 * @param nation the nation this turret belongs to
 * @return the filled Turret object
 * @throws ParseException if a number cannot be parsed
 */
protected Turret crawlSingleTurret(List<Node> cells, Nation nation) throws ParseException {
    final Turret turret = new Turret();
    turret.nation = nation;
    turret.tier = parseTier(cells, 1);
    turret.wikiURL = parseModuleUrl(ModuleType.Turret, cells);
    turret.name = parseStringBold(cells, 2);

    // armor
    final String[] plates = parseString(cells, 3).split("/");
    if (plates.length != 3) {
        log.warn("Error parsing armor: not enough values");
    } else {
        turret.armorFront = Double.parseDouble(plates[0]);
        turret.armorSide = Double.parseDouble(plates[1]);
        turret.armorRear = Double.parseDouble(plates[2]);
    }

    turret.traverse = parseDouble(cells, 4);
    turret.viewRange = parseDouble(cells, 5);
    parseCost(cells, 6, turret);

    final String weightText = parseString(cells, 7);
    double weightValue = 0;
    if (!weightText.equals("--")) {
        weightValue = format.parse(weightText).doubleValue();
    }
    turret.weight = weightValue;

    turret.compatibility = parseCompatibility(cells, 8);
    return turret;
}
// ----------- generic parsers -----------
/**
 * Parses the tier of a module. The tier is a roman number that is read
 * from the bold text inside a span of the selected cell and converted to
 * its decimal value.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the converted roman number (as byte)
 */
protected byte parseTier(List<Node> cells, int cell) {
    final Node tierCell = getRelevantContext(cells, cell);
    final String roman = allXPathTextResult(tierCell, "./span/b/text()");
    return (byte) Conversion.romanToDecimal(roman);
}
/**
 * Reads the direct text content of the selected cell.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the string value of the selected cell
 */
protected String parseString(List<Node> cells, int cell) {
    final Node target = getRelevantContext(cells, cell);
    return allXPathTextResult(target, "./text()");
}
/**
 * Reads the text that is enclosed in &lt;b&gt;-tags in the selected cell.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the string value (without &lt;b&gt;-tags) of the selected cell
 */
protected String parseStringBold(List<Node> cells, int cell) {
    final Node target = getRelevantContext(cells, cell);
    return allXPathTextResult(target, "./b/text()");
}
/**
 * Reads the direct text content of the selected cell and parses it as an
 * integer, using the US number format.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the integer value of the selected cell
 * @throws ParseException if the integer cannot be parsed
 */
protected int parseInt(List<Node> cells, int cell) throws ParseException {
    final Node target = getRelevantContext(cells, cell);
    final String text = allXPathTextResult(target, "./text()");
    return format.parse(text).intValue();
}
/**
 * Reads the direct text content of the selected cell and parses it as a
 * double, using the US number format.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the double value of the selected cell
 * @throws ParseException if the double cannot be parsed
 */
protected double parseDouble(List<Node> cells, int cell) throws ParseException {
    final Node target = getRelevantContext(cells, cell);
    final String text = allXPathTextResult(target, "./text()");
    return format.parse(text).doubleValue();
}
/**
 * Parses the cost and currency of a module and writes them directly into
 * the given Module object (to avoid tuple return values...).
 * The currency is taken from the alt text of the icon in the cell:
 * for credits the cost is the cell text ("--" counts as 0), for premium
 * modules the cost is set to 0. Any other currency is reported with a
 * warning and the module is left unchanged.
 * @param cells the cells of the row containing the required values
 * @param cell the position (starting by 1) of the required values in the list of cells
 * @param mod the module that receives cost and currency
 * @throws ParseException if a number cannot be parsed
 */
protected void parseCost(List<Node> cells, int cell, Module mod) throws ParseException {
    final Node costCell = getRelevantContext(cells, cell);
    final String currencyName = firstXPathTextResult(costCell, "./span/img/@alt").toLowerCase();

    if (currencyName.contains("credit")) {
        final String amount = allXPathTextResult(costCell, "./text()");
        int credits = 0;
        if (!amount.equals("--")) {
            credits = format.parse(amount).intValue();
        }
        mod.cost = credits;
        mod.currency = Currency.Credits;
        return;
    }

    if (currencyName.contains("premium")) {
        mod.cost = 0;
        mod.currency = Currency.Premium;
        return;
    }

    log.warn("error parsing cost, currency {} not recognized.", currencyName);
}
/**
 * Generates the wikilink of a module: the module type, a '#' and the id of
 * the first div found in the second cell of the row (empty if there is
 * no such id).
 * @param type the type of module that is requested
 * @param cells the cells of the row containing the details of this module
 * @return the wikilink to the module
 */
protected String parseModuleUrl(ModuleType type, List<Node> cells) {
    final String page = type.toString();
    final String anchor = firstXPathTextResult(cells.get(1), ".//div/@id", "");
    return page + "#" + anchor;
}
/**
 * Parses the tanks that are compatible to a module: one tank reference is
 * created for the title of each link in the selected cell.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the list of tank references
 */
protected List<TankRef> parseCompatibility(List<Node> cells, int cell) {
    final Node target = getRelevantContext(cells, cell);
    final List<Node> titles = evaluateXPath(target, "./a/@title");
    final List<TankRef> references = new ArrayList<TankRef>(titles.size());
    for (Node title : titles) {
        final String tankUrl = title.getTextContent();
        references.add(getTankRefByUrl(tankUrl));
    }
    return references;
}
// parser helper
/**
 * Selects the node to read a cell's value from: the cell's &lt;center&gt;
 * child is queried first, with the cell itself (".") given as the second
 * expression.
 * NOTE(review): the second expression is presumably a fallback used when
 * the cell has no &lt;center&gt; child - confirm against firstXPathResult.
 * @param cells the cells of the row containing the required value
 * @param cell the position (starting by 1) of the required value in the list of cells
 * @return the relevant context node at the specified position
 */
private Node getRelevantContext(List<Node> cells, int cell) {
    final Node selected = cells.get(cell-1);
    return firstXPathResult(selected, "./center", ".");
}
/**
 * Separates an input string using the specified delimiter and returns
 * a tuple of the first two results of this split. If there is no second
 * value, the first value will be copied to the second position instead.
 * <p>
 * The delimiter is always taken literally (it is quoted before it is
 * handed to String.split, which expects a regular expression). If the
 * input does not contain the delimiter, or consists of nothing but
 * delimiters (e.g. "--" split by "-", which yields no values at all),
 * the unchanged input is returned in both positions.
 * @param input the input string
 * @param delimiter the delimiter to split the string
 * @return a tuple containing the first two values of the split
 */
protected Tuple<String, String> resolveStringTuple(String input, String delimiter) {
    if (!input.contains(delimiter)) {
        return Tuple.of(input, input);
    }
    String[] parts = input.split(java.util.regex.Pattern.quote(delimiter));
    if (parts.length == 0) {
        // split drops trailing empty strings: nothing but delimiters in the input
        return Tuple.of(input, input);
    }
    return Tuple.of(parts[0], parts.length > 1 ? parts[1] : parts[0]);
}
/**
 * Creates a tank reference for the given wiki page. The page name is looked
 * up in {@code tankMapping} and the result of that lookup is handed to a
 * new {@link TankRef}.
 * NOTE(review): if nothing is registered for wikiURL, the lookup presumably
 * yields null and the reference is built from null - confirm that callers
 * only pass known pages.
 * @param wikiURL the wiki url of the tank to refer to
 * @return the corresponding tank reference
 */
protected TankRef getTankRefByUrl(String wikiURL) {
return new TankRef(tankMapping.get(wikiURL));
}
// -------------------- XPath --------------------
/**
 * Shared XPath factory. {@link #evaluateXPath(Object, String)} asks it for
 * a new XPath object on every query, so the factory itself only needs to be
 * created once.
 * NOTE(review): the JAXP documentation describes XPathFactory as not
 * thread-safe - confirm that XPath queries are never evaluated concurrently.
 */
protected final static XPathFactory xpf = XPathFactory.newInstance();
/**
 * Evaluates an XPath query in the given context and returns all matching
 * nodes as a list.
 * <p>
 * If the expression cannot be compiled or evaluated, the error is logged
 * and an empty list is returned, so callers never have to deal with null.
 * @param context the context where to start the query from (usually a Node).
 * Use . to address the context node; absolute expressions (starting with /)
 * are resolved from the document root instead.
 * @param expression the XPath expression as String
 * @return list of results (as Node-objects), possibly empty, never null
 */
protected List<Node> evaluateXPath(Object context, String expression) {
    try {
        XPathExpression compiled = xpf.newXPath().compile(expression);
        NodeList matches = (NodeList) compiled.evaluate(context, XPathConstants.NODESET);

        // copy the NodeList into a regular list
        List<Node> nodes = new ArrayList<Node>(matches.getLength());
        for (int i = 0; i < matches.getLength(); i++) {
            nodes.add(matches.item(i));
        }
        return nodes;
    } catch (XPathExpressionException ex) {
        log.error("Evaluating of XPath expression failed", ex);
        // a failed query has no results; returning null here used to cause
        // NullPointerExceptions in callers that iterate or call isEmpty()
        return new ArrayList<Node>(0);
    }
}
/**
 * Evaluates the given XPath expression and returns the first resulting node.
 * Warning: returns null if no results were found or if the evaluation
 * itself did not produce a result list.
 * @param context the context where to start the query from
 * @param expression the XPath expression as String
 * @return the first resulting Node, or null if there is none
 */
protected Node firstXPathResult(Object context, String expression) {
    List<Node> results = evaluateXPath(context, expression);
    // explicit check instead of catching IndexOutOfBoundsException; this
    // also covers a failed evaluation that yields no list at all
    if (results == null || results.isEmpty()) {
        return null;
    }
    return results.get(0);
}
/**
 * Evaluates the XPath expressions in their given order and returns the
 * first node of the first expression that yields any result. Expressions
 * without results (or whose evaluation failed) are skipped.
 * Warning: returns null if none of the expressions produced a result
 * @param context the context where to start the query from
 * @param expressions the XPath expressions as Strings, in order of preference
 * @return the first resulting Node, or null if there is none
 */
protected Node firstXPathResult(Object context, String... expressions) {
    for (String candidate : expressions) {
        List<Node> results = evaluateXPath(context, candidate);
        // a failed evaluation may yield no list at all; treat it like
        // "no match" and continue with the next fallback expression
        if (results != null && !results.isEmpty()) {
            return results.get(0);
        }
    }
    return null;
}
/**
 * Returns the trimmed text of the first node matched by the expression,
 * including the text contents of all of its subnodes.
 * @param context the context where to start the query from
 * @param expression the XPath expression as String
 * @return all text contents of the resulting node + subnodes. Trimmed.
 * @throws NullPointerException if the expression does not match any node
 */
protected String firstXPathTextResult(Object context, String expression) {
    Node match = firstXPathResult(context, expression);
    // match is null when nothing was found; the resulting exception is
    // what the overload with a fallback value relies on
    String text = match.getTextContent();
    return text.trim();
}
/**
 * Returns the trimmed text of the first node matched by the expression,
 * including the text contents of all of its subnodes, or a fallback value
 * if the lookup fails with a NullPointerException (which is what happens
 * when no node is found).
 * @param context the context where to start the query from
 * @param expression the XPath expression as String
 * @param returnOnError the value to return if no result is found
 * @return all text contents of the resulting node + subnodes. Trimmed.
 * (or the backup value, in case no result is found)
 */
protected String firstXPathTextResult(Object context, String expression, String returnOnError) {
    String text;
    try {
        text = firstXPathTextResult(context, expression);
    } catch (NullPointerException noResult) {
        // deliberate best-effort: a missing node surfaces as a
        // NullPointerException in the two-argument variant
        text = returnOnError;
    }
    return text;
}
/**
 * Returns the combined textual representation of all resulting Nodes.
 * Useful to join several text() nodes that are separated by other,
 * non-text nodes, as in {@code <p>Hello, <br/>Mike!</p>}.
 * @param context the context where to start the query from
 * @param expression the XPath expression as String
 * @return the text contents of all resulting nodes, concatenated in result
 * order without any separator. Result is trimmed.
 */
protected String allXPathTextResult(Object context, String expression) {
    List<Node> hits = evaluateXPath(context, expression);
    StringBuilder text = new StringBuilder();
    for (int i = 0; i < hits.size(); i++) {
        text.append(hits.get(i).getTextContent());
    }
    return text.toString().trim();
}
/**
 * Special construct: searches a {@code <th>} whose text equals head, picks
 * its first following sibling {@code <td>} and returns the combined text
 * nodes of that cell.
 * The header text is inserted into the query as is, so it must not contain
 * double quotes.
 * @param context the context where to start the query from
 * @param head the name of the table header to search
 * @return the information associated with this header
 */
protected String firstCellAfterHeader(Object context, String head) {
    String query = String.format(".//tr/th[text() = \"%s\"]/following-sibling::td[1]/text()", head);
    return allXPathTextResult(context, query);
}
/**
 * Special construct: searches a {@code <th>} whose text equals head and
 * returns the combined text of the {@code <span>} elements with
 * class == dev inside its following sibling {@code <td>} cells.
 * The header text is inserted into the query as is, so it must not contain
 * double quotes.
 * @param context the context where to start the query from
 * @param head the name of the table header to search
 * @param dev the development to retrieve (stock or top)
 * @return the information associated with this header as stock or top
 */
protected String firstCellAfterHeader(Object context, String head, BaseProperties.Development dev) {
    return allXPathTextResult(
            context,
            String.format(".//tr/th[text() = \"%s\"]/following-sibling::td/span[@class=\"%s\"]/text()", head, dev));
}
// -------------------- Helpers --------------------
/**
 * Resolves a wiki page name to the location it should be read from,
 * depending on the source configured for this class: a file-URL of the
 * local copy for {@code FILE}, or the http-URL in the wot wiki for
 * {@code URL}. For any other source a warning is logged and null is
 * returned.
 * @param siteName the name of the site in the wot wiki
 * @return the url of this site (file or http), or null for an unknown source
 * @throws MalformedURLException if there is something heavily wrong with the
 * site name (illegal characters and stuff...)
 */
protected URL buildURL(String siteName) throws MalformedURLException {
    URL target = null;
    switch (src) {
        case URL:
            target = buildWikiLink(siteName);
            break;
        case FILE:
            target = findLocalFile(siteName).toURI().toURL();
            break;
        default:
            log.warn("Unknown enum value: " + src.toString());
            break;
    }
    return target;
}
/**
 * Looks up the local copy of a wiki page. The file name derived from the
 * site name is checked against each of the local parent folders in turn,
 * and the first existing file wins.
 * @param siteName the name of the site in the wot wiki
 * @return the local copy of the wiki-page
 * @throws RuntimeException if none of the folders contains the file
 */
protected File findLocalFile(String siteName) {
    final String fileName = siteToFileName(siteName);
    for (File dir : localParents) {
        File candidate = new File(dir, fileName);
        if (!candidate.exists()) {
            continue;
        }
        return candidate;
    }
    throw new RuntimeException("File for " + siteName + " was not found");
}
/**
 * Decides if there is a local copy of a wiki page for the given page name,
 * i.e. if the file name derived from the site name exists in at least one
 * of the local parent folders.
 * @param siteName the name of the site in the wot wiki
 * @return true, if the local copy exists
 */
protected boolean existsLocalFile(String siteName) {
    final String fileName = siteToFileName(siteName);
    boolean found = false;
    for (File dir : localParents) {
        if (new File(dir, fileName).exists()) {
            found = true;
            break;
        }
    }
    return found;
}
/**
 * Builds the HTTP URL of a wiki page by appending the page name to the
 * crawler's base URL, separated by a single slash. The page name is
 * inserted as is, without any encoding.
 * @param siteName the name of the site in the wot wiki
 * @return the url of this site (http)
 * @throws MalformedURLException if there is something heavily wrong with the
 * site name (illegal characters and stuff...)
 */
public static URL buildWikiLink(String siteName) throws MalformedURLException {
    String address = String.format("%s/%s", Crawler.baseURL, siteName);
    return new URL(address);
}
/**
 * Generates a tank ID meant to conform to the rules of NCName, as defined
 * by the W3C: http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
 * (the NCName restriction is required for ID and IDREF values).
 * The name is first passed through {@link #cleanFileName(String)}, then
 * spaces, parentheses and dots are dropped and an underscore is put in
 * front. All other characters are kept as they are.
 * @param name the name of the tank
 * @return the ID generated from this name
 */
protected static String generateTankID(String name) {
    String cleaned = cleanFileName(name);
    StringBuilder id = new StringBuilder("_");
    for (int i = 0; i < cleaned.length(); i++) {
        char c = cleaned.charAt(i);
        switch (c) {
            case ' ':
            case '(':
            case ')':
            case '.':
                // dropped from the id
                break;
            default:
                id.append(c);
                break;
        }
    }
    return id.toString();
}
/**
 * Turns a site name into a file name: characters that are illegal in a
 * filesystem (like / or \) are replaced via {@link #cleanFileName(String)}
 * and the suffix ".html" is appended.
 * @param siteName the name of the site
 * @return the filename generated from the site name
 */
public static String siteToFileName(String siteName) {
    String base = cleanFileName(siteName);
    String extension = ".html";
    return base + extension;
}
/**
 * Code points that are illegal in filenames (unified list for unix and
 * windows): all control characters 0-31 plus a few punctuation characters.
 */
static final int[] illegalChars = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    '"', '*', '/', ':', '<', '>', '?', '\\', '|'
};
static {
    // binarySearch only works on sorted input, so make sure of the order
    Arrays.sort(illegalChars);
}
/**
 * Replaces every illegal character of a filename with '_', keeping all
 * other characters at their original position.
 * @param badFileName the old (possibly faulty) filename
 * @return a possibly modified version of the file name that works with
 * every file system
 */
public static String cleanFileName(String badFileName) {
    char[] chars = badFileName.toCharArray();
    for (int i = 0; i < chars.length; i++) {
        boolean illegal = Arrays.binarySearch(illegalChars, (int) chars[i]) >= 0;
        if (illegal) {
            chars[i] = '_';
        }
    }
    return new String(chars);
}
/**
 * Decides whether a crawler receives its pages from the local filesystem
 * or directly from the web.
 */
enum Source {
    /** pages are read from local copies of the wiki pages */
    FILE,
    /** pages are requested from the wiki via their http-URL */
    URL
}
}