/** * File ./src/main/java/de/lemo/dms/connectors/chemgapedia/fizHelper/XMLPackageParser.java * Lemo-Data-Management-Server for learning analytics. * Copyright (C) 2013 * Leonard Kappe, Andreas Pursian, Sebastian Schwarzrock, Boris Wenzlaff * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. **/ /** * File ./main/java/de/lemo/dms/connectors/chemgapedia/fizHelper/XMLPackageParser.java * Date 2013-01-24 * Project Lemo Learning Analytics */ package de.lemo.dms.connectors.chemgapedia.fizHelper; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.log4j.Logger; import org.hibernate.Session; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import de.lemo.dms.connectors.IConnector; import de.lemo.dms.core.Clock; import de.lemo.dms.core.config.ServerConfiguration; import de.lemo.dms.db.EQueryType; import de.lemo.dms.db.IDBHandler; import de.lemo.dms.db.mapping.CourseMining; import de.lemo.dms.db.mapping.CourseResourceMining; import de.lemo.dms.db.mapping.IDMappingMining; import de.lemo.dms.db.mapping.LevelAssociationMining; import de.lemo.dms.db.mapping.LevelCourseMining; import de.lemo.dms.db.mapping.LevelMining; import de.lemo.dms.db.mapping.ResourceMining; /** * The Class XMLPackageParser. Created for the processing of Chemgapedia's VLU-files */ public class XMLPackageParser { /** The level objects. */ private final Map<String, LevelMining> levelObj = new HashMap<String, LevelMining>(); /** The level association objects . */ private final Map<Long, LevelAssociationMining> levelAssociations = new HashMap<Long, LevelAssociationMining>(); /** The level course objects. */ private final Map<Long, LevelCourseMining> levelCourses = new HashMap<Long, LevelCourseMining>(); /** The largest level id of previous runs. */ private Long levId = 0L; /** The largest level association id of previous runs. */ private Long levAscId = 0L; /** The largest level course id of previous runs. */ private Long levCouId = 0L; /** The list of course objects. */ private final Map<String, CourseMining> courseObj = new HashMap<String, CourseMining>(); /** The course resources objects. */ private final Map<Long, CourseResourceMining> courseResources = new HashMap<Long, CourseResourceMining>(); /** The list of resource objects. */ private final Map<String, ResourceMining> resourceObj = new HashMap<String, ResourceMining>(); /** The list of file names */ private final List<String> fileNames = new ArrayList<String>(); /** The map containing the IDMapping-objects. */ private final Map<String, IDMappingMining> idmapping; /** The largest course id of previous runs. */ private Long couId = 0L; /** The resource level id of previous runs. */ private Long resId = 0L; /** The largest course resource id of previous runs. */ private Long couResId = 0L; private final IConnector connector; private final IDBHandler dbHandler; private final Logger logger = Logger.getLogger(this.getClass()); /** * Constructor. Creates an object of XMLPackageParser. * * @param platform * Name of the Chemgapedia-platform */ @SuppressWarnings("unchecked") public XMLPackageParser(final IConnector connector, final List<Long> courses) { this.connector = connector; final long platformId = connector.getPlatformId(); this.dbHandler = ServerConfiguration.getInstance().getMiningDbHandler(); final Session session = this.dbHandler.getMiningSession(); final List<IDMappingMining> ids = (List<IDMappingMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "from IDMappingMining x where x.platform=" + platformId + " order by x.id asc"); this.idmapping = new HashMap<String, IDMappingMining>(); for (int i = 0; i < ids.size(); i++) { this.idmapping.put(ids.get(i).getHash(), ids.get(i)); } logger.info("Loaded " + this.idmapping.size() + " IDMappingMining objects from the mining database."); final List<LevelMining> levs = (List<LevelMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "FROM LevelMining x where x.platform=" + platformId + " order by x.id asc"); if (levs.size() > 0) { this.levId = levs.get(levs.size() - 1).getId(); } for (int i = 0; i < levs.size(); i++) { this.levelObj.put(levs.get(i).getTitle(), levs.get(i)); } logger.info("Loaded " + this.levelObj.size() + " LevelMining objects from the mining database."); final List<CourseMining> cous = (List<CourseMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "FROM CourseMining x where x.platform=" + platformId + " order by x.id asc"); if (cous.size() > 0) { this.couId = cous.get(cous.size() - 1).getId(); } for (int i = 0; i < cous.size(); i++) { this.courseObj.put(cous.get(i).getTitle(), cous.get(i)); } logger.info("Loaded " + this.courseObj.size() + " CourseMining objects from the mining database."); final List<ResourceMining> ress = (List<ResourceMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "FROM ResourceMining x where x.platform=" + platformId + " order by x.id asc"); if (ress.size() > 0) { this.resId = ress.get(ress.size() - 1).getId(); } for (int i = 0; i < ress.size(); i++) { this.resourceObj.put(ress.get(i).getUrl(), ress.get(i)); } logger.info("Loaded " + this.resourceObj.size() + " ResourceMining objects from the mining database."); final List<LevelAssociationMining> levAsc = (List<LevelAssociationMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "FROM LevelAssociationMining x where x.platform=" + platformId + " order by x.id asc"); if (levAsc.size() > 0) { this.levAscId = levAsc.get(levAsc.size() - 1).getId(); } for (int i = 0; i < levAsc.size(); i++) { this.levelAssociations.put(levAsc.get(i).getLower().getId(), levAsc.get(i)); } logger.info("Loaded " + this.levelAssociations.size() + " LevelAssociationMining objects from the mining database."); final List<LevelCourseMining> levCou = (List<LevelCourseMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "FROM LevelCourseMining x where x.platform=" + platformId + " order by x.id asc"); if (levCou.size() > 0) { this.levCouId = levCou.get(levCou.size() - 1).getId(); } for (int i = 0; i < levCou.size(); i++) { this.levelCourses.put(levCou.get(i).getCourse().getId(), levCou.get(i)); } logger.info("Loaded " + this.levelCourses.size() + " LevelCourseMining objects from the mining database."); final List<CourseResourceMining> couRes = (List<CourseResourceMining>) this.dbHandler.performQuery(session, EQueryType.HQL, "FROM CourseResourceMining x where x.platform=" + platformId + " order by x.id asc"); if (couRes.size() > 0) { this.couResId = couRes.get(couRes.size() - 1).getId(); } for (int i = 0; i < couRes.size(); i++) { this.courseResources.put(couRes.get(i).getResource().getId(), couRes.get(i)); } logger.info("Loaded " + this.courseResources.size() + " CourseResourceMining objects from the mining database."); this.dbHandler.closeSession(session); } /** * Opens a vlu - file, creates objects of the type "Department","Degree","Course","Resource", * "DepartmentDegree", "DegreeCourse" and "CourseResource" containing the given information and saves the objects to * the global lists. * * @param filename * Absolute path of the file containing the data. */ private void readVLU(final String filename) { final ResourceMining resource = new ResourceMining(); try { // ---- Parse XML file ---- final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); final DocumentBuilder builder = factory.newDocumentBuilder(); final Document document = builder.parse(new File(filename)); // ---- Get list of nodes to given element tag name ---- NamedNodeMap t = null; final NodeList content = document.getElementsByTagName("content"); // Set resource type resource.setType("VLU"); // set resource title if (document.getElementsByTagName("title").getLength() > 0) { resource.setTitle(document.getElementsByTagName("title").item(0).getTextContent()); } // Set resource difficulty if (document.getElementsByTagName("audience").getLength() > 0) { resource.setDifficulty(((Element) document.getElementsByTagName("audience").item(0)) .getAttribute("level")); } try { if (document.getElementsByTagName("time").getLength() > 0) { resource.setProcessingTime(Long.valueOf(((Element) document.getElementsByTagName("time").item(0)) .getAttribute("value"))); } } catch (final NumberFormatException e) { resource.setProcessingTime(0L); } String level1 = ""; String level2 = ""; String level3 = ""; if (document.getElementsByTagName("subject").getLength() > 0) { level1 = ((Element) document.getElementsByTagName("subject").item(0)).getAttribute("name"); } if (document.getElementsByTagName("subject").getLength() > 0) { level2 = ((Element) document.getElementsByTagName("subject").item(0)).getAttribute("area"); } if (document.getElementsByTagName("subject").getLength() > 0) { level3 = ((Element) document.getElementsByTagName("subject").item(0)).getAttribute("specialism"); } LevelMining lev1 = new LevelMining(); LevelMining lev2 = new LevelMining(); LevelMining lev3 = new LevelMining(); CourseMining course = new CourseMining(); final Long platformId = this.connector.getPlatformId(); final Long platformPrefix = this.connector.getPrefix(); if (this.courseObj.get(resource.getTitle()) == null) { course.setTitle(resource.getTitle()); course.setId(Long.valueOf(platformPrefix + "" + (this.couId + 1))); course.setPlatform(platformId); this.couId++; this.courseObj.put(course.getTitle(), course); } else { course = this.courseObj.get(resource.getTitle()); } if (this.levelObj.get(level1) == null ) { lev1.setTitle(level1); lev1.setId(Long.valueOf(platformPrefix + "" + (this.levId + 1))); lev1.setPlatform(platformId); lev1.setDepth(1); this.levId++; this.levelObj.put(lev1.getTitle(), lev1); } else { lev1 = this.levelObj.get(level1); } if (this.levelObj.get(level2) == null) { lev2.setTitle(level2); lev2.setId(Long.valueOf(platformPrefix + "" + (this.levId + 1))); lev2.setPlatform(platformId); lev2.setDepth(2); this.levId++; this.levelObj.put(lev2.getTitle(), lev2); } else { lev2 = this.levelObj.get(level2); } if (this.levelObj.get(level3) == null) { lev3.setTitle(level3); lev3.setId(Long.valueOf(platformPrefix + "" + (this.levId + 1))); lev3.setPlatform(platformId); lev3.setDepth(3); this.levId++; this.levelObj.put(lev3.getTitle(), lev3); } else { lev3 = this.levelObj.get(level3); } // Set URL final NodeList root = document.getElementsByTagName("vlunode"); for (int i = 0; i < root.getLength(); i++) { t = root.item(i).getAttributes(); for (int j = 0; j < t.getLength(); j++) { if (t.item(j).getNodeName().equals("xlink:href")) { resource.setUrl("http://www.chemgapedia.de/vsengine/vlu" + t.item(j).getNodeValue() + ".html"); break; } } } if (!resource.getUrl().contains("/0/")) { resource.setPosition(0); this.resourceObj.get(resource.getUrl()); long rid = -1; if (this.idmapping.get(resource.getUrl()) != null) { rid = this.idmapping.get(resource.getUrl()).getId(); resource.setId(rid); } if (rid == -1) { rid = this.resId + 1; this.resId = rid; rid = Long.valueOf(this.connector.getPrefix() + "" + rid); this.idmapping.put(resource.getUrl(), new IDMappingMining(rid, resource.getUrl(), platformId)); resource.setId(rid); } resource.setPlatform(this.connector.getPlatformId()); this.resourceObj.put(resource.getUrl(), resource); this.fileNames.add(filename); // Save department - degree relation locally if (this.levelAssociations.get(lev2.getId()) == null) { final LevelAssociationMining ddm = new LevelAssociationMining(); ddm.setLower(lev2); ddm.setUpper(lev1); ddm.setId(Long.valueOf(this.connector.getPrefix() + "" + (this.levAscId + 1))); ddm.setPlatform(this.connector.getPlatformId()); this.levAscId++; this.levelAssociations.put(lev2.getId(), ddm); } if (this.levelAssociations.get(lev3.getId()) == null) { final LevelAssociationMining ddm = new LevelAssociationMining(); ddm.setLower(lev3); ddm.setUpper(lev2); ddm.setId(Long.valueOf(platformPrefix + "" + (this.levAscId + 1))); ddm.setPlatform(platformId); this.levAscId++; this.levelAssociations.put(lev3.getId(), ddm); } // Find out whether there already is a association between the course and a hierarchy layer. If not // create one. if (this.levelCourses.get(course.getId()) == null) { final LevelCourseMining dcm = new LevelCourseMining(); dcm.setLevel(lev3); dcm.setCourse(course); dcm.setId(Long.valueOf(platformPrefix + "" + (this.levCouId + 1))); dcm.setPlatform(platformId); this.levCouId++; this.levelCourses.put(course.getId(), dcm); } // Find out whether there already is a association between this resource and a course. If not create // one. if (this.courseResources.get(resource.getId()) == null) { final CourseResourceMining crm = new CourseResourceMining(); crm.setResource(resource); crm.setCourse(course); crm.setId(Long.valueOf(this.connector.getPrefix() + "" + (this.couResId + 1))); this.couResId++; crm.setPlatform(this.connector.getPlatformId()); this.courseResources.put(resource.getId(), crm); } int pos = 1; // Create Resource-objects for the all the pages included in the vlu final ArrayList<ResourceMining> tempRes = new ArrayList<ResourceMining>(); if (content.getLength() > 0) { for (int i = 0; i < content.item(0).getChildNodes().getLength(); i++) { if (content.item(0).getChildNodes().item(i).hasAttributes()) { final ResourceMining r1 = new ResourceMining(); t = content.item(0).getChildNodes().item(i).getAttributes(); for (int j = 0; j < t.getLength(); j++) { final Node node = t.item(j); if (node.getTextContent() != null) { if (node.getNodeName().equals("xlink:href")) { r1.setDifficulty(resource.getDifficulty()); r1.setType("Page"); r1.setUrl(resource.getUrl().substring(0, resource.getUrl().length() - 5) + "/Page" + node.getTextContent() + ".html"); } if (node.getNodeName().equals("xlink:title")) { r1.setTitle(node.getTextContent()); } } } long resourceid = -1; if (this.idmapping.get(r1.getUrl()) != null) { resourceid = this.idmapping.get(r1.getUrl()).getId(); r1.setId(resourceid); } if (resourceid == -1) { resourceid = this.resId + 1; this.resId = resourceid; resourceid = Long.valueOf(this.connector.getPrefix() + "" + resourceid); this.idmapping.put(r1.getUrl(), new IDMappingMining(resourceid, r1.getUrl(), platformId)); r1.setId(resourceid); } r1.setPosition(pos); r1.setPlatform(this.connector.getPlatformId()); tempRes.add(r1); this.resourceObj.put(r1.getUrl(), r1); this.fileNames.add(filename + "*"); if (this.courseResources.get(r1.getId()) == null) { final CourseResourceMining crm = new CourseResourceMining(); crm.setResource(r1); crm.setCourse(course); crm.setId(Long.valueOf(this.connector.getPrefix() + "" + (this.couResId + 1))); this.couResId++; crm.setPlatform(this.connector.getPlatformId()); this.courseResources.put(r1.getId(), crm); } pos++; } } long posT = 0; try { posT = (resource.getProcessingTime() / pos) - 1; } catch (final NumberFormatException e) { } for (int i = 0; i < tempRes.size(); i++) { tempRes.get(i).setProcessingTime(posT); } // Add the unlisted summary for each vlu final ResourceMining r1 = new ResourceMining(); r1.setDifficulty(resource.getDifficulty()); r1.setTitle(resource.getTitle()); r1.setType("Summary"); r1.setProcessingTime(resource.getProcessingTime() / content.getLength()); r1.setUrl(resource.getUrl().substring(0, resource.getUrl().length() - 5) + "/Page/summary.html"); if (this.resourceObj.get(r1.getUrl()) == null) { r1.setId(Long.valueOf(this.connector.getPrefix() + "" + (this.resId + 1))); this.resId++; } r1.setPosition(pos); if (this.resourceObj.get(r1.getUrl()) == null) { r1.setPlatform(this.connector.getPlatformId()); this.resourceObj.put(r1.getUrl(), r1); this.idmapping.put(r1.getUrl(), new IDMappingMining(r1.getId(), r1.getUrl(), this.connector.getPlatformId())); this.fileNames.add(filename + "*"); final CourseResourceMining crm = new CourseResourceMining(); crm.setResource(r1); crm.setCourse(course); crm.setId(Long.valueOf(this.connector.getPrefix() + "" + (this.couResId + 1))); this.couResId++; crm.setPlatform(this.connector.getPlatformId()); this.courseResources.put(r1.getId(), crm); } } } // ---- Error handling ---- } catch (final SAXParseException spe) { logger.info("\n** Parsing error, line " + spe.getLineNumber() + ", uri " + spe.getSystemId()); } catch (final SAXException sxe) { logger.info("\n** SAX error!"); } catch (final ParserConfigurationException pce) { logger.info("ParserConfigurationException: " + pce.getMessage()); } catch (final IOException ioe) { logger.info("IOException: " + ioe.getMessage()); } } /** * Returns a list of all files with the specified suffix in the directory and its subdirectories. * * @param directory * Directory containing the files and subdirectories. * @param suffix * File extension (returns all files if the string is empty) * @return An ArrayList containing all file names (absolute paths) contained in the given directory. */ private List<String> getFilenames(final String directory, final String suffix) { final ArrayList<String> all = new ArrayList<String>(); try { File f = new File(directory); logger.info("Gathering filenames from path: " + f.getAbsolutePath()); final ArrayList<String> dirs = new ArrayList<String>(); for (int i = 0; i < f.list().length; i++) { dirs.add(i, directory + "\\" + f.list()[i]); } while (!dirs.isEmpty()) { f = new File(dirs.get(0)); if (f.isDirectory()) { for (int i = 0; i < f.list().length; i++) { dirs.add(f + "\\" + f.list()[i]); } } if (f.isFile() && f.toString().endsWith(suffix)) { all.add(f.toString()); } dirs.remove(0); } } catch (final Exception e) { logger.info("Exception @ getFilenames" + e.getMessage()); } return all; } /** * Saves all objects of the XMLPackageParser into the Database. * Affected tables: Resource, DepartmentDegree, DegreeCourse, CourseResource * * @return Largest id in the id-mapping-table */ public void saveAllToDB() { final List<Collection<?>> li = new ArrayList<Collection<?>>(); li.add(this.levelObj.values()); li.add(this.courseObj.values()); li.add(this.resourceObj.values()); li.add(this.levelAssociations.values()); li.add(this.levelCourses.values()); li.add(this.courseResources.values()); li.add(this.idmapping.values()); final Session session = this.dbHandler.getMiningSession(); this.dbHandler.saveCollectionToDB(session, li); } /** * Reads all VLUs contained in the specified directory. * * @param directory * the directory */ public void readAllVlus(final String directory) { try { final Clock c = new Clock(); this.logger.info("Gathering filenames in directory..."); final List<String> all = this.getFilenames(directory, ".vlu"); Collections.sort(all); this.logger.info("Found " + all.size() + " files in directory." + c.getAndReset()); this.logger.info("Reading all vlu-files in directory..."); for (int i = 0; i < all.size(); i++) { this.readVLU(all.get(i)); if ((i > 0) && ((i % (all.size() / 10)) == 0)) { this.logger.info("Read " + (i / (all.size() / 10)) + "0 % in " + c.get()); } } } catch (final Exception e) { this.logger.error("Exception @ readAllVLUs ", e); } } public void clearMaps() { this.courseObj.clear(); this.courseResources.clear(); this.fileNames.clear(); this.idmapping.clear(); this.levelAssociations.clear(); this.levelCourses.clear(); this.levelObj.clear(); this.resourceObj.clear(); } }