/**
* @(#) CourseParser.java
*
* This file is part of the Course Scheduler, an open source, cross platform
* course scheduling tool, configurable for most universities.
*
* Copyright (C) 2010-2014 Devyse.io; All rights reserved.
*
* @license GNU General Public License version 3 (GPLv3)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package io.devyse.scheduler.parse.jsoup.banner;
import io.devyse.scheduler.parse.jsoup.AbstractParser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.slf4j.ext.XLogger;
import org.slf4j.ext.XLoggerFactory;
/**
* A CourseParser parses a sub-document extracted from the main course search results document
* in order to find the course data.
*
* @author Mike Reinhold
* @since 4.12.4
*/
public class CourseParser extends AbstractParser<Map<String, String>>{
/**
* Static logger
*/
private static XLogger logger = XLoggerFactory.getXLogger(CourseParser.class);
/**
* Serial Version UID
*/
private static final long serialVersionUID = 1L;
/**
* Create a new CourseParser for parsing the Course document that is extracted
* from the main course search results page
*
* @param document the course document
* @param timeout the socket connection timeout for any created connections
*/
public CourseParser(Document document, int timeout){
super(document, timeout);
}
/* (non-Javadoc)
* @see io.devyse.scheduler.parse.jsoup.AbstractParser#parse(org.jsoup.nodes.Document)
*/
protected void parse(Document document) throws IOException{
Map<String, String> values = new HashMap<>();
Element sectionHeaderElement = document.select("tr > th.ddtitle > a[href]").first();
String sectionDetailURL = sectionHeaderElement.absUrl("href");
String sectionHeaderText = sectionHeaderElement.text();
values.put("header", sectionHeaderText);
values.put("url", sectionDetailURL);
Element termElement = document.select("tr > td.dddefault > span:containsOwn(Associated Term)").first();
String term = termElement.nextSibling().toString();
values.put("term", term);
Element registrationElement = document.select("tr > td.dddefault > span:containsOwn(Registration Dates)").first();
String registration = registrationElement.nextSibling().toString();
values.put("registration", registration);
Element levelsElement = document.select("tr > td.dddefault > span:containsOwn(Levels)").first();
String levels = levelsElement.nextSibling().toString();
values.put("levels", levels);
Element campusElement = document.select("tr > td.dddefault > br + br").first();
String campus = campusElement.nextSibling().toString();
values.put("campus", campus);
Element scheduleTypeElement = campusElement.nextElementSibling();
String scheduleType = scheduleTypeElement.nextSibling().toString();
values.put("type", scheduleType);
Element creditElement = scheduleTypeElement.nextElementSibling();
String credit = creditElement.nextSibling().toString();
values.put("credit", credit);
Element catalogEntryElement = document.select("tr > td.dddefault > a[href]").first();
String catalogEntryURL = catalogEntryElement.absUrl("href");
values.put("catalog", catalogEntryURL);
parseCourseDetail(Jsoup.connect(sectionDetailURL).timeout(this.getTimeout()).get(), values);
parseCatalogEntry(Jsoup.connect(catalogEntryURL).timeout(this.getTimeout()).get(), values);
Elements meetingHeaderElements = document.select("table.datadisplaytable:has(caption:containsOwn(Scheduled Meeting Times)) th.ddheader");
Elements meetingRowElements = document.select("table.datadisplaytable:has(caption:containsOwn(Scheduled Meeting Times)) tr:has(td.dddefault)");
List<String> headers = new ArrayList<>();
for(Element meetingHeader : meetingHeaderElements){
headers.add(meetingHeader.text());
}
int row = 0;
for(Element meetingRow : meetingRowElements){
Elements meetingValues = meetingRow.select("td");
int index=0;
values.put("meeting."+row, Boolean.TRUE.toString());
for(Element meetingValue: meetingValues){
values.put("meeting."+row+"."+headers.get(index), meetingValue.text());
index++;
}
row++;
}
this.setRawResult(values);
}
/**
* Parse the Catalog Entry page for a given course to retrieve the long description of the course, the credit
* hour breakdown, and the department of the course
*
* @param document the Catalog Entry page HTML document
* @param values the retrieved course data set, including the newly added Catalog Entry values
*/
private void parseCatalogEntry(Document document, Map<String, String> values){
//Long description is in the first text node in the table
Element longDetailElement = document.select("table.datadisplaytable td.ntdefault").first();
String longDetail = longDetailElement.textNodes().get(0).toString();
values.put("description", longDetail);
//Credit hours are in TextNodes following the long description
List<TextNode> creditNodes = longDetailElement.textNodes();
for(TextNode creditNode : creditNodes){
String text = creditNode.text();
try(Scanner scanner = new Scanner(text);){
scanner.useDelimiter(" ");
if(text.contains("TO")){
logger.debug("Found credit range entry, will attempt to use max value in range. Range: {}", text);
//Some catalog entries use the "X.000 TO Y.000" Credits format for the credit hours
//in almost all cases, X is 0, so we take Y as the credit count - skip "X.000" and "TO"
scanner.next();
scanner.next();
}
if(scanner.hasNextDouble()){
double value = scanner.nextDouble();
String component = scanner.next();
values.put("credit."+component, Double.toString(value));
logger.debug("Found credit hour entry: {}={}", component, value);
}else{
logger.debug("Expected credit hour text node, found instead: {}", text);
}
}
}
//Department always seems to be 3rd text node from the end of the table
String department = longDetailElement.textNodes().get(longDetailElement.textNodes().size()-3).toString();
values.put("department", department);
}
/**
* Parse the Section Detail information page to retrieve the seating availability, registration restrictions,
* and prerequisites information
*
* @param document the Section Detail page HTML document
* @param values the retrieved course data set, including the newly added Section Detail values
*/
private void parseCourseDetail(Document document, Map<String, String> values){
Elements availabilityHeaders = document.select("caption:containsOwn(Registration Availability) + tbody th.ddheader span");
Elements availabilityValues = document.select("caption:containsOwn(Registration Availability) + tbody td.dddefault");
for(int pos = 0; pos < availabilityHeaders.size(); pos++){
String header = availabilityHeaders.get(pos).text();
String value = availabilityValues.get(pos).text();
values.put("seating." + header, value);
}
Element restrictionElement = document.select("span:containsOwn(Restriction)").first();
try{
for(Node node = restrictionElement.nextSibling(); !(node instanceof Element && ((Element)node).tag().equals(Tag.valueOf("span"))); node = node.nextSibling()){
logger.debug("Restriction: {}", node);
///TODO handle the restrictions list - grouping of restrictions (or restriction list elements) indicated by indentation
}
}catch(NullPointerException e){
//Not all courses will have restrictions and this element is only present if restrictions exist
logger.debug("No restriction found", e);
}
Element prerequisiteElement = document.select("span:containsOwn(Prerequisite)").first();
try{
for(Node node = prerequisiteElement.nextSibling(); node != null; node = node.nextSibling()){
logger.debug("Prereq: {}", node);
//TODO handle the prerequisite list - can be AND-OR or OR-AND formatted (keywords 'and' 'or' present to indicate w/ parentheses for grouping
}
} catch(Exception e){
//Not all courses will have prerequisites and this element is only present if prerequisites exist
logger.debug("No prequisite found", e);
}
}
}