CourseParser.java example

Explorer
scheduler-legacy-master
- src
  - main
    - java
/**
 * @(#) CourseParser.java
 *
 * This file is part of the Course Scheduler, an open source, cross platform
 * course scheduling tool, configurable for most universities.
 *
 * Copyright (C) 2010-2014 Devyse.io; All rights reserved.
 *
 * @license GNU General Public License version 3 (GPLv3)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package io.devyse.scheduler.parse.jsoup.banner;

import io.devyse.scheduler.parse.jsoup.AbstractParser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.slf4j.ext.XLogger;
import org.slf4j.ext.XLoggerFactory;

/**
 * A CourseParser parses a sub-document extracted from the main course search results document
 * in order to find the course data.
 * 
 * @author Mike Reinhold
 * @since 4.12.4
 */
public class CourseParser extends AbstractParser<Map<String, String>>{

	/**
	 * Static logger
	 */
	private static final XLogger logger = XLoggerFactory.getXLogger(CourseParser.class);
	
	/**
	 * Serial Version UID
	 */
	private static final long serialVersionUID = 1L;

	/**
	 * Selector for the table captioned "Scheduled Meeting Times"; the header and row
	 * selectors are built by appending to this prefix
	 */
	private static final String MEETING_TABLE_SELECTOR = "table.datadisplaytable:has(caption:containsOwn(Scheduled Meeting Times))";

	/**
	 * The span tag, used to detect the end of the restriction list
	 */
	private static final Tag SPAN_TAG = Tag.valueOf("span");

	/**
	 * Create a new CourseParser for parsing the Course document that is extracted
	 * from the main course search results page
	 * 
	 * @param document the course document
	 * @param timeout the socket connection timeout for any created connections
	 */
	public CourseParser(Document document, int timeout){
		super(document, timeout);
	}

	/**
	 * Parse the course document into a flat key/value map and store it as the raw result.
	 * 
	 * The section header, term, registration dates, levels, campus, schedule type, credit
	 * and catalog link are read from the course document itself. The section detail page and
	 * the catalog entry page are then fetched (using this parser's timeout) and parsed for 
	 * additional values, and finally the scheduled meeting times table is read.
	 * 
	 * @param document the course document
	 * @throws IOException if the section detail page or the catalog entry page cannot be retrieved
	 * @throws NullPointerException if one of the required elements is missing from the course document
	 * 
	 * @see io.devyse.scheduler.parse.jsoup.AbstractParser#parse(org.jsoup.nodes.Document)
	 */
	protected void parse(Document document) throws IOException{
		Map<String, String> values = new HashMap<>();
		
		Element sectionHeaderElement = requireFirst(document, "tr > th.ddtitle > a[href]");
		String sectionDetailURL = sectionHeaderElement.absUrl("href");
		values.put("header", sectionHeaderElement.text());
		values.put("url", sectionDetailURL);
		
		//each of these values is the node that immediately follows its label span
		Element termElement = requireFirst(document, "tr > td.dddefault > span:containsOwn(Associated Term)");
		values.put("term", followingNodeText(termElement, "Associated Term"));
		
		Element registrationElement = requireFirst(document, "tr > td.dddefault > span:containsOwn(Registration Dates)");
		values.put("registration", followingNodeText(registrationElement, "Registration Dates"));
		
		Element levelsElement = requireFirst(document, "tr > td.dddefault > span:containsOwn(Levels)");
		values.put("levels", followingNodeText(levelsElement, "Levels"));
		
		//campus, schedule type and credit have no label; they follow consecutive elements 
		//starting at the first <br> that directly follows another <br>
		Element campusElement = requireFirst(document, "tr > td.dddefault > br + br");
		values.put("campus", followingNodeText(campusElement, "campus"));
				
		Element scheduleTypeElement = Objects.requireNonNull(
				campusElement.nextElementSibling(), 
				"No element follows the campus entry; unable to locate the schedule type"
		);
		values.put("type", followingNodeText(scheduleTypeElement, "schedule type"));
		
		Element creditElement = Objects.requireNonNull(
				scheduleTypeElement.nextElementSibling(), 
				"No element follows the schedule type entry; unable to locate the credit"
		);
		values.put("credit", followingNodeText(creditElement, "credit"));
		
		Element catalogEntryElement = requireFirst(document, "tr > td.dddefault > a[href]");
		String catalogEntryURL = catalogEntryElement.absUrl("href");
		values.put("catalog", catalogEntryURL);
		
		parseCourseDetail(Jsoup.connect(sectionDetailURL).timeout(this.getTimeout()).get(), values);
		parseCatalogEntry(Jsoup.connect(catalogEntryURL).timeout(this.getTimeout()).get(), values);
	
		parseMeetingTimes(document, values);
		
		this.setRawResult(values);
	}

	/**
	 * Find the first element matching the selector, failing with a descriptive message
	 * (instead of a bare NullPointerException at the point of use) if nothing matches
	 * 
	 * @param document the document to search
	 * @param selector the CSS selector of the required element
	 * @return the first matching element, never null
	 * @throws NullPointerException if no element matches the selector
	 */
	private static Element requireFirst(Document document, String selector){
		return Objects.requireNonNull(
				document.select(selector).first(), 
				"Required element not found for selector: " + selector
		);
	}

	/**
	 * Retrieve the string form of the node that immediately follows the specified element
	 * 
	 * @param element the element that precedes the value node
	 * @param label a description of the value, used in the failure message
	 * @return the result of toString() on the element's next sibling node
	 * @throws NullPointerException if the element has no next sibling
	 */
	private static String followingNodeText(Element element, String label){
		Node following = Objects.requireNonNull(
				element.nextSibling(), 
				"No value follows the " + label + " entry"
		);
		return following.toString();
	}

	/**
	 * Parse the Scheduled Meeting Times table of the course document. Each row is flagged under 
	 * the key "meeting.ROW" and each cell is stored as "meeting.ROW.HEADER", where the header is
	 * the text of the header cell in the same position as the value cell.
	 * 
	 * @param document the course document
	 * @param values the retrieved course data set, including the newly added meeting values
	 */
	private static void parseMeetingTimes(Document document, Map<String, String> values){
		List<String> headers = new ArrayList<>();
		for(Element meetingHeader : document.select(MEETING_TABLE_SELECTOR + " th.ddheader")){
			headers.add(meetingHeader.text());
		}
		
		int row = 0;
		for(Element meetingRow : document.select(MEETING_TABLE_SELECTOR + " tr:has(td.dddefault)")){
			values.put("meeting."+row, Boolean.TRUE.toString());
			
			int index = 0;
			for(Element meetingValue : meetingRow.select("td")){
				if(index >= headers.size()){
					//a cell without a header cannot be given a key; skip the rest of the row rather than fail
					logger.warn("Meeting row {} has more cells than the {} available headers; ignoring the extra cells", row, headers.size());
					break;
				}
				values.put("meeting."+row+"."+headers.get(index), meetingValue.text());
				index++;
			}
			row++;
		}
	}
	
	/**
	 * Parse the Catalog Entry page for a given course to retrieve the long description of the course, the credit 
	 * hour breakdown, and the department of the course.
	 * 
	 * If the page does not contain the expected detail cell, or the cell contains too few text nodes to 
	 * locate a value, the affected values are skipped and a warning is logged.
	 * 
	 * @param document the Catalog Entry page HTML document
	 * @param values the retrieved course data set, including the newly added Catalog Entry values
	 */
	private void parseCatalogEntry(Document document, Map<String, String> values){
		Element longDetailElement = document.select("table.datadisplaytable td.ntdefault").first();
		if(longDetailElement == null){
			logger.warn("Catalog entry page has no detail cell; skipping description, credit hours and department");
			return;
		}
		
		//textNodes() builds a new list on every call, so retrieve it once
		List<TextNode> detailNodes = longDetailElement.textNodes();
		if(detailNodes.isEmpty()){
			logger.warn("Catalog entry detail cell has no text; skipping description, credit hours and department");
			return;
		}
		
		//Long description is in the first text node in the table
		values.put("description", detailNodes.get(0).toString());
		
		//Credit hours are in TextNodes following the long description; every text node is 
		//inspected and those that do not look like a credit hour entry are only logged
		for(TextNode creditNode : detailNodes){
			parseCreditHours(creditNode.text(), values);
		}
		
		//Department always seems to be 3rd text node from the end of the table
		int departmentIndex = detailNodes.size() - 3;
		if(departmentIndex >= 0){
			values.put("department", detailNodes.get(departmentIndex).toString());
		}else{
			logger.warn("Catalog entry has only {} text nodes; unable to locate the department", detailNodes.size());
		}
	}

	/**
	 * Attempt to read a credit hour entry of the form "X.000 COMPONENT ..." (or the range form 
	 * "X.000 TO Y.000 COMPONENT ...") from the text and, if found, store it as "credit.COMPONENT".
	 * Text that does not match is logged at debug level and otherwise ignored.
	 * 
	 * @param text the content of a single text node from the catalog entry
	 * @param values the retrieved course data set, including any newly found credit hour value
	 */
	private static void parseCreditHours(String text, Map<String, String> values){
		try(Scanner scanner = new Scanner(text)){
			scanner.useDelimiter(" ");
			//the page uses "." as the decimal separator; without a fixed locale a JVM whose default
			//locale uses "." for grouping would read "3.000" as three thousand
			scanner.useLocale(Locale.US);
			
			if(text.contains("TO")){
				logger.debug("Found credit range entry, will attempt to use max value in range. Range: {}", text);
				//Some catalog entries use the "X.000 TO Y.000" Credits format for the credit hours
				//in almost all cases, X is 0, so we take Y as the credit count - skip "X.000" and "TO"
				for(int skipped = 0; skipped < 2 && scanner.hasNext(); skipped++){
					scanner.next();
				}
			} 
			
			if(!scanner.hasNextDouble()){
				logger.debug("Expected credit hour text node, found instead: {}", text);
				return;
			}
			double value = scanner.nextDouble();
			
			//a number with nothing after it has no component name to file the value under
			if(!scanner.hasNext()){
				logger.debug("Expected credit hour text node, found instead: {}", text);
				return;
			}
			String component = scanner.next();
			
			values.put("credit."+component, Double.toString(value));
			logger.debug("Found credit hour entry: {}={}", component, value);
		}
	}
	
	/**
	 * Parse the Section Detail information page to retrieve the seating availability, registration restrictions,
	 * and prerequisites information. 
	 * 
	 * Only the seating availability is currently stored (as "seating.HEADER"); restrictions and prerequisites 
	 * are located and logged at debug level but not yet added to the data set.
	 * 
	 * @param document the Section Detail page HTML document
	 * @param values the retrieved course data set, including the newly added Section Detail values
	 */
	private void parseCourseDetail(Document document, Map<String, String> values){
		Elements availabilityHeaders = document.select("caption:containsOwn(Registration Availability) + tbody th.ddheader span");
		Elements availabilityValues = document.select("caption:containsOwn(Registration Availability) + tbody td.dddefault");
		
		//headers and values are paired by position; only pair as many as both lists provide
		int pairCount = Math.min(availabilityHeaders.size(), availabilityValues.size());
		if(availabilityHeaders.size() != availabilityValues.size()){
			logger.debug("Found {} availability headers but {} availability values", availabilityHeaders.size(), availabilityValues.size());
		}
		for(int pos = 0; pos < pairCount; pos++){
			String header = availabilityHeaders.get(pos).text();
			String value = availabilityValues.get(pos).text();
			values.put("seating." + header, value);
		}
		
		//Not all courses will have restrictions and this element is only present if restrictions exist
		Element restrictionElement = document.select("span:containsOwn(Restriction)").first();
		if(restrictionElement == null){
			logger.debug("No restriction found");
		}else{
			//the restriction list runs until the next span or until there are no more siblings
			for(Node node = restrictionElement.nextSibling(); node != null && !isSpan(node); node = node.nextSibling()){
				logger.debug("Restriction: {}", node);
				///TODO handle the restrictions list - grouping of restrictions (or restriction list elements) indicated by indentation
			}
		}
		
		//Not all courses will have prerequisites and this element is only present if prerequisites exist
		Element prerequisiteElement = document.select("span:containsOwn(Prerequisite)").first();
		if(prerequisiteElement == null){
			logger.debug("No prequisite found");
		}else{
			for(Node node = prerequisiteElement.nextSibling(); node != null; node = node.nextSibling()){
				logger.debug("Prereq: {}", node);
				//TODO handle the prerequisite list - can be AND-OR or OR-AND formatted (keywords 'and' 'or' present to indicate w/ parentheses for grouping
			}
		}
	}

	/**
	 * Check if the node is a span element
	 * 
	 * @param node the node to check
	 * @return true if the node is an element whose tag is the span tag
	 */
	private static boolean isSpan(Node node){
		return node instanceof Element && ((Element)node).tag().equals(SPAN_TAG);
	}
}