EventExtractor.java example

/*
 * Copyright 2011 Stefan Partusch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.spartusch.nasfvi;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.spartusch.StringMethods;

/**
 * Extracts data of events from the university calendar of the University of
 * Munich. This data may be ingested into the
 * {@link de.spartusch.nasfvi.server.XmlIndex index}. This class is designed
 * as a stand-alone application. Start it from the command line and feed it
 * URLs pointing directly to the details of an event. The URLs are read from
 * the standard input. The extracted data is written to the standard output.
 * <p>The XML format written by this class is like this:</p>
 * <pre>
 * <veranstaltung>
 * 	<semester>...</semester>
 * 	<titel>...</titel>
 * 	<dozent>...</dozent>
 * 	<typ>...</typ>
 * 	<termin>...</termin>
 * 	<beschreibung>...</beschreibung>
 * </veranstaltung>
 * </pre> 
 * @author Stefan Partusch
 * @see <a href="https://lsf.verwaltung.uni-muenchen.de/qisserver/rds?state=user&type=0">University calendar, University of Munich</a>
 *
 */
public final class EventExtractor {
	/** Default charset for webpages and for output. */
	private static final String DEFAULT_CHARSET = "UTF-8";
	/** Regular expression to extract the type, title, semester and
	 * description of the event. */
	private static final Pattern FIELDS =
		Pattern.compile("<th[^>]*>(\\p{L}+)</th>\\s*<td[^>]*>(.+?)</td",
				Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
	/** Regular expression to split the value of the semester field. */
	private static final Pattern SEMESTER =
		Pattern.compile("^(WS|SS) (\\d\\d)(\\d\\d)$");
	/** A map from the name of a field to the corresponding XML tag. */
	private static final Map<String, String> FIELDS_TO_TAGS;
	static {
		FIELDS_TO_TAGS = new HashMap<String, String>();
		FIELDS_TO_TAGS.put("Veranstaltungsart", "typ");
		FIELDS_TO_TAGS.put("Langtext", "titel");
		FIELDS_TO_TAGS.put("Semester", "semester");
		FIELDS_TO_TAGS.put("Kommentar", "beschreibung");
	}

	private EventExtractor() {
		throw new AssertionError();
	}

	/**
	 * Loads the content of an URL using a HTTP GET request.
	 * @param url URL to load
	 * @return Content of <code>url</code>
	 * @throws IOException If some IO problem occurs
	 */
	private static String load(final String url) throws IOException {
		HttpURLConnection conn =
			(HttpURLConnection) new URL(url).openConnection();

		conn.setDoInput(true);
		conn.setDoOutput(false);
		conn.setRequestMethod("GET");
		conn.connect();

		int respCode = conn.getResponseCode();
		if (respCode != HttpURLConnection.HTTP_OK) {
			throw new IOException(conn.getResponseMessage());
		}

		String contentType =
			conn.getHeaderField("Content-Type").toUpperCase(Locale.ENGLISH);
		String csName = DEFAULT_CHARSET;
		int csPos = contentType.lastIndexOf("CHARSET=");
		if (csPos != -1) {
			csName = contentType.substring(csPos + "CHARSET=".length()).trim();
		}
		Charset charset = Charset.forName(csName);

		InputStream is = conn.getInputStream();
		if (is == null) {
			throw new IOException("No input stream");
		}
		InputStreamReader isr = new InputStreamReader(is, charset);
		StringBuilder sb = new StringBuilder();
		try {
			int c;
			while ((c = isr.read()) != -1) {
				sb.appendCodePoint(c);
			}
		} finally {
			isr.close();
		}

		return sb.toString();
	}

	/**
	 * Extracts the type, title, semester and description of the event. The
	 * extracted values and corresponding tags are added to the StringBuilder.
	 * @param xml StringBuilder to add the extracted data to
	 * @param input Source code of the webpage
	 */
	private static void addFields(StringBuilder xml, final String input) {
		Matcher m = FIELDS.matcher(input);

		while (m.find()) {
			String tag = FIELDS_TO_TAGS.get(m.group(1));

			if (tag != null) {
				String value = m.group(2);

				// test for semester
				Matcher semMatcher = SEMESTER.matcher(value);
				if (semMatcher.find()) {
					if (semMatcher.group(1).equals("WS")) {
						value = "20" + semMatcher.group(2)
							+ "/20" + semMatcher.group(3);
					} else {
						value = semMatcher.group(2) + semMatcher.group(3);
					}
				}

				addTag(xml, tag, value);
			}
		}
	}

	/**
	 * Extracts the names of the lecturers of the event. The extracted values
	 * and corresponding tags are added to the StringBuilder.
	 * @param xml StringBuilder to add the extracted data to
	 * @param input Source code of the webpage
	 * @throws IOException If an IO exception occurs
	 */
	private static void addLecturers(StringBuilder xml,
			final String input) throws IOException {
		String table = StringMethods.getDelimitedSubstring(input,
				"<table summary=\"Verantwortliche Dozenten\">",
				"</table>");
		BufferedReader br = new BufferedReader(
				new StringReader(StringMethods.stripHTML(table, false)));

		String line;
		boolean first = true; // ignore the caption
		while ((line = br.readLine()) != null) {
			if (first) {
				first = false;
			} else {
				addTag(xml, "dozent", line);
			}
		}
	}

	/**
	 * Extracts the dates and rooms of the event. The extracted values and
	 * corresponding tags are added to the StringBuilder.
	 * @param xml StringBuilder to add the extracted data to
	 * @param input Source code of the webpage
	 */
	private static void addDates(StringBuilder xml, final String input) {
		String table = StringMethods.getDelimitedSubstring(input,
				"<table summary=\"Übersicht über alle Veranstaltungstermine\">",
				"</table>");
		
		if(table == null)
			return;
		
		String[] rows = table.split("</?tr>");
		
		for (int i = 2; i < rows.length; i++) {
			// ignore first row
			String[] cells = rows[i].split("</?td>");
			
			if (cells.length > 4) {
				addTag(xml, "termin", cells[0] + "<br>" + cells[4]);
			}
		}
	}

	/**
	 * Adds a tag to the StringBuilder.
	 * @param xml StringBuilder to add the tag to
	 * @param name Name of the tag
	 * @param value Value of the tag
	 */
	private static void addTag(StringBuilder xml, final String name,
			final String value) {
		String v = StringMethods.stripHTML(value, true);
		
		v = v.replace("&", "&");
		v = v.replace("<", "<").replace(">", ">");
		v = v.replaceAll("\\s+", " ");

		xml.append("\t<").append(name).append(">");
		xml.append(v.trim());
		xml.append("</").append(name).append(">\n");
	}

	/**
	 * Extracts all relevant data from the event.
	 * @param input Source code of the webpage with all details of the event
	 * @return Extracted data in the required XML format
	 * @throws IOException If an IO error occurs
	 */
	private static String parse(final String input) throws IOException {
		StringBuilder xml = new StringBuilder("<veranstaltung>\n");
		addFields(xml, input);
		addLecturers(xml, input);
		addDates(xml, input);
		return xml.append("</veranstaltung>").toString();
	}

	/**
	 * Entry point of the stand-alone application.
	 * @param args Not used
	 * @throws IOException If an IO error occurs
	 */
	public static void main(final String[] args) throws IOException {
		BufferedReader in =
			new BufferedReader(new InputStreamReader(System.in));
		PrintStream out = new PrintStream(System.out, true, DEFAULT_CHARSET);

		String line;
		while ((line = in.readLine()) != null) {
			out.println(parse(load(line)));
		}
	}
}