HTMLParser.java example

Explorer
MP3ToolKit-master
- src
  - main
    - java
package model.collector.lyricstime;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import model.collector.Lyrics;
import model.regex.Regex;

/**
 * Downloads the HTML page behind a URL and extracts the song lyrics contained
 * in its <code>&lt;div id="songlyrics" ...&gt;</code> element.
 * 
 * The download and the extraction both happen in the constructor; the result
 * is available through {@link #getLyrics()} afterwards.
 */
public class HTMLParser {

	/**
	 * the logger
	 */
	private final Logger logger = Logger.getLogger(this.getClass().getName());

	/**
	 * the lyrics
	 */
	private Lyrics lyrics;

	/**
	 * the regex to get the lyrics out of the html code. Group 1 captures the
	 * content of the "songlyrics" div.
	 * 
	 * NOTE(review): this instance is shared by all parsers and is queried in
	 * two steps (matches, then getGroup). If Regex keeps match state
	 * internally, concurrent use is unsafe - confirm against model.regex.Regex.
	 */
	private final static Regex regex = new Regex(".*<div id=\"songlyrics\" style=.*?>(.*?)</div>.*", Pattern.DOTALL);

	/**
	 * gets the lyrics
	 * 
	 * @return the lyrics object created by the constructor; its text is only
	 *         set if the lyrics pattern matched the downloaded page
	 */
	public Lyrics getLyrics() {
		return this.lyrics;
	}

	/**
	 * Constructor. Reads the whole page at the given url as UTF-8 text and
	 * extracts the lyrics from it.
	 * 
	 * @param url
	 *            the url where to find the html code
	 * 
	 * @throws IOException
	 *             thrown if the url is malformed, the connection can not be
	 *             opened or reading the page fails
	 */
	public HTMLParser(String url) throws IOException {
		logger.log(Level.FINER, "parse html. URL: " + url);

		StringBuilder sb = new StringBuilder();

		// try-with-resources closes the reader and the underlying stream even
		// if reading fails half way through, so no connection is leaked
		try (InputStream is = new URL(url).openConnection().getInputStream();
				BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
			String line;
			while ((line = reader.readLine()) != null) {
				// readLine() strips the line terminator, so re-add one
				sb.append(line).append("\n");
			}
		}

		String html = sb.toString().trim();
		lyrics = new Lyrics();

		extractLyrics(html);
	}

	/**
	 * extracts the lyrics by using the defined regex and replacing all html
	 * related tags. If the regex does not match, the lyrics object is left
	 * untouched.
	 * 
	 * @param html
	 *            the html code of the website
	 */
	private void extractLyrics(String html) {
		if (!regex.matches(html)) {
			return;
		}

		// strip the only markup that is removed here: <p>, </p> and <br />
		String text = regex.getGroup(1);
		text = text.replace("<p>", "");
		text = text.replace("</p>", "");
		text = text.replace("<br />", "");

		this.lyrics.setLyrics(text.trim());
	}
}