package model.collector.lyricstime;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import model.collector.Lyrics;
import model.regex.Regex;
/**
 * Downloads the HTML document behind a URL and extracts the content of its
 * {@code <div id="songlyrics" ...>} element into a {@link Lyrics} object.
 *
 * <p>
 * All work happens in the constructor; afterwards the result is available
 * through {@link #getLyrics()}.
 */
public class HTMLParser {
    /**
     * the logger
     */
    private final Logger logger = Logger.getLogger(this.getClass().getName());

    /**
     * the regex to get the lyrics out of the html code; group 1 is the content
     * of the {@code songlyrics} div.
     *
     * NOTE(review): this instance is shared by all parsers and is queried in
     * two steps (matches, then getGroup). If {@code Regex} keeps the match
     * state internally, concurrent parsers could interfere - confirm against
     * the Regex implementation.
     */
    private static final Regex LYRICS_REGEX = new Regex(".*<div id=\"songlyrics\" style=.*?>(.*?)</div>.*", Pattern.DOTALL);

    /**
     * the lyrics
     */
    private final Lyrics lyrics;

    /**
     * gets the lyrics
     *
     * @return the lyrics object created by the constructor; its text is only
     *         set if the lyrics section was found in the downloaded page
     */
    public Lyrics getLyrics() {
        return this.lyrics;
    }

    /**
     * Constructor. Downloads the page and extracts the lyrics from it.
     *
     * @param url
     *            the url where to find the html code
     *
     * @throws IOException
     *             if the url is malformed or the page can not be read
     */
    public HTMLParser(String url) throws IOException {
        // avoid building the message when FINER logging is switched off
        if (logger.isLoggable(Level.FINER)) {
            logger.log(Level.FINER, "parse html. URL: " + url);
        }

        String html = readHtml(url);

        lyrics = new Lyrics();
        extractLyrics(html);
    }

    /**
     * Reads the whole document behind the given url as UTF-8 text.
     *
     * @param url
     *            the url to read from
     *
     * @return the document with lines joined by "\n", trimmed
     *
     * @throws IOException
     *             if the url is malformed or reading fails
     */
    private static String readHtml(String url) throws IOException {
        InputStream is = new URL(url).openConnection().getInputStream();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\n");
            }
            return sb.toString().trim();
        } finally {
            // always release the connection stream, also when reading fails
            is.close();
        }
    }

    /**
     * extracts the lyrics by using the defined regex and removing the
     * {@code <p>}, {@code </p>} and {@code <br />} tags. If the regex does not
     * match, the lyrics object is left untouched.
     *
     * @param html
     *            the html code of the website
     */
    private void extractLyrics(String html) {
        if (!LYRICS_REGEX.matches(html)) {
            return;
        }

        String text = LYRICS_REGEX.getGroup(1);
        text = text.replace("<p>", "");
        text = text.replace("</p>", "");
        text = text.replace("<br />", "");

        this.lyrics.setLyrics(text.trim());
    }
}