/*
Copyright (C) 2011 Josh Schreuder
This file is part of SMSnatcher.
SMSnatcher is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SMSnatcher is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SMSnatcher. If not, see <http://www.gnu.org/licenses/>.
*/
package model;
import java.io.IOException;
import java.net.URLEncoder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
/**
 * Fetches song lyrics by scraping the LyricWiki page for an artist/title pair.
 */
public class LyricWikiScraper {

    /** Site root; a song page is addressed as {@code <root><Artist>:<Title>}. */
    private static final String BASE_URL = "http://lyrics.wikia.com/";

    /** Text LyricWiki shows when only an excerpt of the lyrics may be displayed. */
    private static final String PARTIAL_LYRICS_NOTICE =
            "we are not licensed to display the full lyrics";

    /**
     * Markup literals stripped from the lyric box HTML. All three spellings of the
     * line-break tag are listed because the serialized form depends on the jsoup
     * version and output settings in use.
     */
    private static final String[] STRIPPED_MARKUP = {
        "<br />", "<br/>", "<br>", "<i>", "</i>", "<b>", "</b>", "<p>", "</p>"
    };

    /**
     * Downloads the LyricWiki page for the given song and extracts its lyrics as text.
     *
     * @param artist artist name; spaces are converted to underscores for the page address
     * @param title  song title; spaces are converted to underscores for the page address
     * @return the lyrics text; {@code "Instrumental"} when the lyric box links to an
     *         "Instrumental" entry; or an empty string when either argument is
     *         {@code null}, the page cannot be fetched, the page has no lyric box text,
     *         or only partial lyrics are available
     */
    public static String getLyrics(String artist, String title) {
        // Without both parts there is no page to look up; report it like any other miss.
        if (artist == null || title == null) {
            Logger.LogToStatusBar("Lyrics not found!");
            return "";
        }

        // Prepare artist and title for LyricWiki's URL format
        String pageArtist = artist.replace(' ', '_');
        String pageTitle = title.replace(' ', '_');
        Logger.LogToStatusBar("Getting lyrics ("+pageArtist+" : " +pageTitle+")!");

        try {
            // Percent-encode each part so characters such as '?', '#', '/' or non-ASCII
            // letters stay inside the page name instead of altering the URL structure.
            String url = BASE_URL + encodeUrlPart(pageArtist) + ":" + encodeUrlPart(pageTitle);
            Logger.LogToStatusBar(url);

            // Load page into Document
            Document doc = Jsoup.connect(url).get();

            // Get lyricBox from page; hasText() is also false when nothing matched,
            // so the get(0) calls below are safe past this check.
            Elements lyricBox = doc.select("div.lyricbox");
            if (!lyricBox.hasText()) {
                Logger.LogToStatusBar("Lyrics not found!");
                return "";
            }

            // Remove ads and junk
            lyricBox.get(0).select("div.rtMatcher").remove();
            lyricBox.get(0).select("div.lyricsbreak").remove();
            // Remove comments
            ParseUtils.removeComments(lyricBox.get(0));

            // Serialize what is left and unescape HTML entities in it.
            String lyrics = lyricBox.get(0).html();
            TextNode decoded = TextNode.createFromEncoded(lyrics, "lyricwiki");
            lyrics = decoded.getWholeText();

            // Remove minimal HTML tags, leaving newlines intact. These are literal
            // substitutions, so plain replace() is used rather than regex replaceAll().
            for (String markup : STRIPPED_MARKUP) {
                lyrics = lyrics.replace(markup, "");
            }
            // Decode angle-bracket entities that are still escaped at this point
            // (presumably text that was escaped twice on the page).
            lyrics = lyrics.replace("&lt;", "<");
            lyrics = lyrics.replace("&gt;", ">");
            // Swap the placeholder character for an apostrophe
            // (presumably it marks an apostrophe that was mis-decoded upstream).
            lyrics = lyrics.replace("�", "\'");

            // Check if LyricWiki has full lyrics or only portion
            if (lyrics.contains(PARTIAL_LYRICS_NOTICE)) {
                return "";
            }
            if (lyricBox.get(0).select("a").attr("title").contains("Instrumental")) {
                return "Instrumental";
            }

            Logger.LogToStatusBar("Done");
            return lyrics;
        } catch (IOException e) {
            // The page could not be fetched. Return right away so the status bar keeps
            // showing the failure, matching the "no lyric box" path above.
            Logger.LogToStatusBar("Lyrics not found!");
            return "";
        }
    }

    /** Percent-encodes one part of the page name as UTF-8. */
    private static String encodeUrlPart(String part) throws IOException {
        return URLEncoder.encode(part, "UTF-8");
    }
}