/*
Copyright (C) 2011 Josh Schreuder
This file is part of SMSnatcher.
SMSnatcher is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SMSnatcher is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SMSnatcher. If not, see <http://www.gnu.org/licenses/>.
*/
package model;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Looks up song lyrics on songmeanings.net.
 *
 * <p>Artist and song names are resolved against the maps held by
 * {@code DataManager} using a case-insensitive Levenshtein similarity. An exact
 * match is used directly; candidates that are merely close are recorded in the
 * corresponding {@code DataManager} match map so the caller can prompt the user
 * later. All methods signal "nothing found" by returning the empty string.
 */
public class SongMeaningsScraper {

    /** Prefix prepended to the site-relative song paths stored in the song map. */
    private static final String SITE_ROOT = "http://songmeanings.net";

    /** A space followed by a parenthesised suffix, e.g. {@code " (Live)"}. */
    private static final Pattern PARENTHESISED_SUFFIX = Pattern.compile(" \\(.*\\)");

    /** Minimum similarity for a candidate to be recorded as a possible match. */
    private static final double CLOSE_MATCH_THRESHOLD = 0.5;

    /** Markup removed verbatim from the scraped lyric HTML (line breaks in the text are kept). */
    private static final String[] STRIPPED_TAGS = {
        "<br />", "<br/>", "<br>", "<i>", "</i>", "<b>", "</b>", "<p>", "</p>"
    };

    /**
     * Fetches the lyrics for a song.
     *
     * @param artist artist name as tagged by the user
     * @param title  song title as tagged by the user
     * @return the lyrics prefixed with a single space, or {@code ""} when the
     *         artist or song could not be matched exactly, the artist has no
     *         songs, the page could not be scraped, or the lyrics are blocked
     * @throws Exception propagated from {@code SMDataParser.parseSongsPage}
     */
    public static String getLyrics(String artist, String title) throws Exception {
        // A correction saved by the user takes precedence over fuzzy matching.
        String artistMatch = DataManager.getArtistCorrections().get(artist);
        if (artistMatch == null) {
            artistMatch = validateArtist(artist);
        }
        // No exact artist match: the caller has to prompt the user later.
        if (artistMatch.isEmpty()) {
            Logger.LogToStatusBar("No artist match, we'll have to prompt later...");
            return "";
        }

        HashMap<String, String> songList = DataManager.getSongMap().get(artistMatch);
        if (songList == null) {
            // Songs for this artist have not been loaded yet; parse them now.
            Logger.LogToStatusBar("No track present, have to parse them...");
            if (SMDataParser.parseSongsPage(artistMatch, DataManager.getArtistMap().get(artistMatch)) == 0) {
                return "";
            }
            songList = DataManager.getSongMap().get(artistMatch);
        }
        if (songList == null) {
            Logger.LogToStatusBar("Artist has no tracks to get lyrics from, returning");
            return "";
        }
        Logger.LogToStatusBar("Tracks for artist " + artistMatch + " are present, trying to find the right one");

        Logger.LogToStatusBar("Checking that artist has song " + title);
        String songTitle = validateSong(artistMatch, title);
        if (songTitle.isEmpty() && title != null) {
            // Retry without a parenthesised suffix such as " (Live)".
            Matcher suffix = PARENTHESISED_SUFFIX.matcher(title);
            if (suffix.find()) {
                Logger.LogToStatusBar("Song title has brackets, we'll try searching without them.");
                title = suffix.replaceAll("");
                songTitle = validateSong(artistMatch, title);
            }
        }
        // No exact song match: the caller has to prompt the user later.
        if (songTitle.isEmpty()) {
            Logger.LogToStatusBar("No song match, we'll have to prompt later...");
            return "";
        }
        Logger.LogToStatusBar("The closest match we found for " + title + " was " + songTitle);

        String songURL = SITE_ROOT + songList.get(songTitle);
        Logger.LogToStatusBar(songURL);
        return scrapeLyricsPage(songURL);
    }

    /**
     * Downloads a song page and extracts the text of its {@code #textblock} element.
     *
     * @param songURL absolute URL of the song page
     * @return the lyrics prefixed with a single space, or {@code ""} when the
     *         page cannot be loaded, has no lyric box, or shows a copyright notice
     */
    private static String scrapeLyricsPage(String songURL) {
        String html;
        try {
            Document doc = Jsoup.connect(songURL).get();
            Elements lyricBox = doc.select("#textblock");
            if (lyricBox.isEmpty()) {
                // Page layout changed or this is not a lyrics page; nothing to extract.
                Logger.LogToStatusBar("Lyrics not found!");
                return "";
            }
            // Remove ads
            lyricBox.get(0).getElementsByTag("div").remove();
            // Remove comments
            ParseUtils.removeComments(lyricBox.get(0));
            html = lyricBox.html();
        } catch (IOException e) {
            Logger.LogToStatusBar("Lyrics not found! (" + e.getMessage() + ")");
            return "";
        }

        String lyrics = stripMarkup(html);
        if (lyrics.contains("Due to copyright restrictions") || lyrics.contains("Due to a publisher block")) {
            Logger.LogToStatusBar("Copyright restrictions on this track, bailing out!");
            return "";
        }
        return " " + lyrics;
    }

    /**
     * Removes the minimal inline markup left in the lyric HTML and unescapes the
     * basic HTML entities, leaving the text's own line breaks untouched.
     */
    private static String stripMarkup(String html) {
        String text = html;
        for (String tag : STRIPPED_TAGS) {
            text = text.replace(tag, "");
        }
        // Unescape "&amp;" last so that an escaped "&amp;lt;" ends up as the text "&lt;".
        text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&");
        // U+FFFD (replacement character) presumably stands in for a curly apostrophe
        // that failed to decode; substitute a plain apostrophe.
        text = text.replace("\uFFFD", "'");
        return text;
    }

    /**
     * Case-insensitive similarity of {@code candidate} to {@code query}:
     * {@code (candidate.length() - levenshteinDistance) / candidate.length()}.
     * The result is exactly 1.0 when the upper-cased strings are equal, and may
     * be negative when the query is much longer than the candidate.
     *
     * @return the ratio, or 0.0 when either string is null or the candidate is
     *         empty (which would otherwise divide by zero)
     */
    private static double similarity(String candidate, String query) {
        if (candidate == null || query == null || candidate.isEmpty()) {
            return 0.0;
        }
        // Locale.ROOT keeps the comparison independent of the user's locale.
        int distance = StringUtils.getLevenshteinDistance(
                candidate.toUpperCase(Locale.ROOT), query.toUpperCase(Locale.ROOT));
        return (double) (candidate.length() - distance) / candidate.length();
    }

    /**
     * Finds the song of {@code artist} whose title exactly matches {@code title}
     * (ignoring case). Titles with a similarity of at least 0.5 are recorded in
     * {@code DataManager.getSongMatches()} under the key {@code artist + " " + title}.
     *
     * @param artist artist key into {@code DataManager.getSongMap()}
     * @param title  song title to look for
     * @return the matching title as stored in the song map, or {@code ""} when
     *         there is no exact match or the artist has no song map
     */
    private static String validateSong(String artist, String title) {
        HashMap<String, String> songList = DataManager.getSongMap().get(artist);
        if (songList == null || title == null) {
            return "";
        }
        String matchKey = artist + " " + title;
        for (String songFromMap : songList.keySet()) {
            double ratio = similarity(songFromMap, title);
            if (ratio == 1.0) {
                Logger.LogToStatusBar(songFromMap + " exactly matches");
                return songFromMap;
            }
            if (ratio >= CLOSE_MATCH_THRESHOLD) {
                ArrayList<String> matches = DataManager.getSongMatches().get(matchKey);
                if (matches == null) {
                    matches = new ArrayList<String>();
                    DataManager.getSongMatches().put(matchKey, matches);
                }
                // Repeated lookups must not list the same candidate twice.
                if (!matches.contains(songFromMap)) {
                    matches.add(songFromMap);
                }
            }
        }
        return "";
    }

    /**
     * Finds the artist in {@code DataManager.getArtistMap()} whose name exactly
     * matches {@code artist} (ignoring case). Names with a similarity of at least
     * 0.5 are recorded in {@code DataManager.getArtistMatches()} under {@code artist}.
     *
     * @param artist artist name to look for
     * @return the matching name as stored in the artist map, or {@code ""} when
     *         there is no exact match
     */
    public static String validateArtist(String artist) {
        if (artist == null) {
            return "";
        }
        HashMap<String, String> artists = DataManager.getArtistMap();
        for (String artistFromMap : artists.keySet()) {
            double ratio = similarity(artistFromMap, artist);
            if (ratio == 1.0) {
                Logger.LogToStatusBar(artistFromMap + " exactly matches");
                return artistFromMap;
            }
            if (ratio >= CLOSE_MATCH_THRESHOLD) {
                ArrayList<String> matches = DataManager.getArtistMatches().get(artist);
                if (matches == null) {
                    matches = new ArrayList<String>();
                    DataManager.getArtistMatches().put(artist, matches);
                }
                // Repeated lookups must not list the same candidate twice.
                if (!matches.contains(artistFromMap)) {
                    matches.add(artistFromMap);
                }
            }
        }
        return "";
    }

    /**
     * Stores a user-supplied artist correction so later lookups of
     * {@code original} resolve to {@code updated} without fuzzy matching.
     *
     * @param original artist name as tagged
     * @param updated  artist name to use instead
     */
    public static void addArtistCorrection(String original, String updated) {
        DataManager.getArtistCorrections().put(original, updated);
    }
}