package moviescraper.doctord.controller.siteparsingprofile.specific;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.text.WordUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import moviescraper.doctord.controller.languagetranslation.Language;
import moviescraper.doctord.controller.languagetranslation.TranslateString;
import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile;
import moviescraper.doctord.model.SearchResult;
import moviescraper.doctord.model.dataitem.Actor;
import moviescraper.doctord.model.dataitem.Director;
import moviescraper.doctord.model.dataitem.Genre;
import moviescraper.doctord.model.dataitem.ID;
import moviescraper.doctord.model.dataitem.MPAARating;
import moviescraper.doctord.model.dataitem.OriginalTitle;
import moviescraper.doctord.model.dataitem.Outline;
import moviescraper.doctord.model.dataitem.Plot;
import moviescraper.doctord.model.dataitem.Rating;
import moviescraper.doctord.model.dataitem.ReleaseDate;
import moviescraper.doctord.model.dataitem.Runtime;
import moviescraper.doctord.model.dataitem.Set;
import moviescraper.doctord.model.dataitem.SortTitle;
import moviescraper.doctord.model.dataitem.Studio;
import moviescraper.doctord.model.dataitem.Tagline;
import moviescraper.doctord.model.dataitem.Thumb;
import moviescraper.doctord.model.dataitem.Title;
import moviescraper.doctord.model.dataitem.Top250;
import moviescraper.doctord.model.dataitem.Trailer;
import moviescraper.doctord.model.dataitem.Votes;
import moviescraper.doctord.model.dataitem.Year;
/**
 * Site parsing profile for Caribbeancom Premium movie pages.
 * <p>
 * The inherited {@code document} is expected to be the English movie page
 * ({@code http://en.caribbeancompr.com/eng/moviepages/<id>/index.html}). Several fields
 * (title, plot, genres, runtime, set, studio, trailer) are read from the Japanese version of
 * the same page instead, which is downloaded lazily on first use and cached in
 * {@link #japaneseDocument}. When the scraping language is English, the Japanese text is run
 * through {@link TranslateString}.
 * <p>
 * Every scrape method degrades to a blank / empty value when the Japanese page could not be
 * loaded rather than throwing.
 */
public class CaribbeancomPremiumParsingProfile extends SiteParsingProfile implements SpecificProfile {

	/** Root of the English site; replaced by {@link #JAPANESE_SITE_PREFIX} to reach the Japanese page. */
	private static final String ENGLISH_SITE_PREFIX = "http://en.caribbeancompr.com/eng/";
	/** Root of the Japanese site. */
	private static final String JAPANESE_SITE_PREFIX = "http://www.caribbeancompr.com/";
	/** Prefix of every English movie page URL. */
	private static final String ENGLISH_MOVIE_PAGES_PREFIX = "http://en.caribbeancompr.com/eng/moviepages/";
	/** Prefix of every Japanese movie page URL; images are addressed below this prefix. */
	private static final String JAPANESE_MOVIE_PAGES_PREFIX = "http://www.caribbeancompr.com/moviepages/";
	/** File name that terminates a movie page URL. */
	private static final String INDEX_PAGE_SUFFIX = "/index.html";

	/** Japanese version of the current movie page; stays {@code null} until (and unless) it is fetched successfully. */
	private Document japaneseDocument;
	/** Result of the last {@link #scrapePosters()} call, reused by {@link #scrapeFanart()}. */
	private Thumb[] scrapedPosters;
	/**
	 * Format of the "Update:" cell on the English page.
	 * Uses {@code MM} (month in year); lower-case {@code mm} would mean minute in hour.
	 * NOTE(review): SimpleDateFormat is not thread-safe; this shared instance assumes callers
	 * do not format/parse with it concurrently - confirm against ReleaseDate.
	 */
	private static final SimpleDateFormat caribbeanReleaseDateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.ENGLISH);

	/**
	 * Scrapes the movie title from the Japanese page.
	 *
	 * @return the title, machine-translated and capitalized when scraping in English and a
	 *         Japanese title was found; otherwise the (possibly empty) Japanese title
	 */
	@Override
	public Title scrapeTitle() {
		String japaneseTitle = getJapaneseTitleText();
		if (getScrapingLanguage() == Language.ENGLISH && japaneseTitle.length() > 0) {
			return new Title(WordUtils.capitalize(TranslateString.translateStringJapaneseToEnglish(japaneseTitle)));
		}
		return new Title(japaneseTitle);
	}

	/**
	 * Reads the raw title text from the Japanese page.
	 *
	 * @return the title text, or an empty string when the page or the title element is unavailable
	 */
	private String getJapaneseTitleText() {
		initializeJapaneseDocument();
		if (japaneseDocument == null) {
			return "";
		}
		Element titleElement = japaneseDocument.select("div.video-detail h1").first();
		return (titleElement != null) ? titleElement.text() : "";
	}

	/**
	 * @return the untranslated Japanese title (possibly empty)
	 */
	@Override
	public OriginalTitle scrapeOriginalTitle() {
		return new OriginalTitle(getJapaneseTitleText());
	}

	/**
	 * @return always the blank sort title; this profile does not scrape one
	 */
	@Override
	public SortTitle scrapeSortTitle() {
		return SortTitle.BLANK_SORTTITLE;
	}

	/**
	 * Scrapes the series ("set") the movie belongs to from the Japanese page.
	 *
	 * @return the set name (translated when scraping in English), or the blank set when the
	 *         page, the element or the resulting text is missing
	 */
	@Override
	public Set scrapeSet() {
		// the series is read from the Japanese version of the page
		initializeJapaneseDocument();
		if (japaneseDocument != null) {
			Element setElement = japaneseDocument.select("div.movie-info dl dt:contains(シリーズ:) ~ dd a").first();
			if (setElement != null) {
				String setText = setElement.text().trim();
				if (getScrapingLanguage() == Language.ENGLISH) {
					setText = TranslateString.translateStringJapaneseToEnglish(setText);
				}
				if (setText != null && setText.length() > 0) {
					return new Set(setText);
				}
			}
		}
		return Set.BLANK_SET;
	}

	/**
	 * @return a default rating; this site does not have ratings
	 */
	@Override
	public Rating scrapeRating() {
		return new Rating(0, "0");
	}

	/**
	 * @return the year of {@link #scrapeReleaseDate()}
	 */
	@Override
	public Year scrapeYear() {
		return scrapeReleaseDate().getYear();
	}

	/**
	 * Scrapes the release date from the "Update:" row of the English page.
	 *
	 * @return the release date parsed with the {@code yyyy-MM-dd} format, or the blank release
	 *         date when the cell is missing or its text is four characters or shorter
	 */
	@Override
	public ReleaseDate scrapeReleaseDate() {
		Element dateElement = document
				.select("tr td:contains(Update:) ~ td:contains(-)")
				.first();
		if (dateElement != null && dateElement.text().length() > 4) {
			return new ReleaseDate(dateElement.text(), caribbeanReleaseDateFormat);
		}
		return ReleaseDate.BLANK_RELEASEDATE;
	}

	/**
	 * @return always blank; this type of info doesn't exist on this site
	 */
	@Override
	public Top250 scrapeTop250() {
		return Top250.BLANK_TOP250;
	}

	/**
	 * @return always blank; this type of info doesn't exist on this site
	 */
	@Override
	public Votes scrapeVotes() {
		return Votes.BLANK_VOTES;
	}

	/**
	 * @return always blank; this type of info doesn't exist on this site
	 */
	@Override
	public Outline scrapeOutline() {
		return Outline.BLANK_OUTLINE;
	}

	/**
	 * Scrapes the plot from the first comment paragraph of the Japanese page.
	 *
	 * @return the plot (translated when scraping in English), or the blank plot when the page
	 *         or a non-empty paragraph is unavailable
	 */
	@Override
	public Plot scrapePlot() {
		initializeJapaneseDocument();
		if (japaneseDocument == null) {
			return Plot.BLANK_PLOT;
		}
		Element plotElement = japaneseDocument.select("div.movie-comment p").first();
		if (plotElement == null || plotElement.text().length() == 0) {
			return Plot.BLANK_PLOT;
		}
		if (getScrapingLanguage() == Language.ENGLISH) {
			return new Plot(TranslateString.translateStringJapaneseToEnglish(plotElement.text()));
		}
		return new Plot(plotElement.text());
	}

	/**
	 * @return always blank; this type of info doesn't exist on this site
	 */
	@Override
	public Tagline scrapeTagline() {
		return Tagline.BLANK_TAGLINE;
	}

	/**
	 * Scrapes the runtime from the Japanese page, where it is shown as {@code hh:mm:ss}.
	 *
	 * @return the runtime in whole minutes (seconds are dropped), or the blank runtime when
	 *         the page or element is missing or the text is not three numeric, colon-separated parts
	 */
	@Override
	public Runtime scrapeRuntime() {
		initializeJapaneseDocument();
		if (japaneseDocument == null) {
			return Runtime.BLANK_RUNTIME;
		}
		Element durationElement = japaneseDocument.select("div.movie-info dl dt:contains(再生時間:) + dd").first();
		if (durationElement != null) {
			// trim first so surrounding whitespace does not break Integer.parseInt
			String[] durationParts = durationElement.text().trim().split(":");
			if (durationParts.length == 3) {
				try {
					int hours = Integer.parseInt(durationParts[0].trim());
					int minutes = Integer.parseInt(durationParts[1].trim());
					// we don't care about seconds
					return new Runtime(Integer.toString((hours * 60) + minutes));
				} catch (NumberFormatException e) {
					// unexpected duration text: treat it as "no runtime" instead of aborting the scrape
					e.printStackTrace();
				}
			}
		}
		return Runtime.BLANK_RUNTIME;
	}

	/**
	 * Scrapes the main poster from the English page plus up to three extra free images whose
	 * URLs are derived from the movie ID. The result is cached for {@link #scrapeFanart()}.
	 *
	 * @return the posters found, possibly an empty array
	 */
	@Override
	public Thumb[] scrapePosters() {
		List<Thumb> posters = new LinkedList<>();
		Element posterElement = document
				.select("td.detail_main a[href*=/images/]")
				.first();
		if (posterElement != null) {
			String posterPath = posterElement.attr("abs:href");
			// the link normally wraps a preview image, but do not depend on it being there
			Element previewElement = posterElement.select("img").first();
			String previewPath = (previewElement != null) ? previewElement.attr("abs:src") : null;
			addThumbWithPreview(posters, posterPath, previewPath);
		}
		// get the extra 3 free images they give
		ID id = scrapeID();
		String movieId = (id != null) ? id.getId() : null;
		// scrapeID() yields an empty ID when none is on the page; skip probing bogus URLs then
		if (movieId != null && movieId.length() > 0) {
			for (int i = 1; i <= 3; i++) {
				String currentImagePath = JAPANESE_MOVIE_PAGES_PREFIX + movieId + "/images/l/00" + i + ".jpg";
				String currentImagePathPreview = JAPANESE_MOVIE_PAGES_PREFIX + movieId + "/images/s/00" + i + ".jpg";
				if (fileExistsAtURL(currentImagePath)) {
					addThumbWithPreview(posters, currentImagePath, currentImagePathPreview);
				}
			}
		}
		scrapedPosters = posters.toArray(new Thumb[posters.size()]);
		return scrapedPosters;
	}

	/**
	 * Creates a thumb for {@code imageUrl}, attaches {@code previewUrl} as its preview when one
	 * is given, and appends it to {@code target}. A malformed URL is reported and the thumb is skipped.
	 *
	 * @param target     list receiving the new thumb
	 * @param imageUrl   URL of the full-size image
	 * @param previewUrl URL of the preview image; {@code null} or empty means no preview is set
	 */
	private static void addThumbWithPreview(List<Thumb> target, String imageUrl, String previewUrl) {
		try {
			Thumb thumb = new Thumb(imageUrl);
			if (previewUrl != null && previewUrl.length() > 0) {
				thumb.setPreviewURL(new URL(previewUrl));
			}
			target.add(thumb);
		} catch (MalformedURLException e) {
			// a single bad image URL should not abort the whole scrape
			e.printStackTrace();
		}
	}

	/**
	 * Extracts the movie ID portion from the URL of the current (English) page by removing
	 * the movie-pages prefix and the trailing {@code /index.html}.
	 *
	 * @return the movie ID, or {@code null} when the current URL is not a movie page or
	 *         nothing is left after stripping
	 */
	private String getMovieIdFromCurrentUrl() {
		String urlOfCurrentPage = document.location();
		if (urlOfCurrentPage == null || !urlOfCurrentPage.contains("moviepages")) {
			return null;
		}
		String movieId = urlOfCurrentPage
				.replaceFirst(Pattern.quote(ENGLISH_MOVIE_PAGES_PREFIX), "")
				.replaceFirst(Pattern.quote(INDEX_PAGE_SUFFIX), "");
		return (movieId.length() > 0) ? movieId : null;
	}

	/**
	 * Builds the fanart list: the DVD cover image (normally only shown as the trailer
	 * preview, but reachable through a predictable URL) followed by the posters, so the user
	 * can also pick a poster as fanart.
	 *
	 * @return the cover plus all posters, or an empty array when the current page is not a
	 *         movie page or the cover URL is malformed
	 */
	@Override
	public Thumb[] scrapeFanart() {
		String movieID = getMovieIdFromCurrentUrl();
		if (movieID == null) {
			return new Thumb[0];
		}
		String imageURL = JAPANESE_MOVIE_PAGES_PREFIX + movieID + "/images/l_l.jpg";
		try {
			Thumb[] fanartThumbs = { new Thumb(imageURL) };
			// also allow the user to use posters as the fanart
			Thumb[] additionalPosterThumbs = (scrapedPosters == null) ? scrapePosters() : scrapedPosters;
			return ArrayUtils.addAll(fanartThumbs, additionalPosterThumbs);
		} catch (MalformedURLException e) {
			e.printStackTrace();
			return new Thumb[0];
		}
	}

	/**
	 * Builds thumbs for the three extra images of the movie. The URLs are derived from the
	 * movie ID; their existence is not checked here.
	 *
	 * @return three thumbs, or an empty array when the current page is not a movie page or
	 *         any of the URLs is malformed
	 */
	@Override
	public Thumb[] scrapeExtraFanart() {
		String movieID = getMovieIdFromCurrentUrl();
		if (movieID == null) {
			return new Thumb[0];
		}
		Thumb[] extraFanartThumbs = new Thumb[3];
		for (int i = 1; i < 4; i++) {
			String extraThumbURL = "http://en.caribbeancompr.com/moviepages/" + movieID + "/images/l/00" + i + ".jpg";
			try {
				extraFanartThumbs[i - 1] = new Thumb(extraThumbURL);
			} catch (MalformedURLException e) {
				e.printStackTrace();
				return new Thumb[0];
			}
		}
		return extraFanartThumbs;
	}

	/**
	 * @return always {@link MPAARating#RATING_XXX}
	 */
	@Override
	public MPAARating scrapeMPAA() {
		return MPAARating.RATING_XXX;
	}

	/**
	 * Scrapes the movie ID from the "Movie ID:" row of the English page.
	 *
	 * @return the ID, or an ID with an empty string when none is found (never {@code null})
	 */
	@Override
	public ID scrapeID() {
		Element idElement = document
				.select("tr td:contains(Movie ID:) ~ td:contains(_)")
				.first();
		if (idElement != null && idElement.text().length() > 0) {
			return new ID(idElement.text());
		}
		return new ID("");
	}

	/**
	 * Scrapes the genres from the category links of the Japanese page.
	 * <p>
	 * In English mode each link's page code (e.g. {@code 1_1}) is mapped to an English name;
	 * unknown codes are skipped and each English name is returned at most once (several codes
	 * share a name). In Japanese mode the link text is used as-is.
	 *
	 * @return the genres found, possibly empty
	 */
	@Override
	public ArrayList<Genre> scrapeGenres() {
		ArrayList<Genre> genresReturned = new ArrayList<>();
		initializeJapaneseDocument();
		if (japaneseDocument == null) {
			return genresReturned;
		}
		// English names already added; the Genre list cannot be searched by name directly
		List<String> englishNamesAdded = new ArrayList<>();
		Elements genreElementsInJapanese = japaneseDocument.select("dl.movie-info-cat dd a");
		for (Element currentGenre : genreElementsInJapanese) {
			if (getScrapingLanguage() == Language.ENGLISH) {
				// the genre is coded as a specific webpage number. we can call our helper function to translate a number
				// like 1_1.html into the actual english genre this represents
				String currentGenreLink = currentGenre.attr("href");
				if (currentGenreLink.contains("/")) {
					// keep just the part after the last slash, without the extension (e.g. 1_1)
					String currentGenreCode = currentGenreLink
							.substring(currentGenreLink.lastIndexOf('/') + 1)
							.replaceFirst(Pattern.quote(".html"), "");
					String englishGenreName = convertGenreCodeToDescription(currentGenreCode);
					if (englishGenreName != null && !englishNamesAdded.contains(englishGenreName)) {
						englishNamesAdded.add(englishGenreName);
						genresReturned.add(new Genre(englishGenreName));
					}
				}
			} else if (getScrapingLanguage() == Language.JAPANESE) {
				genresReturned.add(new Genre(currentGenre.text().trim()));
			}
		}
		return genresReturned;
	}

	/**
	 * Maps a genre page code of the site to its English description.
	 *
	 * @param currentGenreCode the code taken from a genre link, e.g. {@code 1_1}
	 * @return the English genre name, or {@code null} when the code is not known
	 */
	private String convertGenreCodeToDescription(String currentGenreCode) {
		switch (currentGenreCode) {
			case "1_1": return "Pornstar";
			case "2_1": return "School Girls";
			case "3_1": return "Amateur";
			case "4_1": return "Sister";
			case "5_1": return "Lolita";
			case "6_1": return "MILF / Housewife";
			case "8_1": return "Slut";
			case "9_1": return "Big Tits";
			case "10_1": return "Gonzo";
			case "11_1": return "Creampie";
			case "12_1": return "Squirting";
			case "13_1": return "Orgy";
			case "14_1": return "Cosplay";
			case "15_1": return "Teen";
			case "16_1": return "Gal";
			case "17_1": return "Idol";
			case "18_1": return "Teacher";
			case "20_1": return "Big Tits";
			case "21_1": return "Swimsuit";
			case "22_1": return "Bondage";
			case "24_1": return "Outdoor Exposure";
			case "26_1": return "Documentary";
			case "27_1": return "Seduction";
			case "28_1": return "S&M";
			case "29_1": return "Shaved Pussy";
			case "30_1": return "Restraints";
			case "31_1": return "Masturbation";
			case "32_1": return "Vibrator";
			case "33_1": return "Fucking";
			case "34_1": return "Blowjob";
			case "35_1": return "Semen";
			case "36_1": return "Cum Swallow";
			case "37_1": return "Golden Shower";
			case "38_1": return "Handjob";
			case "39_1": return "69";
			case "40_1": return "Anal";
			case "42_1": return "Cunnilingus";
			case "43_1": return "Best / VA";
			case "44_1": return "Bareback Fucking";
			case "45_1": return "Nurse";
			case "46_1": return "Bloomers";
			case "47_1": return "Molester";
			case "49_1": return "White Girl";
			case "51_1": return "Anime";
			case "52_1": return "Insult";
			case "53_1": return "First Time Porn";
			case "56_1": return "Uniforms";
			case "55_1": return "Pornstar";
			case "62_1": return "Ass";
			case "64_1": return "Legs";
			case "65_1": return "Bukkake";
			case "67_1": return "Deep Throating";
			case "69_1": return "Transsexual";
			case "70_1": return "Teen";
			case "71_1": return "Look-alike";
			case "72_1": return "Small Tits";
			case "73_1": return "Slender";
			case "74_1": return "Car Sex";
			case "75_1": return "Shaving";
			case "77_1": return "Dirty Words";
			case "78_1": return "Cumshot";
			case "79_1": return "Facial";
			case "80_1": return "Apron";
			case "81_1": return "Glasses";
			case "82_1": return "OL";
			case "83_1": return "Maid";
			case "84_1": return "Yukata / Kimono";
			default:
				// no genre match for this code
				return null;
		}
	}

	/**
	 * Scrapes the actors of the movie.
	 * <p>
	 * In English mode the name comes from the {@code title} attribute of the first "Starring:"
	 * link on the English page. When it has an even number of words, every two words are
	 * treated as one actor's full name and each actor gets the page's actor thumbnail;
	 * otherwise the whole text becomes a single actor without a thumbnail. In Japanese mode
	 * every actor link of the Japanese page becomes one actor with the thumbnail.
	 *
	 * @return the actors found, possibly empty
	 */
	@Override
	public ArrayList<Actor> scrapeActors() {
		ArrayList<Actor> actorList = new ArrayList<>();
		String actorThumbURL = getActorThumbUrl();
		if (getScrapingLanguage() == Language.ENGLISH) {
			Element actorElement = document
					.select("tr td:contains(Starring:) ~ td a")
					.first();
			if (actorElement != null) {
				String actorName = WordUtils.capitalize(actorElement.attr("title"));
				// we will try to sort out multiple actors into separate ones if the number of words is even
				// so that each 2 words can be a complete name
				String[] actorNamesSplitUp = actorName.split(" ");
				if (actorNamesSplitUp.length >= 2 && (actorNamesSplitUp.length % 2 == 0)) {
					for (int i = 0; i < actorNamesSplitUp.length; i += 2) {
						String currentActorName = actorNamesSplitUp[i] + " " + actorNamesSplitUp[i + 1];
						actorList.add(createActor(currentActorName, actorThumbURL));
					}
				} else {
					actorList.add(new Actor(actorName, "", null));
				}
			}
		} else if (getScrapingLanguage() == Language.JAPANESE) {
			initializeJapaneseDocument();
			if (japaneseDocument != null) {
				Elements japaneseActors = japaneseDocument.select("div.movie-info dl dt:contains(出演:) ~ dd a");
				for (Element japaneseActor : japaneseActors) {
					actorList.add(createActor(japaneseActor.text(), actorThumbURL));
				}
			}
		}
		return actorList;
	}

	/**
	 * Derives the URL of the actor thumbnail that belongs to the current movie page.
	 *
	 * @return the thumbnail URL, or {@code null} when the current URL is not a movie page
	 */
	private String getActorThumbUrl() {
		String urlOfCurrentPage = document.location();
		if (urlOfCurrentPage == null || !urlOfCurrentPage.contains("moviepages")) {
			return null;
		}
		return urlOfCurrentPage
				.replaceFirst(Pattern.quote(ENGLISH_MOVIE_PAGES_PREFIX), JAPANESE_MOVIE_PAGES_PREFIX)
				.replaceFirst(Pattern.quote(INDEX_PAGE_SUFFIX), "/images/n.jpg");
	}

	/**
	 * Creates an actor with an empty role and, when possible, a thumbnail.
	 *
	 * @param actorName name of the actor
	 * @param thumbUrl  thumbnail URL; {@code null} or a malformed URL yields an actor without thumbnail
	 * @return the new actor
	 */
	private static Actor createActor(String actorName, String thumbUrl) {
		Thumb actorThumb = null;
		if (thumbUrl != null) {
			try {
				actorThumb = new Thumb(thumbUrl);
			} catch (MalformedURLException e) {
				// keep the actor, just without a picture
				e.printStackTrace();
			}
		}
		return new Actor(actorName, "", actorThumb);
	}

	/**
	 * @return always an empty list; this profile does not scrape directors
	 */
	@Override
	public ArrayList<Director> scrapeDirectors() {
		return new ArrayList<>();
	}

	/**
	 * Scrapes the trailer link from the download button of the Japanese page.
	 *
	 * @return the trailer built from the link's {@code href}, or the blank trailer when the
	 *         page, the link or its target is missing
	 */
	@Override
	public Trailer scrapeTrailer() {
		initializeJapaneseDocument();
		if (japaneseDocument == null) {
			return Trailer.BLANK_TRAILER;
		}
		Element trailerElement = japaneseDocument.select("div.movie-download div.sb-btn a").first();
		if (trailerElement != null) {
			String trailerLink = trailerElement.attr("href");
			if (trailerLink != null && trailerLink.length() > 0) {
				return new Trailer(trailerLink);
			}
		}
		return Trailer.BLANK_TRAILER;
	}

	/**
	 * Scrapes the studio from the Japanese page.
	 *
	 * @return the studio name, translated when scraping in English (falling back to the
	 *         Japanese text if the translation comes back empty), or the blank studio when the
	 *         page, the element or its text is missing
	 */
	@Override
	public Studio scrapeStudio() {
		// the studio is not on the english version of this page, so we need to go to the japanese one
		initializeJapaneseDocument();
		if (japaneseDocument != null) {
			Element studioElement = japaneseDocument.select("div.movie-info dl dt:contains(スタジオ:) ~ dd a").first();
			if (studioElement != null) {
				String studioElementText = studioElement.text().trim();
				if (getScrapingLanguage() == Language.ENGLISH) {
					// actually use the translation result; keep the Japanese text if it is unusable
					String translatedStudio = TranslateString.translateStringJapaneseToEnglish(studioElementText);
					if (translatedStudio != null && translatedStudio.length() > 0) {
						studioElementText = translatedStudio;
					}
				}
				if (studioElementText != null && studioElementText.length() > 0) {
					return new Studio(studioElementText);
				}
			}
		}
		return Studio.BLANK_STUDIO;
	}

	/**
	 * Remembers {@code file} as the movie file being scraped and derives the search string
	 * from its ID tag.
	 *
	 * @param file the movie file to search for
	 * @return the value produced by {@code findIDTagFromFile} for this file
	 */
	@Override
	public String createSearchString(File file) {
		scrapedMovieFile = file;
		return findIDTagFromFile(file, isFirstWordOfFileIsID());
	}

	/**
	 * Looks up candidate movie pages via {@code getLinksFromGoogle}, restricted to the
	 * English movie-pages prefix, and cuts off anything that follows {@code .html} in a
	 * result URL, since such trailing parts sometimes appear and break the scrape.
	 *
	 * @param searchString the string to search for
	 * @return the cleaned-up search results; an empty array if the lookup returned {@code null}
	 * @throws IOException if the underlying lookup fails
	 */
	@Override
	public SearchResult[] getSearchResults(String searchString)
			throws IOException {
		SearchResult[] googleResults = getLinksFromGoogle(searchString, ENGLISH_MOVIE_PAGES_PREFIX);
		if (googleResults == null) {
			return new SearchResult[0];
		}
		for (SearchResult currentResult : googleResults) {
			if (currentResult == null) {
				continue;
			}
			String currentUrl = currentResult.getUrlPath();
			if (currentUrl != null && !currentUrl.endsWith(".html") && currentUrl.contains(".html")) {
				// keep everything up to and including the first ".html"
				String newURL = currentUrl.substring(0, currentUrl.indexOf(".html") + ".html".length());
				currentResult.setUrlPath(newURL);
			}
		}
		return googleResults;
	}

	/**
	 * Downloads and caches the Japanese version of the current movie page if that has not
	 * happened yet. Leaves {@link #japaneseDocument} {@code null} when the current URL is not
	 * a movie page or the download fails, so callers must check it before use.
	 */
	private void initializeJapaneseDocument() {
		if (japaneseDocument != null) {
			return;
		}
		String urlOfCurrentPage = document.location();
		if (urlOfCurrentPage == null || !urlOfCurrentPage.contains("moviepages")) {
			return;
		}
		// the genres are only available on the japanese version of the page
		String japaneseUrl = urlOfCurrentPage.replaceFirst(Pattern.quote(ENGLISH_SITE_PREFIX), JAPANESE_SITE_PREFIX);
		if (japaneseUrl.length() > 1) {
			try {
				japaneseDocument = Jsoup.connect(japaneseUrl).userAgent("Mozilla").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
			} catch (IOException e) {
				// leave japaneseDocument null; the scrape methods fall back to blank values
				e.printStackTrace();
			}
		}
	}

	/**
	 * @return the display name of this parser
	 */
	@Override
	public String getParserName() {
		return "Caribbeancom Premium";
	}

	/**
	 * @return a fresh, independent instance of this profile
	 */
	@Override
	public SiteParsingProfile newInstance() {
		return new CaribbeancomPremiumParsingProfile();
	}
}