package moviescraper.doctord.controller.siteparsingprofile.specific;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.net.URLCodec;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile;
import moviescraper.doctord.model.SearchResult;
import moviescraper.doctord.model.dataitem.Actor;
import moviescraper.doctord.model.dataitem.Director;
import moviescraper.doctord.model.dataitem.Genre;
import moviescraper.doctord.model.dataitem.ID;
import moviescraper.doctord.model.dataitem.MPAARating;
import moviescraper.doctord.model.dataitem.OriginalTitle;
import moviescraper.doctord.model.dataitem.Outline;
import moviescraper.doctord.model.dataitem.Plot;
import moviescraper.doctord.model.dataitem.Rating;
import moviescraper.doctord.model.dataitem.ReleaseDate;
import moviescraper.doctord.model.dataitem.Runtime;
import moviescraper.doctord.model.dataitem.Set;
import moviescraper.doctord.model.dataitem.SortTitle;
import moviescraper.doctord.model.dataitem.Studio;
import moviescraper.doctord.model.dataitem.Tagline;
import moviescraper.doctord.model.dataitem.Thumb;
import moviescraper.doctord.model.dataitem.Title;
import moviescraper.doctord.model.dataitem.Top250;
import moviescraper.doctord.model.dataitem.Trailer;
import moviescraper.doctord.model.dataitem.Votes;
import moviescraper.doctord.model.dataitem.Year;
/**
 * Site parsing profile for R18.com.
 * <p>
 * Every {@code scrapeXxx} method reads from the already-downloaded {@code document}
 * inherited from {@link SiteParsingProfile} and returns a blank/empty value when the
 * expected element is not present on the page.
 */
public class R18ParsingProfile extends SiteParsingProfile implements SpecificProfile {

    /** Release date layout when the month is abbreviated with a trailing dot, e.g. "Oct. 05,2015". */
    private static final String RELEASE_DATE_PATTERN_DOTTED = "MMM. dd,yyyy";

    /** Release date layout when the month carries no trailing dot, e.g. "May 05,2015". */
    private static final String RELEASE_DATE_PATTERN_PLAIN = "MMM dd,yyyy";

    /**
     * Extracts the content id (fid) and the bitrate/ratio code (bid) from the flash player variables.
     * The fid group stops at the next '&' so it can never swallow neighbouring parameters.
     */
    private static final Pattern TRAILER_FLASHVARS_PATTERN =
            Pattern.compile("^.*&fid=([^&]+)&(?:.*&)?bid=(\\d)(w|s)&.*$");

    /** Splits an ID such as "ABC123" into its tag part (group 1) and its numeric part (group 2). */
    private static final Pattern SEARCH_ID_PATTERN = Pattern.compile("([0-9]*\\D+)(\\d+)");

    /** Matches search strings of the form "MD?D" + digits (case-insensitive), used for the retry search. */
    private static final Pattern MOODYZ_ID_PATTERN = Pattern.compile("^(MD.)D(\\d+)$", Pattern.CASE_INSENSITIVE);

    @Override
    public String getParserName() {
        return "R18.com";
    }

    /**
     * @return the scraper groups this profile belongs to; the list is created on first use
     */
    @Override
    public List<ScraperGroupName> getScraperGroupNames() {
        if (groupNames == null) {
            groupNames = Arrays.asList(ScraperGroupName.JAV_CENSORED_SCRAPER_GROUP);
        }
        return groupNames;
    }

    /**
     * @return the text of the {@code cite[itemprop=name]} element, or an empty title if absent
     */
    @Override
    public Title scrapeTitle() {
        Element titleElement = document.select("cite[itemprop=name]").first();
        if (titleElement == null) {
            return new Title("");
        }
        return new Title(titleElement.text());
    }

    @Override
    public OriginalTitle scrapeOriginalTitle() {
        // r18 does not have a title in japanese :(
        return OriginalTitle.BLANK_ORIGINALTITLE;
    }

    @Override
    public SortTitle scrapeSortTitle() {
        // no SortTitle - the user usually provides their own
        return SortTitle.BLANK_SORTTITLE;
    }

    /**
     * Reads the "Series:" entry of the product details. When the displayed text ends with
     * "...", the linked series page is downloaded to obtain the full name; if that fails or
     * the full name is not found there, the shortened text is returned instead.
     *
     * @return the series as a {@link Set}, or {@link Set#BLANK_SET} if the page lists none
     */
    @Override
    public Set scrapeSet() {
        Element setElement = document.select("div.product-details dl dt:contains(Series:) + dd a").first();
        if (setElement == null) {
            return Set.BLANK_SET;
        }
        String setText = setElement.text().trim();
        if (setText.endsWith("...")) {
            System.out.println("Visiting set page to get full text");
            try {
                Document setDocument = SiteParsingProfile.downloadDocumentFromURLString(setElement.attr("href"));
                Element setElementFullText = setDocument.select("div.cmn-ttl-tabMain01 div.txt01").first();
                if (setElementFullText != null) {
                    return new Set(setElementFullText.text());
                }
            } catch (Exception e) {
                // best effort only: fall through and use the shortened text
                e.printStackTrace();
            }
        }
        return new Set(setText);
    }

    @Override
    public Rating scrapeRating() {
        // this site doesn't have ratings
        return Rating.BLANK_RATING;
    }

    /**
     * @return the year taken from {@link #scrapeReleaseDate()}
     */
    @Override
    public Year scrapeYear() {
        return scrapeReleaseDate().getYear();
    }

    /**
     * Reads the "Release Date" entry of the product details.
     * <p>
     * A new {@link SimpleDateFormat} is created on every call because that class is not
     * thread-safe and must not be shared between profiles working in parallel.
     *
     * @return the release date, or {@link ReleaseDate#BLANK_RELEASEDATE} if the entry is missing
     *         or its text is 4 characters or shorter
     */
    @Override
    public ReleaseDate scrapeReleaseDate() {
        Element releaseDateElement = document.select("div.product-details dl dt:contains(Release Date) ~ dd").first();
        if (releaseDateElement == null || releaseDateElement.text().length() <= 4) {
            return ReleaseDate.BLANK_RELEASEDATE;
        }
        String releaseDateText = releaseDateElement.text().trim();
        // The site is inconsistent about month abbreviations: September shows up as "Sept."
        // while the date format expects "Sep.".
        if (releaseDateText.contains("Sept.")) {
            releaseDateText = releaseDateText.replaceFirst(Pattern.quote("Sept."), "Sep.");
        }
        // Months are usually abbreviated with a dot (e.g. "Oct."); when no dot is present
        // the month was written without one and the plain pattern applies.
        String patternToUse = releaseDateText.contains(".") ? RELEASE_DATE_PATTERN_DOTTED : RELEASE_DATE_PATTERN_PLAIN;
        return new ReleaseDate(releaseDateText, new SimpleDateFormat(patternToUse, Locale.ENGLISH));
    }

    @Override
    public Top250 scrapeTop250() {
        return Top250.BLANK_TOP250;
    }

    /**
     * Builds the trailer URL from the {@code flashvars} parameter of the preview player.
     * The {@code fid} value supplies the content id; the {@code bid} value supplies a
     * one-digit bitrate mask followed by the aspect ratio letter ({@code w} or {@code s}).
     *
     * @return the trailer, or {@link Trailer#BLANK_TRAILER} if the player is missing, the
     *         flashvars do not match, or the content id is shorter than three characters
     */
    @Override
    public Trailer scrapeTrailer() {
        Element element = document.select("object#FreeViewPlayer>param[name=flashvars]").first();
        if (element == null) {
            return Trailer.BLANK_TRAILER;
        }
        Matcher matcher = TRAILER_FLASHVARS_PATTERN.matcher(element.attr("value"));
        if (!matcher.matches()) {
            return Trailer.BLANK_TRAILER;
        }
        String cid = matcher.group(1);
        if (cid.length() < 3) {
            // the URL layout needs the first three characters of the id as a path segment
            return Trailer.BLANK_TRAILER;
        }
        int bitrates = Integer.parseInt(matcher.group(2));
        String ratio = matcher.group(3);
        // pick the quality suffix from the highest bit set in the mask
        String quality;
        if ((bitrates & 0b100) != 0) {
            quality = "dmb";
        } else if ((bitrates & 0b010) != 0) {
            quality = "dm";
        } else {
            quality = "sm";
        }
        String firstLetterOfCid = cid.substring(0, 1);
        String threeLetterCidCode = cid.substring(0, 3);
        String trailerURL = String.format("http://cc3001.r18.com/litevideo/freepv/%1$s/%2$s/%3$s/%3$s_%4$s_%5$s.mp4",
                firstLetterOfCid, threeLetterCidCode, cid, quality, ratio);
        return new Trailer(trailerURL);
    }

    @Override
    public Votes scrapeVotes() {
        return Votes.BLANK_VOTES;
    }

    @Override
    public Outline scrapeOutline() {
        return Outline.BLANK_OUTLINE;
    }

    /**
     * @return the first paragraph following the heading of the description box, or
     *         {@link Plot#BLANK_PLOT} if there is none
     */
    @Override
    public Plot scrapePlot() {
        Element plotElement = document.select("div.cmn-box-description01 h1 ~ p").first();
        if (plotElement == null) {
            return Plot.BLANK_PLOT;
        }
        return new Plot(plotElement.text());
    }

    @Override
    public Tagline scrapeTagline() {
        return Tagline.BLANK_TAGLINE;
    }

    /**
     * @return the "Runtime:" entry with the " min." suffix removed, or
     *         {@link Runtime#BLANK_RUNTIME} if the entry is missing or empty
     */
    @Override
    public Runtime scrapeRuntime() {
        Element runtimeElement = document.select("div.product-details dl dt:contains(Runtime:) ~ dd").first();
        if (runtimeElement == null || runtimeElement.text().length() == 0) {
            return Runtime.BLANK_RUNTIME;
        }
        return new Runtime(runtimeElement.text().replace(" min.", ""));
    }

    /**
     * @return the box art requested with cropping enabled; empty if not found
     */
    @Override
    public Thumb[] scrapePosters() {
        return scrapePostersAndFanart(true);
    }

    /**
     * Shared implementation for posters and fanart: both use the single box art image.
     *
     * @param doCrop passed through to the {@link Thumb} constructor
     * @return a one-element array with the box art, or an empty array if the image is
     *         missing, has no {@code src}, or the thumb could not be created
     */
    private Thumb[] scrapePostersAndFanart(boolean doCrop) {
        Element dvdBoxartElement = document.select("section div.box01.mb10.detail-view.detail-single-picture img").first();
        if (dvdBoxartElement == null) {
            return new Thumb[0];
        }
        String imgSrc = dvdBoxartElement.attr("src");
        if (imgSrc.length() == 0) {
            return new Thumb[0];
        }
        try {
            return new Thumb[] { new Thumb(imgSrc, doCrop) };
        } catch (IOException e) {
            // best effort: report the problem and behave as if no artwork was found
            e.printStackTrace();
            return new Thumb[0];
        }
    }

    /**
     * @return the box art requested without cropping; empty if not found
     */
    @Override
    public Thumb[] scrapeFanart() {
        return scrapePostersAndFanart(false);
    }

    /**
     * Collects the product gallery images. The full-size URL is derived from each thumbnail
     * URL by inserting "jp" in front of its last '-'. Thumbnails without a '-' cannot be
     * converted and are skipped.
     *
     * @return the gallery images, or an empty array if none were found or the spot-checked
     *         image does not exist
     */
    @Override
    public Thumb[] scrapeExtraFanart() {
        List<Thumb> thumbList = new ArrayList<>();
        for (Element currentPreviewImage : document.select(".product-gallery li a img")) {
            String imgThumbnailSrc = currentPreviewImage.attr("data-original");
            int indexOfLastDash = imgThumbnailSrc.lastIndexOf('-');
            if (imgThumbnailSrc.isEmpty() || indexOfLastDash < 0) {
                continue;
            }
            String fullImagePath = imgThumbnailSrc.substring(0, indexOfLastDash) + "jp"
                    + imgThumbnailSrc.substring(indexOfLastDash);
            try {
                thumbList.add(new Thumb(fullImagePath));
            } catch (MalformedURLException e) {
                // skip this image and keep the rest
                e.printStackTrace();
            }
        }
        // We could save time by not doing this check, but we might get broken images if the
        // site changes its format. To speed things up, check one image and assume the rest is OK.
        if (!thumbList.isEmpty()
                && !fileExistsAtURL(thumbList.get(thumbList.size() / 2).getThumbURL().toString())) {
            System.err.println("We expected to find extra fanart and did not at: " + document.location());
            return new Thumb[0];
        }
        return thumbList.toArray(new Thumb[thumbList.size()]);
    }

    @Override
    public MPAARating scrapeMPAA() {
        return MPAARating.RATING_XXX;
    }

    /**
     * Reads the "Content ID:" entry and normalises it with
     * {@code DmmParsingProfile.fixUpIDFormatting}.
     *
     * @return the ID, or an ID with empty text if the entry is missing or empty
     */
    @Override
    public ID scrapeID() {
        Element idElement = document.select("div.product-details dl dt:contains(Content ID:) ~ dd").first();
        if (idElement == null || idElement.text().length() == 0) {
            return new ID("");
        }
        String r18ID = idElement.text();
        // Some h.m.p titles do not feature the correct Content ID. We need to get it from another location.
        if (r18ID.startsWith("41hodv") && "h.m.p".equals(scrapeStudio().getStudio())) {
            Element wishListElement = document.select("div.js-add-to-wishlist[data-wishlist-id]").first();
            if (wishListElement != null) {
                return new ID(DmmParsingProfile.fixUpIDFormatting(wishListElement.attr("data-wishlist-id")));
            }
        }
        return new ID(DmmParsingProfile.fixUpIDFormatting(r18ID));
    }

    /**
     * @return the linked "Categories:" entries, leaving out "Hi-Def" and every entry whose
     *         name starts with "featured" (case-insensitive)
     */
    @Override
    public ArrayList<Genre> scrapeGenres() {
        ArrayList<Genre> genreList = new ArrayList<>();
        for (Element currentGenre : document.select("div.product-details dl dt:contains(Categories:) ~ dd a")) {
            String genreText = currentGenre.text();
            if (genreText.isEmpty() || genreText.equals("Hi-Def")) {
                continue;
            }
            // fixed locale so the comparison does not depend on the machine's default locale
            if (genreText.toLowerCase(Locale.ENGLISH).startsWith("featured")) {
                continue;
            }
            genreList.add(new Genre(genreText));
        }
        return genreList;
    }

    /**
     * Reads one actor per tab of the actor tab container. Tabs without a name are skipped.
     * An actor gets no thumb when the tab has no image, the image is a "nowprinting"
     * placeholder, or its URL is malformed.
     *
     * @return the actors found on the page, possibly empty
     */
    @Override
    public ArrayList<Actor> scrapeActors() {
        ArrayList<Actor> actorList = new ArrayList<>();
        for (Element currentActor : document.select("div.js-tab-contents div[id]")) {
            Element nameElement = currentActor.select("div.txt01 div").first();
            if (nameElement == null) {
                continue;
            }
            String actorName = nameElement.text();
            if (actorName.isEmpty()) {
                continue;
            }
            Element imageElement = currentActor.select("img").first();
            String actorThumbUrl = (imageElement != null) ? imageElement.attr("src") : "";
            Thumb actorThumb = null;
            if (!actorThumbUrl.isEmpty() && !actorThumbUrl.contains("nowprinting")) {
                try {
                    actorThumb = new Thumb(actorThumbUrl);
                } catch (MalformedURLException e) {
                    // keep the actor, just without a picture
                    e.printStackTrace();
                }
            }
            actorList.add(new Actor(actorName, "", actorThumb));
        }
        return actorList;
    }

    /**
     * @return a single-element list with the "Director:" entry, or an empty list if the
     *         entry is missing, empty, or starts with "-"
     */
    @Override
    public ArrayList<Director> scrapeDirectors() {
        ArrayList<Director> directorList = new ArrayList<>();
        Element directorElement = document.select("div.product-details dl dt:contains(Director:) + dd").first();
        if (directorElement != null) {
            String directorText = directorElement.text();
            if (directorText.length() > 0 && !directorText.startsWith("-")) {
                directorList.add(new Director(directorText, null));
            }
        }
        return directorList;
    }

    /**
     * @return the linked "Studio:" entry, or {@link Studio#BLANK_STUDIO} if missing or empty
     */
    @Override
    public Studio scrapeStudio() {
        Element studioElement = document.select("div.product-details dl dt:contains(Studio:) + dd a").first();
        if (studioElement != null) {
            String studioText = studioElement.text();
            if (studioText.length() > 0) {
                return new Studio(studioText);
            }
        }
        return Studio.BLANK_STUDIO;
    }

    /**
     * Builds the search word for a movie file: the ID tag followed by its number, zero-padded
     * to five digits. When the ID contains several tag/number pairs the last one is used.
     *
     * @param file the movie file to derive the ID from; also stored in {@code scrapedMovieFile}
     * @return the search word, or {@code null} if no usable tag/number pair could be derived
     */
    @Override
    public String createSearchString(File file) {
        scrapedMovieFile = file;

        // The general approach is search for 'tag' + '5-digit 0-padded number'.
        // This gets pretty good results, usually a perfect match,
        // or 2 to 5 results for clashing ids - still good for manual picking.
        String idTag = findIDTagFromFile(file, isFirstWordOfFileIsID());
        if (idTag == null) {
            return null;
        }
        String baseId = idTag.replace("-", "");

        String groupOne = "";
        String groupTwo = "";
        Matcher matcher = SEARCH_ID_PATTERN.matcher(baseId);
        while (matcher.find()) {
            groupOne = matcher.group(1);
            groupTwo = matcher.group(2);
        }
        if (groupOne == null || groupOne.isEmpty() || groupTwo == null || groupTwo.isEmpty()) {
            return null;
        }

        int number;
        try {
            number = Integer.parseInt(groupTwo);
        } catch (NumberFormatException e) {
            // digit run too long for an int: not an ID this site can be searched for
            return null;
        }

        // some h.m.p. titles need a '+' between tag and number
        if ("HODV".equalsIgnoreCase(groupOne)) {
            return String.format("%s+%05d", groupOne, number);
        }
        return String.format("%s%05d", groupOne, number);
    }

    /**
     * Runs one search on R18 and converts the result list into {@link SearchResult}s.
     * List entries without an image, or whose preview image URL is malformed, are skipped.
     *
     * @param searchWord the raw (not yet URL-encoded) search word
     * @return the results found, or {@code null} if there were none or the search failed
     */
    private SearchResult[] searchResultOnR18(String searchWord) {
        try {
            String searchWordURLEncoded = new URLCodec().encode(searchWord);
            String searchPattern = "http://www.r18.com/common/search/floor=movies/searchword=" + searchWordURLEncoded + "/";
            System.out.println("Searching on R18 with this URL:" + searchPattern);
            Document searchResultsPage = Jsoup.connect(searchPattern).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();

            List<SearchResult> foundResults = new ArrayList<>();
            for (Element searchResult : searchResultsPage.select(".cmn-list-product01 li")) {
                Element imageElement = searchResult.select("img").first();
                if (imageElement == null) {
                    continue;
                }
                String urlPath = searchResult.select("a").attr("href");
                String label = imageElement.attr("alt");
                try {
                    Thumb previewImage = new Thumb(imageElement.attr("data-original"));
                    foundResults.add(new SearchResult(urlPath, label, previewImage));
                } catch (MalformedURLException e) {
                    // one bad preview URL must not discard the remaining results
                    e.printStackTrace();
                }
            }
            if (!foundResults.isEmpty()) {
                return foundResults.toArray(new SearchResult[foundResults.size()]);
            }
        } catch (EncoderException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Searches R18 for the given search string. If nothing is found and the string looks like
     * "MD?D" + digits, the search is retried with the trailing 'D' of the tag removed.
     *
     * @param searchString the search word, normally produced by {@link #createSearchString(File)};
     *                     may be {@code null}
     * @return the results found; never {@code null}, empty if there are none
     */
    @Override
    public SearchResult[] getSearchResults(String searchString)
            throws IOException {
        SearchResult[] results = null;
        if (searchString != null) {
            results = searchResultOnR18(searchString);
            if (results == null) {
                // lots of old Moodyz titles are listed by their VHS tag,
                // those starting with 'MD' may get a good match removing the trailing 'D',
                // (MDED -> MDE, MDID -> MDI, MDLD -> MDL...)
                // result will be filtered during the amalgamation process though, need to fix that
                Matcher matcher = MOODYZ_ID_PATTERN.matcher(searchString);
                if (matcher.matches()) {
                    String moodyzSearchPattern = matcher.replaceAll("$1$2");
                    results = searchResultOnR18(moodyzSearchPattern);
                }
            }
        }
        if (results == null) {
            results = new SearchResult[0];
        }
        return results;
    }

    @Override
    public SiteParsingProfile newInstance() {
        return new R18ParsingProfile();
    }

    @Override
    public String toString() {
        return "R18.com";
    }
}