package moviescraper.doctord.controller.siteparsingprofile.specific;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import moviescraper.doctord.controller.siteparsingprofile.SecurityPassthrough;
import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile;
import moviescraper.doctord.model.Movie;
import moviescraper.doctord.model.SearchResult;
import moviescraper.doctord.model.dataitem.Actor;
import moviescraper.doctord.model.dataitem.Director;
import moviescraper.doctord.model.dataitem.Genre;
import moviescraper.doctord.model.dataitem.ID;
import moviescraper.doctord.model.dataitem.MPAARating;
import moviescraper.doctord.model.dataitem.OriginalTitle;
import moviescraper.doctord.model.dataitem.Outline;
import moviescraper.doctord.model.dataitem.Plot;
import moviescraper.doctord.model.dataitem.Rating;
import moviescraper.doctord.model.dataitem.ReleaseDate;
import moviescraper.doctord.model.dataitem.Runtime;
import moviescraper.doctord.model.dataitem.Set;
import moviescraper.doctord.model.dataitem.SortTitle;
import moviescraper.doctord.model.dataitem.Studio;
import moviescraper.doctord.model.dataitem.Tagline;
import moviescraper.doctord.model.dataitem.Thumb;
import moviescraper.doctord.model.dataitem.Title;
import moviescraper.doctord.model.dataitem.Top250;
import moviescraper.doctord.model.dataitem.Votes;
import moviescraper.doctord.model.dataitem.Year;
/**
 * {@link SiteParsingProfile} that scrapes American adult DVD metadata from data18.com
 * movie pages. Supports both the pre-September-2014 page layout and the layout
 * introduced in March 2015. Image URLs are rerouted from the European server IP to
 * the American one to work around HTTP 302 redirects that break downloads
 * (Github issue 97).
 */
public class Data18MovieParsingProfile extends SiteParsingProfile implements SpecificProfile, SecurityPassthrough {

	/** User agent sent with every outbound jsoup request so the site serves normal pages. */
	private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0";

	/** Upper bound on viewer pages walked per gallery when collecting extra fanart. */
	private static final int MAX_VIEWER_PAGES_PER_GALLERY = 15;

	//when true, use data18's own search page; flips to false to fall back to a Google site search
	boolean useSiteSearch = true;
	//4-digit year parsed from a trailing "(2012)" / "[2012]" token of the file name, if present
	String yearFromFilename = "";
	//base name of the file being scraped; kept for the Google-search fallback
	String fileName;
	//cache of scrapeExtraFanart() results so the expensive gallery walk runs at most once
	Thumb[] scrapedExtraFanart;
	boolean hasRunScrapeExtraFanart = false;
	//I've unfortunately had to make this static due to the current mess of a way this type of scraping is done where the object used
	//to create the search results is not the same as the object used to actually scrape the document.
	private static HashMap<String, String> releaseDateMap;

	@Override
	public List<ScraperGroupName> getScraperGroupNames() {
		if (groupNames == null)
			groupNames = Arrays.asList(ScraperGroupName.AMERICAN_ADULT_DVD_SCRAPER_GROUP);
		return groupNames;
	}

	/** Scrapes the movie title from the page heading; blank Title when absent. */
	@Override
	public Title scrapeTitle() {
		Element titleElement = document.select("div#centered.main2 div h1").first();
		if (titleElement != null)
			return new Title(titleElement.text());
		else
			return new Title("");
	}

	@Override
	public OriginalTitle scrapeOriginalTitle() {
		//site only lists one title, so there is no separate original title
		return OriginalTitle.BLANK_ORIGINALTITLE;
	}

	@Override
	public SortTitle scrapeSortTitle() {
		//not provided by this site
		return SortTitle.BLANK_SORTTITLE;
	}

	/** Scrapes the series/set name from the series link on the page, if any. */
	@Override
	public Set scrapeSet() {
		Element setElement = document.select("div div.p8 div p a[href*=/series/]").first();
		if (setElement != null)
			return new Set(setElement.text());
		else
			return Set.BLANK_SET;
	}

	@Override
	public Rating scrapeRating() {
		//not provided by this site
		return Rating.BLANK_RATING;
	}

	/**
	 * Scrapes the release year, trying the pre-September-2014 layout first and
	 * then the March 2015 layout. Returns {@link Year#BLANK_YEAR} when no year
	 * can be found.
	 */
	@Override
	public Year scrapeYear() {
		//old method before site update in September 2014
		Element releaseDateElement = document.select("div p:contains(Release Date:) b").first();
		//new method after site update in mar 2015
		if (releaseDateElement == null) {
			releaseDateElement = document.select("div.p8 div.gen12 p:contains(Release Date:), div.p8 div.gen12 p:contains(Production Year:)").first();
			if (releaseDateElement != null) {
				String releaseDateText = releaseDateElement.text().trim();
				final Pattern pattern = Pattern.compile("(\\d{4})"); //4 digit years
				final Matcher matcher = pattern.matcher(releaseDateText);
				if (matcher.find()) {
					//first 4-digit run in the text is taken as the year
					return new Year(matcher.group(1));
				}
				if (releaseDateText.length() > 4) {
					//just get the first 4 letters which is the year
					return new Year(releaseDateText.substring(0, 4));
				} else
					return Year.BLANK_YEAR;
			}
		} else {
			String releaseDateText = releaseDateElement.text().trim();
			//just get the last 4 letters which is the year
			if (releaseDateText.length() >= 4) {
				return new Year(releaseDateText.substring(releaseDateText.length() - 4));
			}
		}
		return Year.BLANK_YEAR;
	}

	@Override
	public Top250 scrapeTop250() {
		//not applicable to this site
		return Top250.BLANK_TOP250;
	}

	@Override
	public Votes scrapeVotes() {
		//not provided by this site
		return Votes.BLANK_VOTES;
	}

	@Override
	public Outline scrapeOutline() {
		//not provided by this site
		return Outline.BLANK_OUTLINE;
	}

	/** Scrapes the plot from the "Description:" paragraph, stripping the label. */
	@Override
	public Plot scrapePlot() {
		Element plotElement = document.select("p.gen12:contains(Description:)").first();
		if (plotElement != null) {
			String plotText = plotElement.text();
			if (plotText.startsWith("Description:")) {
				//strip the label; trim removes the space that followed "Description:"
				plotText = plotText.replaceFirst("Description:", "").trim();
			}
			return new Plot(plotText);
		}
		return Plot.BLANK_PLOT;
	}

	@Override
	public Tagline scrapeTagline() {
		//not provided by this site
		return Tagline.BLANK_TAGLINE;
	}

	/** Scrapes the runtime in minutes from the "Length:" paragraph. */
	@Override
	public Runtime scrapeRuntime() {
		Element runtimeElement = document.select("p.gen12:contains(Length:)").first();
		if (runtimeElement != null) {
			String runtimeElementText = runtimeElement.text().replaceFirst(Pattern.quote("Length:"), "").replaceFirst(Pattern.quote(" min."), "").trim();
			return new Runtime(runtimeElementText);
		} else
			return Runtime.BLANK_RUNTIME;
	}

	/** Scrapes the front cover from the first cover link; empty array on failure. */
	@Override
	public Thumb[] scrapePosters() {
		Element posterElement = document.select("a[rel=covers]").first();
		if (posterElement != null) {
			try {
				Thumb[] posterThumbs = new Thumb[1];
				posterThumbs[0] = new Thumb(fixIPAddressOfData18(posterElement.attr("href")));
				return posterThumbs;
			} catch (MalformedURLException e) {
				e.printStackTrace();
				return new Thumb[0];
			}
		}
		return new Thumb[0];
	}

	/**
	 * Fix for Github issue 97 (https://github.com/DoctorD1501/JAVMovieScraper/issues/97)
	 * The european IP address for galleries gives us a HTTP response code of 302 (redirect), which prevents us from downloading things
	 * we will route to the american IP address instead
	 */
	private String fixIPAddressOfData18(String mainImageUrl) {
		if (mainImageUrl == null)
			return mainImageUrl;
		//the IPs are quoted so the dots are matched literally rather than as regex wildcards
		//tends to be links for main cover, etc
		String stringWithIPAdressReplaced = mainImageUrl.replaceFirst(Pattern.quote("94.229.67.74"), "74.50.117.45");
		//tends to be image gallery on movie page
		stringWithIPAdressReplaced = stringWithIPAdressReplaced.replaceFirst(Pattern.quote("78.110.165.210"), "74.50.117.48");
		return stringWithIPAdressReplaced;
	}

	/**
	 * Scrapes fanart: the back cover (when present) plus all extra fanart
	 * gathered by {@link #scrapeExtraFanart()}.
	 */
	@Override
	public Thumb[] scrapeFanart() {
		if (!hasRunScrapeExtraFanart && scrapedExtraFanart == null) {
			scrapeExtraFanart();
		}
		Element posterElement = document.select("a[rel=covers]:contains(Back Cover)").first();
		if (posterElement != null) {
			try {
				Thumb[] posterThumbs = new Thumb[1];
				posterThumbs[0] = new Thumb(fixIPAddressOfData18(posterElement.attr("href")));
				return ArrayUtils.addAll(scrapedExtraFanart, posterThumbs);
			} catch (MalformedURLException e) {
				e.printStackTrace();
				if (scrapedExtraFanart != null)
					return scrapedExtraFanart;
				else
					return new Thumb[0];
			}
		}
		if (scrapedExtraFanart != null)
			return scrapedExtraFanart;
		else
			return new Thumb[0];
	}

	/**
	 * Walks the scene galleries linked from the movie page and collects every
	 * full-size image (with preview and viewer URLs when available) as extra
	 * fanart. Results are cached; subsequent calls return the cached array.
	 */
	@Override
	public Thumb[] scrapeExtraFanart() {
		hasRunScrapeExtraFanart = true;
		if (scrapedExtraFanart != null) {
			return scrapedExtraFanart;
		}
		ArrayList<String> galleryIDs = findGalleryIDs(findSceneContentIDs());
		ArrayList<Thumb> extraFanart = new ArrayList<>();
		for (String galleryID : galleryIDs) {
			scrapeViewerPagesOfGallery(galleryID, extraFanart);
		}
		scrapedExtraFanart = extraFanart.toArray(new Thumb[extraFanart.size()]);
		System.out.println("Number of Thumbs: " + scrapedExtraFanart.length);
		return scrapedExtraFanart;
	}

	/** Finds the content IDs of the split scene links on a full movie page. */
	private ArrayList<String> findSceneContentIDs() {
		Elements sceneContentLinks = document.select("div[onmouseout]:matches(Scene \\d\\d?)");
		ArrayList<String> contentLinks = new ArrayList<>();
		if (sceneContentLinks != null) {
			//get just the id from url of the content
			for (Element sceneContentLink : sceneContentLinks) {
				Element linkElement = sceneContentLink.select("a[href*=/content/]").first();
				if (linkElement != null) {
					String linkElementURL = linkElement.attr("href");
					if (linkElementURL.contains("/")) {
						contentLinks.add(linkElementURL.substring(linkElementURL.lastIndexOf("/") + 1));
					}
				}
			}
		}
		return contentLinks;
	}

	/**
	 * Resolves each content ID to the gallery ID used by the image viewer,
	 * dropping duplicates while preserving first-seen order. Unreachable
	 * galleries are skipped (best effort) rather than aborting the scrape.
	 */
	private ArrayList<String> findGalleryIDs(ArrayList<String> contentIDs) {
		// Checking for changed and/or different contentIDs from the main/root item and building a new array
		ArrayList<String> galleryLinks = new ArrayList<>();
		for (String myID : contentIDs) {
			String currentGalleryURL = "http://www.data18.com/content/" + myID;
			try {
				Document galleryDocument = Jsoup.connect(currentGalleryURL).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).userAgent(USER_AGENT).get();
				if (galleryDocument != null) {
					Element linkElement = galleryDocument.select("div a[href*=/viewer/]").first();
					if (linkElement != null) {
						String linkElementURL = linkElement.attr("href");
						if (linkElementURL.contains("/")) {
							//viewer URLs look like http://www.data18.com/viewer/<galleryID>/01 — part 4 is the gallery ID
							String[] parts = linkElementURL.split("/");
							if (parts.length > 4)
								galleryLinks.add(parts[4]);
						}
					}
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		// Results would be duplicated due to "Scene 1: xxxxxxxx" as well as "scene 1" later.
		// Just removing those to get a clean list of galleries
		ArrayList<String> resultList = new ArrayList<>();
		HashSet<String> seen = new HashSet<>();
		for (String link : galleryLinks) {
			if (seen.add(link))
				resultList.add(link);
		}
		return resultList;
	}

	/**
	 * Walks up to {@link #MAX_VIEWER_PAGES_PER_GALLERY} viewer pages of one
	 * gallery, adding each found image to {@code extraFanart}. A missing or
	 * unreachable page is skipped rather than aborting the walk.
	 */
	private void scrapeViewerPagesOfGallery(String contentID, ArrayList<Thumb> extraFanart) {
		for (int viewerPageNumber = 1; viewerPageNumber <= MAX_VIEWER_PAGES_PER_GALLERY; viewerPageNumber++) {
			String currentViewerPageURL = "http://www.data18.com/viewer/" + contentID + "/" + String.format("%02d", viewerPageNumber);
			try {
				Document viewerDocument = Jsoup.connect(currentViewerPageURL).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).userAgent(USER_AGENT).get();
				if (viewerDocument != null) {
					Element imgElement = viewerDocument.select("div#post_view a[href*=/viewer/] img").first();
					if (imgElement != null) {
						String mainImageUrl = imgElement.attr("src");
						Thumb thumbToAdd = new Thumb(fixIPAddressOfData18(mainImageUrl));
						//preview images live in a th8/ subfolder before the final file segment
						//(assumes the file name portion is 6 characters, e.g. "01.jpg" — matches data18's viewer URLs)
						String previewURL = mainImageUrl.substring(0, mainImageUrl.length() - 6) + "th8/" + mainImageUrl.substring(mainImageUrl.length() - 6);
						if (fileExistsAtURL(previewURL))
							thumbToAdd.setPreviewURL(new URL(fixIPAddressOfData18(previewURL)));
						thumbToAdd.setViewerURL(new URL(currentViewerPageURL));
						extraFanart.add(thumbToAdd);
					}
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	@Override
	public MPAARating scrapeMPAA() {
		//everything on this site is adult content
		return MPAARating.RATING_XXX;
	}

	@Override
	public ID scrapeID() {
		//the site exposes no catalog ID on the movie page
		return ID.BLANK_ID;
	}

	/** Scrapes the genre list from the "Categories:" links (both page layouts). */
	@Override
	public ArrayList<Genre> scrapeGenres() {
		ArrayList<Genre> genreList = new ArrayList<>();
		Elements genreElements = document.select("div.gen12:has(b:containsOwn(Categories:)) p a[href*=/movies/], div.p8:has(div:containsOwn(Categories:)) a[href*=/movies/]");
		if (genreElements != null) {
			for (Element currentGenreElement : genreElements) {
				String genreText = currentGenreElement.text().trim();
				if (genreText != null && genreText.length() > 0)
					genreList.add(new Genre(genreText));
			}
		}
		return genreList;
	}

	/**
	 * Scrapes the performer list: thumbnailed performers from the cast strip,
	 * then text-only performers linked through data18's /dev/ pages.
	 */
	@Override
	public ArrayList<Actor> scrapeActors() {
		Elements actorElements = document.select("p.line1 a img");
		ArrayList<Actor> actorList = new ArrayList<>();
		if (actorElements != null) {
			for (Element currentActorElement : actorElements) {
				String actorName = currentActorElement.attr("alt");
				String actorThumbnail = currentActorElement.attr("src");
				//case with actor with thumbnail
				if (actorThumbnail != null && !actorThumbnail.equals("http://img.data18.com/images/no_prev_60.gif")) {
					try {
						//swap in the larger 120px headshot instead of the 60px one
						actorThumbnail = actorThumbnail.replaceFirst(Pattern.quote("/60/"), "/120/");
						actorList.add(new Actor(actorName, null, new Thumb(actorThumbnail)));
					} catch (MalformedURLException e) {
						actorList.add(new Actor(actorName, null, null));
						e.printStackTrace();
					}
				}
				//add the actor with no thumbnail
				else {
					actorList.add(new Actor(actorName, null, null));
				}
			}
		}
		Elements otherActors = document.select("[href^=http://www.data18.com/dev/]");
		if (otherActors != null) {
			for (Element element : otherActors) {
				//the link text holds the performer's name for these entries
				String actorName = element.childNode(0).toString();
				actorList.add(new Actor(actorName, null, null));
			}
		}
		return actorList;
	}

	/** Scrapes the director, skipping the placeholder name "Unknown". */
	@Override
	public ArrayList<Director> scrapeDirectors() {
		ArrayList<Director> directorList = new ArrayList<>();
		Element directorElement = document.select("a[href*=director=]").first();
		if (directorElement != null) {
			String directorName = directorElement.text();
			if (directorName != null && directorName.length() > 0 && !directorName.equals("Unknown"))
				directorList.add(new Director(directorName, null));
		}
		return directorList;
	}

	/** Scrapes the studio name from the studios link; blank Studio when absent. */
	@Override
	public Studio scrapeStudio() {
		Element studioElement = document.select("div div.p8 div p a[href*=/studios/]").first();
		if (studioElement != null) {
			String studioText = studioElement.text().trim();
			if (studioText != null && studioText.length() > 0)
				return new Studio(studioText);
		}
		return Studio.BLANK_STUDIO;
	}

	/**
	 * Builds the search string for a file: strips any trailing "(year)"/"[year]"
	 * token (remembering the year), then either URL-encodes the name into a
	 * data18 search URL or returns the bare base name for the Google fallback.
	 */
	@Override
	public String createSearchString(File file) {
		scrapedMovieFile = file;
		String fileBaseName;
		if (file.isFile())
			fileBaseName = FilenameUtils.getBaseName(Movie.getUnstackedMovieName(file));
		else
			fileBaseName = file.getName();
		fileName = fileBaseName;
		String[] splitBySpace = fileBaseName.split(" ");
		if (splitBySpace.length > 1) {
			//check if last word in filename contains a year like (2012) or [2012]
			if (splitBySpace[splitBySpace.length - 1].matches("[\\(\\[]\\d{4}[\\)\\]]")) {
				yearFromFilename = splitBySpace[splitBySpace.length - 1].replaceAll("[\\(\\[\\)\\]]", "");
				fileBaseName = fileBaseName.replaceFirst("[\\(\\[]\\d{4}[\\)\\]]", "").trim();
			}
		}
		if (useSiteSearch) {
			URLCodec codec = new URLCodec();
			try {
				fileBaseName = codec.encode(fileBaseName);
			} catch (EncoderException e) {
				//fall through with the unencoded name; the search may still succeed
				e.printStackTrace();
			}
			fileBaseName = "http://www.data18.com/search/?k=" + fileBaseName + "&t=2";
			return fileBaseName;
		}
		return FilenameUtils.getBaseName(file.getName());
	}

	/**
	 * Runs the search: uses data18's own search page first, falling back to a
	 * Google site search when it yields no results. Release dates shown in the
	 * search results are stashed in {@link #releaseDateMap} because the movie
	 * page itself does not carry the full date.
	 */
	@Override
	public SearchResult[] getSearchResults(String searchString) throws IOException {
		if (useSiteSearch) {
			ArrayList<SearchResult> linksList = new ArrayList<>();
			Document doc = Jsoup.connect(searchString).userAgent(USER_AGENT).ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
			Elements movieSearchResultElements = doc.select("div[style=float: left; padding: 6px; width: 130px;]");
			if (movieSearchResultElements == null || movieSearchResultElements.size() == 0) {
				this.useSiteSearch = false;
				return getLinksFromGoogle(fileName.replace("-", ""), "data18.com/movies/");
			} else {
				for (Element currentMovie : movieSearchResultElements) {
					String currentMovieURL = currentMovie.select("a").first().attr("href");
					String currentMovieTitle = currentMovie.select("a").last().text();
					String releaseDateText = currentMovie.ownText();
					if (releaseDateText != null && releaseDateText.length() > 0)
						currentMovieTitle = currentMovieTitle + " (" + releaseDateText + ")";
					Thumb currentMovieThumb = new Thumb(currentMovie.select("img").attr("src"));
					linksList.add(new SearchResult(currentMovieURL, currentMovieTitle, currentMovieThumb));
					if (releaseDateMap == null)
						releaseDateMap = new HashMap<>();
					//I'm putting into a static variable that never gets freed, so this could be a potential memory leak
					//TODO: find a better way to do this without a global variable
					releaseDateMap.put(currentMovieURL, releaseDateText);
				}
				return linksList.toArray(new SearchResult[linksList.size()]);
			}
		} else {
			this.useSiteSearch = false;
			return getLinksFromGoogle(searchString, "data18.com/movies/");
		}
	}

	@Override
	public String toString() {
		return "Data18Movie";
	}

	@Override
	public SiteParsingProfile newInstance() {
		return new Data18MovieParsingProfile();
	}

	@Override
	public String getParserName() {
		return "Data18 Movie";
	}

	/**
	 * Returns the full release date remembered from the search results for the
	 * currently scraped page, since the movie page itself only shows the year.
	 */
	@Override
	public ReleaseDate scrapeReleaseDate() {
		//Unfortunately this data is not available on full on the page we are scraping, so we store the info from the search result
		//creation and retrieve it here
		if (releaseDateMap != null && releaseDateMap.containsKey(document.location())) {
			String releaseDate = releaseDateMap.get(document.location());
			if (releaseDate != null && releaseDate.length() > 4)
				return new ReleaseDate(releaseDate);
		}
		return ReleaseDate.BLANK_RELEASEDATE;
	}

	@Override
	public boolean requiresSecurityPassthrough(Document document) {
		//delegated to the shared data18 helper used by all data18 profiles
		return Data18SharedMethods.requiresSecurityPassthrough(document);
	}

	@Override
	public Document runSecurityPassthrough(Document document, SearchResult originalSearchResult) {
		//delegated to the shared data18 helper used by all data18 profiles
		return Data18SharedMethods.runSecurityPassthrough(document, originalSearchResult);
	}
}