package moviescraper.doctord.controller.siteparsingprofile.specific;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FilenameUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import moviescraper.doctord.controller.languagetranslation.Language;
import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile;
import moviescraper.doctord.model.SearchResult;
import moviescraper.doctord.model.dataitem.Actor;
import moviescraper.doctord.model.dataitem.Director;
import moviescraper.doctord.model.dataitem.Genre;
import moviescraper.doctord.model.dataitem.ID;
import moviescraper.doctord.model.dataitem.MPAARating;
import moviescraper.doctord.model.dataitem.OriginalTitle;
import moviescraper.doctord.model.dataitem.Outline;
import moviescraper.doctord.model.dataitem.Plot;
import moviescraper.doctord.model.dataitem.Rating;
import moviescraper.doctord.model.dataitem.ReleaseDate;
import moviescraper.doctord.model.dataitem.Runtime;
import moviescraper.doctord.model.dataitem.Set;
import moviescraper.doctord.model.dataitem.SortTitle;
import moviescraper.doctord.model.dataitem.Studio;
import moviescraper.doctord.model.dataitem.Tagline;
import moviescraper.doctord.model.dataitem.Thumb;
import moviescraper.doctord.model.dataitem.Title;
import moviescraper.doctord.model.dataitem.Top250;
import moviescraper.doctord.model.dataitem.Trailer;
import moviescraper.doctord.model.dataitem.Votes;
import moviescraper.doctord.model.dataitem.Year;
/**
 * Site parsing profile for HEYZO movie pages.
 * <p>
 * The movie page is addressed by a four digit ID taken from the file name. The English
 * page lives on {@code en.heyzo.com} and the Japanese page on {@code www.heyzo.com}.
 * Whichever language is being scraped, the Japanese page is additionally downloaded in
 * {@link #createSearchString(File)} because the release date (and, when scraping in
 * English, the original title) is read from it.
 */
public class HeyzoParsingProfile extends SiteParsingProfile implements SpecificProfile {

    /** Matches the four digit movie ID inside a file name; compiled once and reused. */
    private static final Pattern ID_PATTERN = Pattern.compile("[0-9]{4}");

    private String englishPage;
    private String japanesePage;

    /** Japanese version of the movie page; {@code null} until fetched or if the fetch failed. */
    Document japaneseDocument;

    @Override
    public String getParserName() {
        return "HEYZO";
    }

    public HeyzoParsingProfile() {
        super();
    }

    /**
     * Reads the title from the {@code h1} inside {@code div#movie} of the current document,
     * collapsing runs of spaces into one.
     *
     * @return the title, or an empty title when the heading is not present
     */
    @Override
    public Title scrapeTitle() {
        Element titleElement = document.select("div#movie h1").first();
        if (titleElement != null) {
            String titleElementText = titleElement.text().trim().replaceAll("[ ]+", " ");
            return new Title(titleElementText);
        }
        return new Title("");
    }

    /**
     * Returns the title as it appears on the Japanese page.
     *
     * @return the Japanese title, or an empty original title when the Japanese page
     *         could not be downloaded
     */
    @Override
    public OriginalTitle scrapeOriginalTitle() {
        if (scrapingLanguage == Language.JAPANESE) {
            return new OriginalTitle(scrapeTitle().getTitle());
        }
        if (japaneseDocument == null) {
            // The Japanese page was never fetched (or the fetch failed); nothing to read.
            return new OriginalTitle("");
        }
        // Temporarily point scrapeTitle() at the Japanese page. The finally block
        // guarantees the real document is restored even if scraping throws.
        Document originalDocument = document;
        document = japaneseDocument;
        try {
            return new OriginalTitle(scrapeTitle().getTitle());
        } finally {
            document = originalDocument;
        }
    }

    @Override
    public SortTitle scrapeSortTitle() {
        return SortTitle.BLANK_SORTTITLE;
    }

    @Override
    public Set scrapeSet() {
        return Set.BLANK_SET;
    }

    /**
     * Always returns a blank rating.
     * <p>
     * The rating used to be scrapable, but the page now needs javascript to fill it in.
     * TODO: if jsoup is ever replaced by a javascript enabled parser, read the text of
     * the {@code #review-value} element of the Japanese page and build a rating out of
     * a maximum of 5.0 from it.
     */
    @Override
    public Rating scrapeRating() {
        return Rating.BLANK_RATING;
    }

    @Override
    public Year scrapeYear() {
        return scrapeReleaseDate().getYear();
    }

    /**
     * Reads the release date from the Japanese page.
     *
     * @return the release date, or {@link ReleaseDate#BLANK_RELEASEDATE} when the Japanese
     *         page is unavailable, the element is missing, or its text is 4 characters
     *         or shorter
     */
    @Override
    public ReleaseDate scrapeReleaseDate() {
        if (japaneseDocument == null) {
            return ReleaseDate.BLANK_RELEASEDATE;
        }
        Element releaseDateElement = japaneseDocument
                .select("div.movieInfo span.release-day + span.dataInfo").first();
        if (releaseDateElement != null) {
            String releaseDateText = releaseDateElement.text().trim();
            if (releaseDateText.length() > 4) {
                return new ReleaseDate(releaseDateText);
            }
        }
        return ReleaseDate.BLANK_RELEASEDATE;
    }

    @Override
    public Top250 scrapeTop250() {
        return Top250.BLANK_TOP250;
    }

    @Override
    public Votes scrapeVotes() {
        return Votes.BLANK_VOTES;
    }

    @Override
    public Outline scrapeOutline() {
        return Outline.BLANK_OUTLINE;
    }

    @Override
    public Plot scrapePlot() {
        return Plot.BLANK_PLOT;
    }

    @Override
    public Tagline scrapeTagline() {
        return Tagline.BLANK_TAGLINE;
    }

    /**
     * Reads the runtime from the "Whole Movie File Download" table, where it is written
     * as three colon separated fields. The first two are treated as hours and minutes;
     * the third is ignored.
     *
     * @return the runtime in whole minutes, or {@link Runtime#BLANK_RUNTIME} when the value
     *         is missing, malformed, or amounts to zero minutes
     */
    @Override
    public Runtime scrapeRuntime() {
        Element runtimeElement = document
                .select("tbody:contains(Whole Movie File Download) tr:contains(:) td").first();
        if (runtimeElement != null) {
            String[] runtimeTextSplit = runtimeElement.text().trim().split(":");
            if (runtimeTextSplit.length == 3) {
                try {
                    int hours = Integer.parseInt(runtimeTextSplit[0].trim());
                    int minutes = Integer.parseInt(runtimeTextSplit[1].trim());
                    int totalMinutes = (hours * 60) + minutes;
                    if (totalMinutes > 0) {
                        return new Runtime(Integer.toString(totalMinutes));
                    }
                } catch (NumberFormatException e) {
                    // The cell contained a colon but was not a numeric duration;
                    // treat it as "no runtime available" instead of aborting the scrape.
                }
            }
        }
        return Runtime.BLANK_RUNTIME;
    }

    /**
     * Collects the gallery images (numbered 001 to 021) that exist on the server, followed
     * by the still image used as the trailer preview.
     *
     * @return the thumbs found so far; if a URL turns out to be malformed the stack trace
     *         is printed and whatever was collected before the failure is returned
     */
    @Override
    public Thumb[] scrapePosters() {
        ArrayList<Thumb> thumbList = new ArrayList<>();
        String scrapedId = scrapeID().getId();
        try {
            // gallery links
            for (int i = 1; i <= 21; i++) {
                String imageNumber = String.format("%03d", i);
                String potentialGalleryImageURL = "http://en.heyzo.com/contents/3000/"
                        + scrapedId + "/gallery/" + imageNumber + ".jpg";
                String potentialGalleryPreviewImageURL = "http://en.heyzo.com/contents/3000/"
                        + scrapedId + "/gallery/thumbnail_" + imageNumber + ".jpg";
                if (SiteParsingProfile.fileExistsAtURL(potentialGalleryImageURL)) {
                    Thumb thumbToAdd = new Thumb(potentialGalleryImageURL);
                    thumbToAdd.setPreviewURL(new URL(potentialGalleryPreviewImageURL));
                    thumbList.add(thumbToAdd);
                }
            }
            // image that is the preview of the trailer
            Thumb trailerPreviewThumb = new Thumb("http://www.heyzo.com/contents/3000/"
                    + scrapedId + "/images/player_thumbnail_450.jpg");
            thumbList.add(trailerPreviewThumb);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return thumbList.toArray(new Thumb[thumbList.size()]);
    }

    /**
     * Builds the sample video URL for the scraped movie and returns it if a file exists there.
     *
     * @return the trailer, or {@link Trailer#BLANK_TRAILER} when no file exists at the URL
     */
    @Override
    public Trailer scrapeTrailer() {
        String scrapedId = scrapeID().getId();
        // The movie ID appears both in the directory and in the file name. The file name
        // previously hard-coded "0194", which could only ever match that single movie.
        String trailerURL = "http://sample.heyzo.com/contents/3000/" + scrapedId
                + "/heyzo_hd_" + scrapedId + "_sample.mp4";
        if (SiteParsingProfile.fileExistsAtURL(trailerURL)) {
            return new Trailer(trailerURL);
        }
        return Trailer.BLANK_TRAILER;
    }

    @Override
    public Thumb[] scrapeFanart() {
        return scrapePosters();
    }

    @Override
    public Thumb[] scrapeExtraFanart() {
        return scrapePosters();
    }

    @Override
    public MPAARating scrapeMPAA() {
        return MPAARating.RATING_XXX;
    }

    /**
     * Derives the movie ID from the current document's base URI: the last path segment
     * once a trailing {@code /index.html} has been removed.
     *
     * @return the ID, or {@link ID#BLANK_ID} when the base URI is not a heyzo.com address
     */
    @Override
    public ID scrapeID() {
        String baseUri = document.baseUri();
        if (baseUri.length() > 0 && baseUri.contains("heyzo.com")) {
            baseUri = baseUri.replaceFirst("/index.html", "");
            String idFromBaseUri = baseUri.substring(baseUri.lastIndexOf('/') + 1);
            return new ID(idFromBaseUri);
        }
        return ID.BLANK_ID;
    }

    /**
     * Collects the non-empty link texts of the category links in the movie info box.
     *
     * @return the genres, possibly empty, never {@code null}
     */
    @Override
    public ArrayList<Genre> scrapeGenres() {
        ArrayList<Genre> genreList = new ArrayList<>();
        // The attribute selector must be closed with ']'; an unbalanced selector is
        // rejected with an exception by newer jsoup versions.
        Elements genreElements = document.select("div.movieInfo a[href*=/listpages/category]");
        for (Element currentGenre : genreElements) {
            String genreText = currentGenre.text().trim();
            if (genreText.length() > 0) {
                genreList.add(new Genre(genreText));
            }
        }
        return genreList;
    }

    /**
     * Collects the actors linked from the movie info box. Each actor gets a profile
     * thumbnail when one can be derived from the link and exists on the server;
     * otherwise the actor is added without a thumbnail.
     *
     * @return the actors, possibly empty, never {@code null}
     */
    @Override
    public ArrayList<Actor> scrapeActors() {
        Elements actorElements = document
                .select("div.movieInfo span.dataInfo a[href*=/listpages/actor]");
        ArrayList<Actor> actorList = new ArrayList<>();
        for (Element currentActor : actorElements) {
            String actorName = currentActor.text().trim();
            String actorThumbUrl = buildActorThumbUrl(currentActor.attr("href"));
            Thumb actorThumb = null;
            if (actorThumbUrl != null && SiteParsingProfile.fileExistsAtURL(actorThumbUrl)) {
                try {
                    actorThumb = new Thumb(actorThumbUrl);
                } catch (MalformedURLException e) {
                    // Keep the actor, just without a thumbnail.
                    e.printStackTrace();
                }
            }
            actorList.add(new Actor(actorName, "", actorThumb));
        }
        return actorList;
    }

    /**
     * Builds the profile image URL for an actor link. The actor number is taken to be the
     * text between the first and second underscore of the href.
     *
     * @param actorHref the {@code href} attribute of the actor link
     * @return the profile image URL, or {@code null} when the href has no underscore
     *         separated second segment or that segment is not an integer
     */
    private static String buildActorThumbUrl(String actorHref) {
        if (actorHref == null || actorHref.isEmpty()) {
            return null;
        }
        String[] splitHrefByUnderscore = actorHref.split("_");
        if (splitHrefByUnderscore.length < 2) {
            return null;
        }
        try {
            int actorNumber = Integer.parseInt(splitHrefByUnderscore[1]);
            return "http://en.heyzo.com/actorprofile/3000/"
                    + String.format("%04d", actorNumber) + "/profile.jpg";
        } catch (NumberFormatException e) {
            return null;
        }
    }

    @Override
    public ArrayList<Director> scrapeDirectors() {
        return new ArrayList<>();
    }

    @Override
    public Studio scrapeStudio() {
        return new Studio("HEYZO");
    }

    /**
     * Works out the movie page URL for a file and downloads the Japanese page as a side
     * effect, storing it in {@link #japaneseDocument}.
     *
     * @param file the movie file whose name contains the four digit ID
     * @return the English page URL when scraping in English, otherwise the Japanese page
     *         URL; {@code null} when no ID can be found in the file name
     */
    @Override
    public String createSearchString(File file) {
        scrapedMovieFile = file;
        // Forget any page left over from a previously scraped file so that a failed
        // download below cannot leak the old movie's data into this one.
        japaneseDocument = null;

        // The ID consists of digits only, so no case normalization is needed.
        String fileID = findIDTagFromFile(file);
        if (fileID == null) {
            return null;
        }

        englishPage = "http://en.heyzo.com/moviepages/" + fileID + "/index.html";
        japanesePage = "http://www.heyzo.com/moviepages/" + fileID + "/index.html";
        try {
            japaneseDocument = Jsoup.connect(japanesePage)
                    .timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get();
        } catch (IOException e) {
            // Best effort: scraping continues without the Japanese page.
            e.printStackTrace();
        }
        return (scrapingLanguage == Language.ENGLISH) ? englishPage : japanesePage;
    }

    /**
     * Wraps the search string (already the movie page URL) in a single search result.
     *
     * @param searchString the URL produced by {@link #createSearchString(File)}
     * @return a one element array holding that URL
     */
    @Override
    public SearchResult[] getSearchResults(String searchString)
            throws IOException {
        SearchResult searchResult = new SearchResult(searchString);
        SearchResult[] searchResultArray = {searchResult};
        return searchResultArray;
    }

    /**
     * Finds the movie ID in the name of a file.
     *
     * @param file the movie file; may be {@code null}
     * @return the first four consecutive digits of the file name, or {@code null} if the
     *         file is {@code null} or its name has no such run
     */
    public static String findIDTagFromFile(File file) {
        if (file == null) {
            return null;
        }
        return findIDTag(FilenameUtils.getName(file.getName()));
    }

    /**
     * Finds the movie ID in a file name.
     *
     * @param fileName the name to search; may be {@code null}
     * @return the first four consecutive digits, or {@code null} if there are none
     */
    public static String findIDTag(String fileName) {
        if (fileName == null) {
            return null;
        }
        Matcher matcher = ID_PATTERN.matcher(fileName);
        if (matcher.find()) {
            return matcher.group();
        }
        return null;
    }

    @Override
    public SiteParsingProfile newInstance() {
        return new HeyzoParsingProfile();
    }
}