package moviescraper.doctord.controller.siteparsingprofile.specific; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.text.WordUtils; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import moviescraper.doctord.controller.languagetranslation.Language; import moviescraper.doctord.controller.languagetranslation.TranslateString; import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile; import moviescraper.doctord.model.SearchResult; import moviescraper.doctord.model.dataitem.Actor; import moviescraper.doctord.model.dataitem.Director; import moviescraper.doctord.model.dataitem.Genre; import moviescraper.doctord.model.dataitem.ID; import moviescraper.doctord.model.dataitem.MPAARating; import moviescraper.doctord.model.dataitem.OriginalTitle; import moviescraper.doctord.model.dataitem.Outline; import moviescraper.doctord.model.dataitem.Plot; import moviescraper.doctord.model.dataitem.Rating; import moviescraper.doctord.model.dataitem.ReleaseDate; import moviescraper.doctord.model.dataitem.Runtime; import moviescraper.doctord.model.dataitem.Set; import moviescraper.doctord.model.dataitem.SortTitle; import moviescraper.doctord.model.dataitem.Studio; import moviescraper.doctord.model.dataitem.Tagline; import moviescraper.doctord.model.dataitem.Thumb; import moviescraper.doctord.model.dataitem.Title; import moviescraper.doctord.model.dataitem.Top250; import moviescraper.doctord.model.dataitem.Trailer; import moviescraper.doctord.model.dataitem.Votes; import moviescraper.doctord.model.dataitem.Year; public class CaribbeancomParsingProfile extends SiteParsingProfile implements SpecificProfile { Document japaneseDocument; String id; boolean useTranslationOfJapanesePageForEnglishMetadata = true; private static final SimpleDateFormat caribbeanReleaseDateFormat = new SimpleDateFormat("yyyy/mm/dd", Locale.ENGLISH); @Override public String getParserName() { return "Caribbeancom"; } /** * loads up the japanese version of this page into japaneseDocument */ private void initializeJapaneseDocument() { if(document != null && japaneseDocument == null) { String url = "http://www.caribbeancom.com/moviepages/" + id + "/index.html"; japaneseDocument = SiteParsingProfile.downloadDocumentFromURLString(url); } } @Override public Title scrapeTitle() { Document documentToUse = document; Element titleElement = documentToUse.select("title").first(); //for now, we're always going to use the japanese page, as the below variable is always true if(useTranslationOfJapanesePageForEnglishMetadata) { initializeJapaneseDocument(); documentToUse = japaneseDocument; titleElement = documentToUse.select("div.video-detail h1[itemprop=name]").first(); } if(titleElement != null) { //We only sometimes do the translation of the japanese page, however if(getScrapingLanguage() == Language.ENGLISH) { return new Title(WordUtils.capitalize(TranslateString.translateStringJapaneseToEnglish(titleElement.text()))); } else return new Title(titleElement.text()); } return new Title(""); } @Override public OriginalTitle scrapeOriginalTitle() { initializeJapaneseDocument(); Element titleElement = japaneseDocument.select("div.video-detail h1[itemprop=name]").first(); if(titleElement != null) return new OriginalTitle(titleElement.text()); return OriginalTitle.BLANK_ORIGINALTITLE; } @Override public SortTitle scrapeSortTitle() { return SortTitle.BLANK_SORTTITLE; } @Override public Set scrapeSet() { return Set.BLANK_SET; } @Override public Rating scrapeRating() { initializeJapaneseDocument(); Element stars = japaneseDocument.select("div.movie-info dl dt:contains(ユーザー評価:) ~ dd ").first(); if(stars != null && stars.text().contains("★")) { //count the number of ★ characters, max number of stars is 5 and half stars not supported return new Rating(5.0, String.valueOf(stars.text().length())); } return new Rating(0,""); } @Override public Year scrapeYear() { return scrapeReleaseDate().getYear(); } @Override public ReleaseDate scrapeReleaseDate() { initializeJapaneseDocument(); Element releaseDate = japaneseDocument.select("div.movie-info dl dt:contains(配信日:) ~ dd ").first(); if(releaseDate != null && releaseDate.text().length() > 4) { return new ReleaseDate(releaseDate.text(), caribbeanReleaseDateFormat); } return ReleaseDate.BLANK_RELEASEDATE; } @Override public Top250 scrapeTop250() { // TODO Auto-generated method stub return Top250.BLANK_TOP250; } @Override public Votes scrapeVotes() { return Votes.BLANK_VOTES; } @Override public Outline scrapeOutline() { return Outline.BLANK_OUTLINE; } @Override public Plot scrapePlot() { initializeJapaneseDocument(); Element plotElement = japaneseDocument.select("div.movie-comment p").first(); if(plotElement != null && plotElement.text().length() > 0) { if(getScrapingLanguage() == Language.ENGLISH) return new Plot(TranslateString.translateStringJapaneseToEnglish(plotElement.text())); else return new Plot(plotElement.text()); } return Plot.BLANK_PLOT; } @Override public Tagline scrapeTagline() { return Tagline.BLANK_TAGLINE; } @Override public Runtime scrapeRuntime() { initializeJapaneseDocument(); Element durationElement = japaneseDocument.select("div.movie-info dl dt:contains(�?生時間:) ~ dd ").first(); if(durationElement != null && durationElement.text().trim().length() > 0) { String [] durationSplitByTimeUnit = durationElement.text().split(":"); if(durationSplitByTimeUnit.length == 3) { int hours = Integer.parseInt(durationSplitByTimeUnit[0]); int minutes = Integer.parseInt(durationSplitByTimeUnit[1]); //we don't care about seconds int totalMinutes = (hours * 60) + minutes; return new Runtime(new Integer(totalMinutes).toString()); } } return Runtime.BLANK_RUNTIME; } @Override public Thumb[] scrapePosters() { ID id = scrapeID(); ArrayList<Thumb> posters = new ArrayList<>(); if(id != null && id.getId().length() > 0) { String trailerPoster = "http://www.caribbeancom.com/moviepages/" + id.getId() + "/images/" + "l_l.jpg"; if(SiteParsingProfile.fileExistsAtURL(trailerPoster)) { try { posters.add(new Thumb(trailerPoster)); } catch (MalformedURLException e) { e.printStackTrace(); } } for(int imageNum = 1; imageNum <=5; imageNum++) { String additionalImageURLTemplate = "http://www.caribbeancom.com/moviepages/" + id.getId() + "/images/l/00" + imageNum + ".jpg"; String additionalImageURLTemplatePreview = "http://www.caribbeancom.com/moviepages/" + id.getId() + "/images/s/00" + imageNum + ".jpg"; if(SiteParsingProfile.fileExistsAtURL(additionalImageURLTemplate)) { try { Thumb additionalThumb = new Thumb(additionalImageURLTemplate); additionalThumb.setPreviewURL(new URL(additionalImageURLTemplatePreview)); posters.add(additionalThumb); } catch (MalformedURLException e) { e.printStackTrace(); } } } } return posters.toArray(new Thumb[posters.size()]); } @Override public Thumb[] scrapeFanart() { return scrapePosters(); } @Override public Thumb[] scrapeExtraFanart() { Thumb[] posters = scrapePosters(); List<Thumb> posterList = new LinkedList<>(Arrays.asList(posters)); if(posterList.size() > 0) posterList.remove(0); return posterList.toArray(new Thumb[posterList.size()]); } @Override public MPAARating scrapeMPAA() { return MPAARating.RATING_XXX; } @Override public ID scrapeID() { initializeJapaneseDocument(); //Just get the ID from the page URL by doing some string manipulation String baseUri = japaneseDocument.baseUri(); if(baseUri.length() > 0 && baseUri.contains("caribbeancom.com")) { baseUri = baseUri.replaceFirst("/index.html", ""); String idFromBaseUri = baseUri.substring(baseUri.lastIndexOf('/')+1); return new ID(idFromBaseUri); } return new ID(""); } @Override public ArrayList<Genre> scrapeGenres() { initializeJapaneseDocument(); ArrayList<Genre> genreList = new ArrayList<>(); Elements genres = japaneseDocument.select("div.movie-info dl.movie-info-cat:contains(カテゴリー:) dd "); if(genres != null) { for(Element currentGenre : genres){ if(currentGenre.text().trim().length() > 0) { String genreText = currentGenre.text(); //right now it's in Japanese since only the Japanese page has info on the genres if(getScrapingLanguage() == Language.ENGLISH) genreText = TranslateString.translateStringJapaneseToEnglish(currentGenre.text().trim()); genreList.add(new Genre(genreText)); } } } return genreList; } @Override public ArrayList<Actor> scrapeActors() { ArrayList<Actor> actorList = new ArrayList<>(); initializeJapaneseDocument(); //Element actorEnglishSearchElement = document.select("table.info_table tbody tr td.property:contains(Starring:) ~ td a").first(); Elements japaneseActors = japaneseDocument.select("div.movie-info dl dt:contains(出演:) ~ dd a"); //Disabling the english actor scraping and just going to use the japanese ones for now - the data for english actors //doesn't comma seperate each person /*if(actorEnglishSearchElement != null && getScrapingLanguage() == Language.ENGLISH) { String hrefText = actorEnglishSearchElement.attr("href"); hrefText = hrefText.replaceFirst(Pattern.quote("/eng/search/"),""); hrefText = hrefText.replaceFirst("/[0-9].html", ""); try { hrefText = URLDecoder.decode( hrefText, "UTF-8" ); String[] actorNames = hrefText.split(","); for(int i = 0; i < actorNames.length; i++) { actorList.add(new Actor(actorNames[i],"",null)); } } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } }*/ //Scrape actors from Japanese page for now and do a name translate if we are scraping in English if(japaneseActors != null) { for(Element japaneseActor : japaneseActors) { String actorName = japaneseActor.text(); if(scrapingLanguage == Language.ENGLISH) actorName = TranslateString.translateJapanesePersonNameToRomaji(actorName); actorList.add(new Actor(actorName,"",null)); } } return actorList; } @Override public ArrayList<Director> scrapeDirectors() { //No Director information on the site return new ArrayList<>(); } @Override public Studio scrapeStudio() { return new Studio("Caribbeancom"); } @Override public Trailer scrapeTrailer() { ID id = scrapeID(); if(id != null && id.getId().length() > 0) { String trailerPath = "http://smovie.caribbeancom.com/sample/movies/" + id.getId() + "/sample_m.mp4"; if(SiteParsingProfile.fileExistsAtURL(trailerPath)) return new Trailer(trailerPath); } return Trailer.BLANK_TRAILER; } @Override public String createSearchString(File file) { scrapedMovieFile = file; this.id = findIDTagFromFile(file); String englishPage = "http://en.caribbeancom.com/eng/moviepages/" + id + "/index.html"; return englishPage; } @Override public SearchResult[] getSearchResults(String searchString) throws IOException { SearchResult englishPage = new SearchResult(searchString); SearchResult [] results = {englishPage}; initializeJapaneseDocument(); return results; } public static String findIDTagFromFile(File file) { return findIDTag(FilenameUtils.getName(file.getName())); } public static String findIDTag(String fileName) { Pattern pattern = Pattern.compile("[0-9]{6}-[0-9]{3}"); Matcher matcher = pattern.matcher(fileName); if (matcher.find()) { String searchString = matcher.group(); return searchString; } return null; } @Override public SiteParsingProfile newInstance() { return new CaribbeancomParsingProfile(); } }