package moviescraper.doctord.controller.siteparsingprofile.specific; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.regex.Pattern; import org.apache.commons.codec.net.URLCodec; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import moviescraper.doctord.controller.languagetranslation.JapaneseCharacter; import moviescraper.doctord.controller.languagetranslation.TranslateString; import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile; import moviescraper.doctord.model.SearchResult; import moviescraper.doctord.model.dataitem.Actor; import moviescraper.doctord.model.dataitem.Director; import moviescraper.doctord.model.dataitem.Genre; import moviescraper.doctord.model.dataitem.ID; import moviescraper.doctord.model.dataitem.MPAARating; import moviescraper.doctord.model.dataitem.OriginalTitle; import moviescraper.doctord.model.dataitem.Outline; import moviescraper.doctord.model.dataitem.Plot; import moviescraper.doctord.model.dataitem.Rating; import moviescraper.doctord.model.dataitem.ReleaseDate; import moviescraper.doctord.model.dataitem.Runtime; import moviescraper.doctord.model.dataitem.Set; import moviescraper.doctord.model.dataitem.SortTitle; import moviescraper.doctord.model.dataitem.Studio; import moviescraper.doctord.model.dataitem.Tagline; import moviescraper.doctord.model.dataitem.Thumb; import moviescraper.doctord.model.dataitem.Title; import moviescraper.doctord.model.dataitem.Top250; import moviescraper.doctord.model.dataitem.Votes; import moviescraper.doctord.model.dataitem.Year; public class JavZooParsingProfile extends SiteParsingProfile implements SpecificProfile { private static final String siteLanguageToScrape = "en"; @Override public List<ScraperGroupName> getScraperGroupNames() { if(groupNames == null) groupNames = Arrays.asList(ScraperGroupName.JAV_CENSORED_SCRAPER_GROUP); return groupNames; } public JavZooParsingProfile(Document doc) { super(doc); } public JavZooParsingProfile() { // TODO Auto-generated constructor stub } @Override public Title scrapeTitle() { Element titleElement = document.select("div.container h3").first(); if(titleElement != null) { //remove the ID number off beginning of the title, if it exists (and it usually always does on JavLibrary) String titleElementText = titleElement.text().trim(); titleElementText = titleElementText.substring(StringUtils.indexOf(titleElementText," ")).trim(); //sometimes this still leaves "- " at the start of the title, so we'll want to get rid of that too if(titleElementText.startsWith("- ")) { titleElementText = titleElementText.replaceFirst(Pattern.quote("- "), ""); } //sometimes title is not translated to english if (document.location().contains("/en/")) if (JapaneseCharacter.containsJapaneseLetter(titleElementText)) return new Title(TranslateString.translateStringJapaneseToEnglish(titleElementText)); return new Title(titleElementText); } else return new Title(""); } @Override public OriginalTitle scrapeOriginalTitle() { try { Element titleElement = document.select("div.container h3").first(); if(titleElement != null) { //remove the ID number off beginning of the title, if it exists (and it usually always does on JavLibrary) String titleElementText = titleElement.text().trim(); titleElementText = titleElementText.substring(StringUtils.indexOf(titleElementText," ")).trim(); //sometimes this still leaves "- " at the start of the title, so we'll want to get rid of that too if(titleElementText.startsWith("- ")) { titleElementText = titleElementText.replaceFirst(Pattern.quote("- "), ""); } //sometimes title is not translated on the english site if (JapaneseCharacter.containsJapaneseLetter(titleElementText)) return new OriginalTitle(titleElementText); // scrape japanese site for original text String japaneseUrl = document.location().replaceFirst(Pattern.quote("/en/"), "/ja/"); if (japaneseUrl.equals(document.location())) return new OriginalTitle(titleElementText); Document japaneseDoc = Jsoup.connect(japaneseUrl).timeout(CONNECTION_TIMEOUT_VALUE).get(); JavZooParsingProfile spp = new JavZooParsingProfile(japaneseDoc); return spp.scrapeOriginalTitle(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return OriginalTitle.BLANK_ORIGINALTITLE; } @Override public SortTitle scrapeSortTitle() { // we don't need any special sort title - that's usually something the // user provides return SortTitle.BLANK_SORTTITLE; } @Override public Set scrapeSet() { Element setElement = document.select("div.container p:contains(Series:) ~ p a").first(); if(setElement != null) { return new Set(setElement.text().trim()); } else return Set.BLANK_SET; } @Override public Rating scrapeRating() { // this site does not have ratings, so just return some default values return Rating.BLANK_RATING; } @Override public Year scrapeYear() { return scrapeReleaseDate().getYear(); } @Override public ReleaseDate scrapeReleaseDate() { Element releaseDateElement = document.select("div.container p:contains(Release Date:), div.container p:contains(發行日期:)").first(); if(releaseDateElement != null) { String releaseDateText = releaseDateElement.text().trim(); releaseDateText = releaseDateText.replace("Release Date:", ""); releaseDateText = releaseDateText.replace("發行日期:", ""); if(releaseDateText != null && releaseDateText.length() > 4) return new ReleaseDate(releaseDateText.trim()); } return ReleaseDate.BLANK_RELEASEDATE; } @Override public Top250 scrapeTop250() { // This type of info doesn't exist on JavZoo return Top250.BLANK_TOP250; } @Override public Votes scrapeVotes() { //This type of info doesn't exist on JavZoo return Votes.BLANK_VOTES; } @Override public Outline scrapeOutline() { //This type of info doesn't exist on JavZoo return Outline.BLANK_OUTLINE; } @Override public Plot scrapePlot() { //This type of info doesn't exist on JavZoo return Plot.BLANK_PLOT; } @Override public Tagline scrapeTagline() { //This type of info doesn't exist on JavZoo return Tagline.BLANK_TAGLINE; } @Override public Runtime scrapeRuntime() { Element runtimeElement = document.select("div.container p:contains(Length:)").first(); if(runtimeElement != null) { String lengthText = runtimeElement.text().trim(); lengthText = lengthText.replaceFirst(Pattern.quote("Length: "), ""); lengthText = lengthText.replaceFirst(Pattern.quote("min"), ""); if(lengthText.length() > 0) { return new Runtime(lengthText); } } return Runtime.BLANK_RUNTIME; } @Override public Thumb[] scrapePosters() { return scrapePostersAndFanart(true); } @Override public Thumb[] scrapeFanart() { return scrapePostersAndFanart(false); } private Thumb[] scrapePostersAndFanart(boolean doCrop) { Element posterElement = document .select("a.bigImage img") .first(); Thumb[] posterThumbs = new Thumb[1]; if(posterElement != null) { String posterLink = posterElement.attr("src").trim(); try{ if (doCrop) //posterThumbs[0] = new Thumb(posterLink, 52.7, 0, 0, 0); posterThumbs[0] = new Thumb(posterLink, true); else posterThumbs[0] = new Thumb(posterLink); return posterThumbs; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return new Thumb[0]; } } else return new Thumb[0]; } @Override public MPAARating scrapeMPAA() { return MPAARating.RATING_XXX; } @Override public ID scrapeID() { Element idElement = document.select("div.container p:contains(ID:)").first(); if(idElement != null) { String idText = idElement.text().trim(); idText = idText.replaceFirst(Pattern.quote("ID: "), ""); return new ID(idText); } else return ID.BLANK_ID; } @Override public ArrayList<Genre> scrapeGenres() { Elements genreElements = document.select(".genre"); if(genreElements != null) { ArrayList<Genre> genreList = new ArrayList<>(genreElements.size()); for(Element currentGenre: genreElements) { genreList.add(new Genre(currentGenre.text().trim())); } return genreList; } return new ArrayList<>(); } @Override public ArrayList<Actor> scrapeActors() { Elements actorElements = document.select("div#avatar-waterfall a.avatar-box"); if(actorElements != null) { ArrayList<Actor> actorList = new ArrayList<>(actorElements.size()); for(Element currentActor : actorElements) { String actorName = currentActor.select("span").first().text().trim(); String actorThumbURL = currentActor.select("img").first().attr("src"); //we want the full resolution thumbnail, so replace the "medium" from the URL to get it //actorThumbURL = actorThumbURL.replaceFirst(Pattern.quote("/medium/"), "/"); try { //we can add the actor with their thumbnail so long as we aren't using a placeholder image if(!actorThumbURL.contains("nowprinting.gif")) { actorList.add(new Actor(actorName,"",new Thumb(actorThumbURL))); } else //otherwise add the actor without an image { actorList.add(new Actor(actorName,"",null)); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } return actorList; } return new ArrayList<>(); } @Override public ArrayList<Director> scrapeDirectors() { Element directorElement = document.select("div.row.movie p:contains(Director:)").first(); if(directorElement != null) { ArrayList<Director> directorList = new ArrayList<>(1); String directorNameText = directorElement.text().trim(); directorNameText = directorNameText.replaceFirst(Pattern.quote("Director: "), ""); directorList.add(new Director(directorNameText, null)); return directorList; } else return new ArrayList<>(); } @Override public Studio scrapeStudio() { Element studioElement = document.select("div.row.movie p:contains(Studio:) ~ p a").first(); if(studioElement != null) { String studioText = studioElement.text().trim(); studioText = studioText.replaceFirst(Pattern.quote("Studio: "), ""); return new Studio(studioText); } else return Studio.BLANK_STUDIO; } @Override public String createSearchString(File file) { scrapedMovieFile = file; String fileNameNoExtension = findIDTagFromFile(file, isFirstWordOfFileIsID()); //return fileNameNoExtension; URLCodec codec = new URLCodec(); try { String fileNameURLEncoded = codec.encode(fileNameNoExtension); String searchTerm = "http://www.javdog.com/" + siteLanguageToScrape + "/search/" + fileNameURLEncoded; return searchTerm; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } @Override public SearchResult[] getSearchResults(String searchString) throws IOException { LinkedList<SearchResult> linksList = new LinkedList<>(); try{ Document doc = Jsoup.connect(searchString).userAgent("Mozilla").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get(); { Elements divVideoLinksElements = doc.select("div.item:has(a[href*=/movie/])"); for(Element currentDivVideoLink : divVideoLinksElements) { Element videoLinksElements = currentDivVideoLink.select("a[href*=/movie/]").last(); String idFromSearchResult = currentDivVideoLink.select("span").first().text(); String currentLink = videoLinksElements.attr("href"); String currentLabel = idFromSearchResult + " " + videoLinksElements.text(); String currentThumb = currentDivVideoLink.select("img").first().attr("src"); if(currentLink.length() > 1) { SearchResult searchResult = new SearchResult(currentLink, currentLabel, new Thumb(currentThumb)); //maybe we can improve search accuracy by putting our suspected best match at the front of the array //we do this by examining the ID from the search result and seeing if it was in our initial search string if(searchString.contains(idFromSearchResult) || searchString.contains(idFromSearchResult.replaceAll(Pattern.quote("-"),""))) linksList.addFirst(searchResult); else linksList.addLast(searchResult); } } return linksList.toArray(new SearchResult[linksList.size()]); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return new SearchResult[0]; } } @Override public Thumb[] scrapeExtraFanart() { ArrayList<Thumb> imageList = new ArrayList<>(); Elements sampleBoxImageLinks = document.select("div.sample-box li a[href]"); if (sampleBoxImageLinks != null) { for(Element link: sampleBoxImageLinks) try { imageList.add(new Thumb(link.attr("href"))); } catch (MalformedURLException e) { e.printStackTrace(); } } return imageList.toArray(new Thumb[imageList.size()]); } @Override public String toString(){ return "JavZoo"; } @Override public SiteParsingProfile newInstance() { return new JavZooParsingProfile(); } @Override public String getParserName() { return "JavZoo"; } }