package moviescraper.doctord.controller.siteparsingprofile.specific; import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import moviescraper.doctord.controller.languagetranslation.Language; import moviescraper.doctord.controller.languagetranslation.TranslateString; import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile; import moviescraper.doctord.model.SearchResult; import moviescraper.doctord.model.dataitem.Actor; import moviescraper.doctord.model.dataitem.Director; import moviescraper.doctord.model.dataitem.Genre; import moviescraper.doctord.model.dataitem.ID; import moviescraper.doctord.model.dataitem.MPAARating; import moviescraper.doctord.model.dataitem.OriginalTitle; import moviescraper.doctord.model.dataitem.Outline; import moviescraper.doctord.model.dataitem.Plot; import moviescraper.doctord.model.dataitem.Rating; import moviescraper.doctord.model.dataitem.ReleaseDate; import moviescraper.doctord.model.dataitem.Set; import moviescraper.doctord.model.dataitem.SortTitle; import moviescraper.doctord.model.dataitem.Studio; import moviescraper.doctord.model.dataitem.Tagline; import moviescraper.doctord.model.dataitem.Thumb; import moviescraper.doctord.model.dataitem.Title; import moviescraper.doctord.model.dataitem.Top250; import moviescraper.doctord.model.dataitem.Trailer; import moviescraper.doctord.model.dataitem.Votes; import moviescraper.doctord.model.dataitem.Year; import moviescraper.doctord.model.preferences.MoviescraperPreferences; import org.apache.commons.codec.net.URLCodec; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.text.WordUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class DmmParsingProfile extends SiteParsingProfile implements SpecificProfile { final static double dmmMaxRating = 5.00; private boolean doGoogleTranslation; private boolean scrapeTrailers; @Override public List<ScraperGroupName> getScraperGroupNames() { if(groupNames == null) groupNames = Arrays.asList(ScraperGroupName.JAV_CENSORED_SCRAPER_GROUP); return groupNames; } public DmmParsingProfile() { super(); doGoogleTranslation = (scrapingLanguage == Language.ENGLISH); scrapeTrailers = true; } public DmmParsingProfile(Document document) { super(document); doGoogleTranslation = (scrapingLanguage == Language.ENGLISH); } /** * Default constructor does not define a document, so be careful not to call * scrape methods without initializing the document first some other way. * This constructor is mostly used for calling createSearchString() and * getSearchResults() */ public DmmParsingProfile(boolean doGoogleTranslation) { super(); this.doGoogleTranslation = doGoogleTranslation; if(this.doGoogleTranslation == false) setScrapingLanguage(Language.JAPANESE); scrapeTrailers = true; } public DmmParsingProfile(boolean doGoogleTranslation, boolean scrapeTrailers) { super(); this.doGoogleTranslation = doGoogleTranslation; if(this.doGoogleTranslation == false) setScrapingLanguage(Language.JAPANESE); this.scrapeTrailers = scrapeTrailers; } public DmmParsingProfile(Document document, boolean doGoogleTranslation) { super(document); this.doGoogleTranslation = doGoogleTranslation; if(this.doGoogleTranslation == false) setScrapingLanguage(Language.JAPANESE); } @Override public Title scrapeTitle() { Element titleElement = document.select("[property=og:title]").first(); // run a google translate on the japanese title if(doGoogleTranslation) { return new Title( TranslateString.translateStringJapaneseToEnglish(titleElement .attr("content").toString())); } else { return new Title(titleElement.attr("content").toString()); } } @Override public OriginalTitle scrapeOriginalTitle() { Element titleElement = document.select("[property=og:title]").first(); // leave the original title as the japanese title return new OriginalTitle(titleElement.attr("content").toString()); } @Override public SortTitle scrapeSortTitle() { // we don't need any special sort title - that's usually something the // user provides return SortTitle.BLANK_SORTTITLE; } @Override public Set scrapeSet() { Element setElement = document.select( "table.mg-b20 tr td a[href*=article=series/id=]").first(); if (setElement == null) return Set.BLANK_SET; else if (doGoogleTranslation) { return new Set( TranslateString.translateStringJapaneseToEnglish(setElement .text())); } else return new Set(setElement.text()); } @Override public Rating scrapeRating() { Element ratingElement = document.select(".d-review__average strong") .first(); if (ratingElement != null) return new Rating(dmmMaxRating, ratingElement.text().replace("点", "")); else return Rating.BLANK_RATING; } @Override public Year scrapeYear() { return scrapeReleaseDate().getYear(); } @Override public ReleaseDate scrapeReleaseDate(){ Element releaseDateElement = document .select("table.mg-b20 tr td:contains(貸出開始日:) + td, table.mg-b20 tr td:contains(発売日:) + td, table.mg-b20 tr td:contains(商品発売日:) + td") .first(); if(releaseDateElement != null) { String releaseDate = releaseDateElement.text(); //we want to convert something like 2015/04/25 to 2015-04-25 releaseDate = StringUtils.replace(releaseDate, "/", "-"); return new ReleaseDate(releaseDate); } return ReleaseDate.BLANK_RELEASEDATE; } @Override public Top250 scrapeTop250() { // This type of info doesn't exist on DMM return Top250.BLANK_TOP250; } @Override public Votes scrapeVotes() { Element votesElement = document.select(".d-review__evaluates strong") .first(); if (votesElement != null) return new Votes(votesElement.text()); else return Votes.BLANK_VOTES; } @Override public Outline scrapeOutline() { // TODO Auto-generated method stub return Outline.BLANK_OUTLINE; } @Override public Plot scrapePlot() { //dvd mode Element plotElement = document.select("p.mg-b20").first(); if(plotElement == null || document.baseUri().contains("/digital/video")) { //video rental mode if it didnt find a match using above method plotElement = document.select("tbody .mg-b20.lh4").first(); } if(doGoogleTranslation) { return new Plot( TranslateString.translateStringJapaneseToEnglish(plotElement .text())); } else return new Plot(plotElement.text()); } @Override public Tagline scrapeTagline() { return Tagline.BLANK_TAGLINE; } @Override public moviescraper.doctord.model.dataitem.Runtime scrapeRuntime() { String runtime = ""; Element runtimeElement = document.select( "table.mg-b20 tr td:contains(収録時間:) + td").first(); if (runtimeElement != null) { // get rid of japanese word for minutes and just get the number runtime = runtimeElement.text().replaceAll("分", ""); } return new moviescraper.doctord.model.dataitem.Runtime(runtime); } @Override public Trailer scrapeTrailer(){ try { //we can return no trailers if scraping trailers is not enabled or the page we are scraping does not have a button to link to the trailer Element buttonElement; if(scrapeTrailers && (buttonElement = document.select("a.d-btn[onclick*=sampleplay]").first()) != null){ System.out.println("There should be a trailer, searching now..."); // First, scrape the contents of the 'play trailer' button action. It's a small ajax document containing // an iframe that hosts the flash video player. Then scrape that iframe contents obtaining trailer information. String playerPath = buttonElement.attr("onclick").replaceFirst("^.*sampleplay\\('([^']+).*$", "$1"); playerPath = StringEscapeUtils.unescapeJava(playerPath); URL playerURL = new URI(document.location()).resolve(playerPath).toURL(); Document playerDocument = Jsoup.parse(playerURL, CONNECTION_TIMEOUT_VALUE); URL iframeURL = new URL(playerDocument.select("iframe").first().attr("abs:src")); Document iframeDocument = Jsoup.parse(iframeURL, CONNECTION_TIMEOUT_VALUE); String flashPlayerScript = iframeDocument.select("script").last().data(); Pattern pattern = Pattern.compile(".*flashvars.fid\\s*=\\s*\"([^\"]+).*flashvars.bid\\s*=\\s*\"(\\d)(w|s)\".*", Pattern.DOTALL); Matcher matcher = pattern.matcher(flashPlayerScript); if (matcher.matches()){ String cid = matcher.group(1); int bitrates = Integer.parseInt(matcher.group(2)); String ratio = matcher.group(3); String quality = (bitrates & 0b100) != 0 ? "dmb" : (bitrates & 0b010) != 0 ? "dm" : "sm"; String firstLetterOfCid = cid.substring(0,1); String threeLetterCidCode = cid.substring(0,3); String potentialTrailerURL = String.format("http://cc3001.dmm.co.jp/litevideo/freepv/%1$s/%2$s/%3$s/%3$s_%4$s_%5$s.mp4", firstLetterOfCid, threeLetterCidCode, cid, quality, ratio); if(SiteParsingProfile.fileExistsAtURL(potentialTrailerURL)) { System.out.println("Trailer existed at: " + potentialTrailerURL); return new Trailer(potentialTrailerURL); } } System.err.println("I expected to find a trailer and did not at " + document.location()); } } catch (Exception e) { e.printStackTrace(); } return Trailer.BLANK_TRAILER; } @Override public Thumb[] scrapePosters() { //don't crop the cover for videoc elements as it is a website release and does not have dvd art if(document.baseUri().contains("/digital/videoc")) return scrapePostersAndFanart(false, false); else return scrapePostersAndFanart(true, false); } /** * Helper method for scrapePoster() and scapeFanart since this code is * virtually identical * * @param doCrop * - if true, will only get the front cover as the initial poster * element; otherwise it uses the entire dvd case from DMM.co.jp * @return Thumb[] containing all the scraped poster and extraart (if doCrop * is true) or the cover and back in extraart (if doCrop is false) */ private Thumb[] scrapePostersAndFanart(boolean doCrop, boolean scrapingExtraFanart) { // the movie poster, on this site it usually has both front and back // cover joined in one image Element postersElement = document.select( "a[href^=http://pics.dmm.co.jp][name=package-image], div#sample-video img[src*=/pics.dmm.co.jp]").first(); // the extra screenshots for this movie. It's just the thumbnail as the // actual url requires javascript to find. // We can do some string manipulation on the thumbnail URL to get the // full URL, however Elements extraArtElementsSmallSize = document.select("div#sample-image-block img.mg-b6"); ArrayList<Thumb> posters = new ArrayList<>( 1 + extraArtElementsSmallSize.size()); String posterLink = postersElement.attr("abs:href"); if(posterLink == null || posterLink.length() < 1) posterLink = postersElement.attr("abs:src"); try { // for the poster, do a crop of the the right side of the dvd case image //(which includes both cover art and back art) // so we only get the cover if (doCrop && !scrapingExtraFanart) //use javCropCoverRoutine version of the new Thumb constructor to handle the cropping posters.add(new Thumb(posterLink, true)); else if (!scrapingExtraFanart) posters.add(new Thumb(posterLink)); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if(scrapingExtraFanart) { // maybe you're someone who doesn't want the movie poster as the cover. // Include the extra art in case // you want to use one of those for (Element item : extraArtElementsSmallSize) { // We need to do some string manipulation and put a "jp" before the // last dash in the URL to get the full size picture String extraArtLinkSmall = item.attr("abs:src"); int indexOfLastDash = extraArtLinkSmall.lastIndexOf('-'); String URLpath = extraArtLinkSmall.substring(0, indexOfLastDash) + "jp" + extraArtLinkSmall.substring(indexOfLastDash); try { if (Thumb.fileExistsAtUrl(URLpath)) posters.add(new Thumb(URLpath)); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } return posters.toArray(new Thumb[0]); } @Override public Thumb[] scrapeFanart() { return scrapePostersAndFanart(false, false); } @Override public MPAARating scrapeMPAA() { return MPAARating.RATING_XXX; } @Override public ID scrapeID() { Element idElement = document.select("td:containsOwn(品番:) ~ td").first(); if(idElement != null) { String idElementText = idElement.text(); idElementText = fixUpIDFormatting(idElementText); return new ID(idElementText); } //This page didn't have an ID, so just put in a empty one else return ID.BLANK_ID; } public static String fixUpIDFormatting(String idElementText){ //DMM sometimes has a letter and underscore then followed by numbers. numbers will be stripped in the next step, so let's strip out the underscore prefix part of the string if(idElementText.contains("_")) { idElementText = idElementText.substring(idElementText.indexOf('_')+1); } //DMM sometimes includes numbers before the ID, so we're going to strip them out to use //the same convention that other sites use for the id number idElementText = idElementText.substring(StringUtils.indexOfAnyBut(idElementText,"0123456789")); //Dmm has everything in lowercase for this field; most sites use uppercase letters as that follows what shows on the cover so will uppercase the string //English locale used for uppercasing just in case user is in some region that messes with the logic of this code... idElementText = idElementText.toUpperCase(Locale.ENGLISH); //insert the dash between the text and number part int firstNumberIndex = StringUtils.indexOfAny(idElementText, "0123456789"); idElementText = idElementText.substring(0,firstNumberIndex) + "-" + idElementText.substring(firstNumberIndex); //remove extra zeros in case we get a 5 or 6 digit numerical part //(For example ABC-00123 will become ABC-123) Pattern patternID = Pattern.compile("([0-9]*\\D+)(\\d{5,6})"); Matcher matcher = patternID.matcher(idElementText); String groupOne = ""; String groupTwo = ""; while (matcher.find()) { groupOne = matcher.group(1); groupTwo = matcher.group(2); } if(groupOne.length() > 0 && groupTwo.length() > 0) { groupTwo = String.format("%03d", Integer.parseInt(groupTwo)); return groupOne + groupTwo; } return idElementText; } @Override public ArrayList<Genre> scrapeGenres() { Elements genreElements = document .select("table.mg-b12 tr td a[href*=article=keyword/id=]"); ArrayList<Genre> genres = new ArrayList<>(genreElements.size()); for (Element genreElement : genreElements) { // get the link so we can examine the id and do some sanity cleanup // and perhaps some better translation that what google has, if we // happen to know better String href = genreElement.attr("abs:href"); String genreID = genreElement.attr("abs:href").substring( href.indexOf("id=") + 3, href.length() - 1); if (acceptGenreID(genreID)) { if(doGoogleTranslation == false) { genres.add(new Genre(genreElement.text())); } else { String potentialBetterTranslation = betterGenreTranslation( genreElement.text(), genreID); // we didn't know of anything hand picked for genres, just use // google translate if (potentialBetterTranslation.equals("")) { genres.add(new Genre(TranslateString .translateStringJapaneseToEnglish(genreElement .text()))); } // Cool, we got something we want to use instead for our genre, // let's use that else { genres.add(new Genre(potentialBetterTranslation)); } } } } // System.out.println("genres" + genreElements); return genres; } private String betterGenreTranslation(String text, String genreID) { String betterGenreTranslatedString = ""; switch (genreID) { case "5001": betterGenreTranslatedString = "Creampie"; break; case "5002": betterGenreTranslatedString = "Fellatio"; break; case "1013": betterGenreTranslatedString = "Nurse"; break; default: break; } return betterGenreTranslatedString; } private String betterActressTranslation(String text, String actressID) { String betterActressTranslatedString = ""; switch (actressID) { case "17802": betterActressTranslatedString = "Tsubomi"; break; case "27815": betterActressTranslatedString = "Sakura Aida"; break; case "1014395": betterActressTranslatedString = "Yuria Ashina"; break; case "1001819": betterActressTranslatedString = "Emiri Himeno"; break; case "1006261": betterActressTranslatedString = "Uta Kohaku"; break; case "101792": betterActressTranslatedString = "Nico Nohara"; break; case "1015472": betterActressTranslatedString = "Tia"; break; case "1016186": betterActressTranslatedString = "Yuko Shiraki"; break; case "1009910": betterActressTranslatedString = "Hana Nonoka"; break; case "1016458": betterActressTranslatedString = "Eve Hoshino"; break; case "1019676": betterActressTranslatedString = "Rie Tachikawa"; break; case "1017201": betterActressTranslatedString = "Meisa Chibana"; break; case "1018387": betterActressTranslatedString = "Nami Itoshino"; break; case "1014108": betterActressTranslatedString = "Juria Tachibana"; break; case "1016575": betterActressTranslatedString = "Chika Kitano"; break; case "24489": betterActressTranslatedString = "Chichi Asada"; break; case "20631": betterActressTranslatedString = "Mitsuki An"; break; default: break; } return betterActressTranslatedString; } // Return false on any genres we don't want scraping in. This can later be // something the user configures, but for now I'll use it // to get rid of weird stuff like DVD toaster // the genreID comes from the href to the genre keyword from DMM // Example: <a href="/mono/dvd/-/list/=/article=keyword/id=6004/"> // The genre ID would be 6004 which is passed in as the String private boolean acceptGenreID(String genreID) { switch (genreID) { case "6529": // "DVD Toaster" WTF is this? Nuke it! return false; case "6102": // "Sample Video" This is not a genre! return false; default: break; } return true; } @Override public ArrayList<Actor> scrapeActors() { // scrape all the actress IDs Elements actressIDElements = document .select("span#performer a[href*=article=actress/id=]"); ArrayList<Actor> actorList = new ArrayList<>( actressIDElements.size()); for (Element actressIDLink : actressIDElements) { String actressIDHref = actressIDLink.attr("abs:href"); String actressNameKanji = actressIDLink.text(); String actressID = actressIDHref.substring( actressIDHref.indexOf("id=") + 3, actressIDHref.length() - 1); String actressPageURL = "http://actress.dmm.co.jp/-/detail/=/actress_id=" + actressID + "/"; try { Document actressPage = Jsoup.connect(actressPageURL).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE) .get(); Element actressNameElement = actressPage.select("td.t1 h1") .first(); Element actressThumbnailElement = actressPage.select( "tr.area-av30.top td img").first(); String actressThumbnailPath = actressThumbnailElement.attr("abs:src"); //Sometimes the translation service from google gives us weird engrish instead of a name, so let's compare it to the thumbnail file name for the image as a sanity check //if the names aren't close enough, we'll use the thumbnail name //many times the thumbnail name is off by a letter or two or has a number in it, which is why we just don't use this all the time... String actressNameFromThumbnailPath = actressThumbnailPath.substring(actressThumbnailPath.lastIndexOf('/')+1, actressThumbnailPath.lastIndexOf('.')); //To do a proper comparison using Lev distance, let's fix case, make first name appear first get rid of numbers actressNameFromThumbnailPath = actressNameFromThumbnailPath.replaceAll("[0-9]", ""); actressNameFromThumbnailPath = actressNameFromThumbnailPath.replaceAll("_"," "); actressNameFromThumbnailPath = WordUtils.capitalize(actressNameFromThumbnailPath); actressNameFromThumbnailPath = StringUtils.reverseDelimited(actressNameFromThumbnailPath, ' '); // The actor's name is easier to google translate if we get the // hiragana form of it. // The hiragana form of it is between a '(' and a ')' (These are // not parens but some japanese version of parens) String actressNameHiragana = actressNameElement.text() .substring(actressNameElement.text().indexOf('(') + 1, actressNameElement.text().indexOf(')')); // maybe we know in advance the translation system will be junk, // so we check our manual override of people we know it will get // the name wrong on String actressNameEnglish = betterActressTranslation( actressNameHiragana, actressID); boolean didWeManuallyOverrideActress = false; if (actressNameEnglish.equals("") && doGoogleTranslation) { actressNameEnglish = TranslateString .translateJapanesePersonNameToRomaji(actressNameHiragana); } else didWeManuallyOverrideActress = true; //use the difference between the two strings to determine which is the better one. The google translate shouldn't be that many characters away from the thumbnail name, or it's garbage //unless the thumbnail name was the generic "Nowprinting" one, in which case use the google translate if(!actressNameFromThumbnailPath.equals("Nowprinting")) { int LevenshteinDistance = StringUtils.getLevenshteinDistance(actressNameEnglish, actressNameFromThumbnailPath); if(LevenshteinDistance > 3 && !didWeManuallyOverrideActress) { //System.out.println("(We found a junk result from google translate, swapping over to cleaned up thumbnail name"); //System.out.println("Google translate's version of our name: " + actressNameEnglish + " Thumbnail name of person: " + actressNameFromThumbnailPath + " Lev Distance: " + LevenshteinDistance + ")"); actressNameEnglish = actressNameFromThumbnailPath; } } //Sometimes DMM lists a fake under the Name "Main". It's weird and it's not a real person, so just ignore it. if (!actressNameEnglish.equals("Main")) { if(doGoogleTranslation) { if(!actressThumbnailPath.contains("nowprinting.gif")) { actorList.add(new Actor(actressNameEnglish, "", new Thumb( actressThumbnailPath))); } else { actorList.add(new Actor(actressNameEnglish,"",null)); } } else { if(!actressThumbnailPath.contains("nowprinting.gif")) { actorList.add(new Actor(actressNameKanji,"",new Thumb(actressThumbnailPath))); } else { actorList.add(new Actor(actressNameKanji,"",null)); } } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //Get actors that are just a "Name" and have no page of their own (common on some web releases) Elements nameOnlyActors = document.select("table.mg-b20 tr td:contains(�??�?:) + td"); for(Element currentNameOnlyActor : nameOnlyActors) { String actorName = currentNameOnlyActor.text().trim(); //for some reason, they sometimes list the age of the person after their name, so let's get rid of that actorName = actorName.replaceFirst("\\([0-9]{2}\\)",""); if(doGoogleTranslation) actorName = TranslateString.translateJapanesePersonNameToRomaji(actorName); actorList.add(new Actor(actorName, "", null)); } return actorList; } @Override public ArrayList<Director> scrapeDirectors() { ArrayList<Director> directors = new ArrayList<>(); Element directorElement = document.select( "table.mg-b20 tr td a[href*=article=director/id=]").first(); if (directorElement != null && directorElement.hasText()) { if(doGoogleTranslation) directors.add(new Director(TranslateString .translateStringJapaneseToEnglish(directorElement.text()), null)); else directors.add(new Director(directorElement.text(),null)); } return directors; } @Override public Studio scrapeStudio() { Element studioElement = document.select( "table.mg-b20 tr td a[href*=article=label/id=]").first(); if (studioElement != null) { if(doGoogleTranslation) return new Studio( TranslateString.translateStringJapaneseToEnglish(studioElement .text())); else return new Studio(studioElement.text()); } else return Studio.BLANK_STUDIO; } @Override public String createSearchString(File file) { scrapedMovieFile = file; String fileNameNoExtension = findIDTagFromFile(file, isFirstWordOfFileIsID()); //System.out.println("fileNameNoExtension in DMM: " + fileNameNoExtension); URLCodec codec = new URLCodec(); try { String fileNameURLEncoded = codec.encode(fileNameNoExtension); //System.out.println("FileNameUrlencode = " + fileNameURLEncoded); return "http://www.dmm.co.jp/search/=/searchstr=" + fileNameURLEncoded + "/"; } catch (Exception e) { e.printStackTrace(); } return null; } /** * returns a String[] filled in with urls of each of the possible movies * found on the page returned from createSearchString * * @throws IOException */ @Override public SearchResult[] getSearchResults(String searchString) throws IOException { boolean firstPageScraping = true; Document searchResultsPage = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get(); Element nextPageLink = searchResultsPage.select("div.list-capt div.list-boxcaptside.list-boxpagenation ul li:not(.terminal) a").last(); ArrayList<SearchResult> searchResults = new ArrayList<>(); ArrayList<String> pagesVisited = new ArrayList<>(); while(firstPageScraping || nextPageLink != null) { nextPageLink = searchResultsPage.select("div.list-capt div.list-boxcaptside.list-boxpagenation ul li:not(.terminal) a").last(); String currentPageURL = searchResultsPage.baseUri(); String nextPageURL = ""; if(nextPageLink != null) nextPageURL = nextPageLink.attr("abs:href"); pagesVisited.add(currentPageURL); //I can probably combine this into one selector, but it wasn't working when I tried it, //so for now I'm making each its own variable and looping through and adding in all the elements seperately Elements dvdLinks = searchResultsPage .select("p.tmb a[href*=/mono/dvd/"); Elements rentalElements = searchResultsPage .select("p.tmb a[href*=/rental/ppr/"); Elements digitalElements = searchResultsPage .select("p.tmb a[href*=/digital/videoa/], p.tmb a[href*=/digital/videoc/]"); //get /mono/dvd links for (int i = 0; i < dvdLinks.size(); i++) { String currentLink = dvdLinks.get(i).attr("abs:href"); Element imageLinkElement = dvdLinks.get(i).select("img").first(); if(imageLinkElement != null) { Thumb currentPosterThumbnail = new Thumb(imageLinkElement.attr("abs:src")); searchResults.add(new SearchResult(currentLink, "", currentPosterThumbnail)); } else { searchResults.add(new SearchResult(currentLink)); } } //get /rental/ppr links for (int i = 0; i < rentalElements.size(); i++) { String currentLink = rentalElements.get(i).attr("abs:href"); Element imageLinkElement = rentalElements.get(i).select("img").first(); if(imageLinkElement != null) { Thumb currentPosterThumbnail = new Thumb(imageLinkElement.attr("abs:src")); searchResults.add(new SearchResult(currentLink, "", currentPosterThumbnail)); } else { searchResults.add(new SearchResult(currentLink)); } } //get /digital/videoa links for (int i = 0; i < digitalElements.size(); i++) { String currentLink = digitalElements.get(i).attr("abs:href"); System.out.println("currentLink = " + currentLink); Element imageLinkElement = digitalElements.get(i).select("img").first(); if(imageLinkElement != null) { Thumb currentPosterThumbnail = new Thumb(imageLinkElement.attr("abs:src")); searchResults.add(new SearchResult(currentLink, "", currentPosterThumbnail)); } else { searchResults.add(new SearchResult(currentLink)); } } firstPageScraping = false; //get the next page of search results (if it exists) using the "next page" link, but only if we haven't visited that page before //TODO this is really not the cleanest way of doing this - I can probably find some way to make the selector not send me in a loop //of pages, but this will work for now if(nextPageLink != null && !pagesVisited.contains(nextPageURL)) searchResultsPage = Jsoup.connect(nextPageURL).get(); else break; } return searchResults.toArray(new SearchResult[searchResults.size()]); } public SearchResult[] getSearchResultsWithoutDVDLinks(String dmmSearchString) throws IOException { SearchResult[] allSearchResult = getSearchResults(dmmSearchString); List<SearchResult> filteredSearchResults = new LinkedList<>(); for(SearchResult currentSR : allSearchResult) { System.out.println("current SR = " + currentSR.getUrlPath()); if(!currentSR.getUrlPath().contains("/mono/dvd/")) filteredSearchResults.add(currentSR); } return filteredSearchResults.toArray(new SearchResult[filteredSearchResults.size()]); } @Override public Thumb[] scrapeExtraFanart() { if(super.isExtraFanartScrapingEnabled()) return scrapePostersAndFanart(false, true); else return new Thumb[0]; } @Override public String toString(){ return "DMM.co.jp"; } @Override public SiteParsingProfile newInstance() { MoviescraperPreferences preferences = MoviescraperPreferences.getInstance(); return new DmmParsingProfile(!preferences.getScrapeInJapanese()); } @Override public String getParserName() { return "DMM.co.jp"; } }