/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package org.easyrec.utils; import org.easyrec.exception.core.ClusterException; import org.easyrec.model.core.web.Item; import org.easyrec.model.core.TenantVO; import org.easyrec.model.core.web.Operator; import org.easyrec.model.core.web.RemoteTenant; import org.easyrec.model.core.web.Session; import org.easyrec.service.core.ClusterService; import org.easyrec.service.core.TenantService; import org.easyrec.service.domain.TypeMappingService; import org.easyrec.service.web.NamedConfigurationService; import org.easyrec.service.web.nodomain.ShopRecommenderService; import org.easyrec.store.dao.IDMappingDAO; import org.easyrec.store.dao.core.types.ItemTypeDAO; import org.easyrec.store.dao.web.OperatorDAO; import org.easyrec.store.dao.web.RemoteTenantDAO; import org.easyrec.utils.spring.cli.AbstractDependencyInjectionSpringCLI; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.net.URI; import java.net.URLDecoder; import java.util.*; import java.util.logging.Logger; import org.easyrec.plugin.container.PluginRegistry; import org.easyrec.plugin.model.Version; /** * DOCUMENT ME! * * @author pmarschik */ @SuppressWarnings({"UnusedDeclaration"}) public class Movielens100kImporter extends AbstractDependencyInjectionSpringCLI { private static final Logger logger = Logger.getLogger(Movielens1MImporter.class.getName()); private static final double RATING_COUNT = 100000; private static final double MOVIE_COUNT = 1682; private static final double USER_COUNT = 943; private OperatorDAO operatorDAO; private RemoteTenantDAO remoteTenantDAO; private ShopRecommenderService shopRecommenderService; private TenantService tenantService; private ClusterService clusterService; private TypeMappingService typeMappingService; private IDMappingDAO idMappingDAO; private ItemTypeDAO itemTypeDAO; private NamedConfigurationService namedConfigurationService; private PluginRegistry pluginRegistry; private HashMap<Integer, String> clusters; public static void main(String[] args) { Movielens100kImporter importer = new Movielens100kImporter(); importer.processCommandLineCall(args); System.exit(0); } public void setNamedConfigurationService(NamedConfigurationService namedConfigurationService) { this.namedConfigurationService = namedConfigurationService; } public void setItemTypeDAO(ItemTypeDAO itemTypeDAO) { this.itemTypeDAO = itemTypeDAO; } public void setOperatorDAO(OperatorDAO operatorDAO) { this.operatorDAO = operatorDAO; } public void setRemoteTenantDAO(RemoteTenantDAO remoteTenantDAO) { this.remoteTenantDAO = remoteTenantDAO; } public void setShopRecommenderService(ShopRecommenderService shopRecommenderService) { this.shopRecommenderService = shopRecommenderService; } public void setTenantService(TenantService tenantService) { this.tenantService = tenantService; } public ClusterService getClusterService() { return clusterService; } public void setClusterService(ClusterService clusterService) { this.clusterService = clusterService; } public TypeMappingService getTypeMappingService() { return typeMappingService; } public void setTypeMappingService(TypeMappingService typeMappingService) { this.typeMappingService = typeMappingService; } public IDMappingDAO getidMappingDAO() { return idMappingDAO; } public void setidMappingDAO(IDMappingDAO idMappingDAO) { this.idMappingDAO = idMappingDAO; } public IDMappingDAO getIdMappingDAO() { return idMappingDAO; } public void setIdMappingDAO(IDMappingDAO idMappingDAO) { this.idMappingDAO = idMappingDAO; } public PluginRegistry getPluginRegistry() { return pluginRegistry; } public void setPluginRegistry(PluginRegistry pluginRegistry) { this.pluginRegistry = pluginRegistry; } @Override protected String[] getConfigLocations() { return new String[]{"spring/web/importer/movielens/AllInOne_Movielens100k.xml"}; } @Override protected int processCommandLineCall(String[] args) { try { processCommandLineCallEx(args); return 0; } catch (Exception e) { throw new RuntimeException(e); } } public void initClusters() { clusters = new HashMap<Integer, String>(); clusters.put(0, "Unknown"); clusters.put(1, "Action"); clusters.put(2, "Adventure"); clusters.put(3, "Animation"); clusters.put(4, "Children"); clusters.put(5, "Comedy"); clusters.put(6, "Crime"); clusters.put(7, "Documentary"); clusters.put(8, "Drama"); clusters.put(9, "Fantasy"); clusters.put(10, "Filmnoir"); clusters.put(11, "Horror"); clusters.put(12, "Musical"); clusters.put(13, "Mystery"); clusters.put(14, "Romance"); clusters.put(15, "Scifi"); clusters.put(16, "Thriller"); clusters.put(17, "War"); clusters.put(18, "Western"); } @Override protected void usage() { System.out.println("Usage: java -...ImporterCLI <path_to_movielens_100k> <useClusters>"); } private void parseData(Operator operator, File ratingsFile, Map<Integer, Movie> movies, String tenantName, Session session, boolean useClusters, boolean useItemTypes) throws FileNotFoundException, NumberFormatException { if (remoteTenantDAO.exists(tenantName)) { System.out.println("Tenant " + tenantName + " already exists. SKIPPING import."); return; } String tenantDescription = "MovieLens data sets were collected by the GroupLens Research Project\n" + "at the University of Minnesota.\n" + "\n" + "This data set consists of:\n" + "\t* 100,000 ratings (1-5) from 943 users on 1682 movies.\n" + "\t* Each user has rated at least 20 movies."; TenantVO tenant = new TenantVO(tenantName, tenantDescription, 1, 5, 2.5); tenantService.insertTenantWithTypes(tenant, null); System.out.println("\nTenant got id: " + tenant.getId()); remoteTenantDAO.update(operator.getOperatorId(), tenant.getId(), "", tenantDescription); tenantService.updateConfigProperty(tenant.getId(), RemoteTenant.AUTO_ARCHIVER_ENABLED, "false"); tenantService.updateConfigProperty(tenant.getId(), RemoteTenant.AUTO_ARCHIVER_TIME_RANGE, RemoteTenant.AUTO_ARCHIVER_DEFAULT_TIME_RANGE); // enable backtracking by default tenantService.updateConfigProperty(tenant.getId(), RemoteTenant.BACKTRACKING, "true"); // enable auto rule mining by default tenantService.updateConfigProperty(tenant.getId(), RemoteTenant.SCHEDULER_ENABLED, "false"); tenantService.updateConfigProperty(tenant.getId(), RemoteTenant.SCHEDULER_EXECUTION_TIME, RemoteTenant.SCHEDULER_DEFAULT_EXECUTION_TIME); namedConfigurationService.setupDefaultConfiguration(tenant.getId()); RemoteTenant remoteTenant = remoteTenantDAO.get(tenant.getId()); Scanner ratings = new Scanner(ratingsFile); ratings.useDelimiter("\\t|(\\r)?\\n"); System.out.println("\nLoading ratings ..."); int line = 0; int lastPerc = 0; if (useItemTypes) { for (String itemTypeName : clusters.values()) { itemTypeDAO.insertOrUpdate(tenant.getId(), "GENRE_" + itemTypeName.toUpperCase(), true); } } do { line++; double percentage = (line * 100.0) / RATING_COUNT; if (((Math.floor(percentage) % 10) == 0) && ((int) percentage != lastPerc)) { lastPerc = (int) percentage; System.out.print(lastPerc + "% "); } int userId = ratings.nextInt(); int movieId = ratings.nextInt(); int rating = ratings.nextInt(); String timestampStr = ratings.next(); Date timestamp = new Date(Long.parseLong(timestampStr)); // Date timestamp = new Date(); Movie movie = movies.get(movieId); String itemType = Item.DEFAULT_STRING_ITEM_TYPE; if (useItemTypes) { int genreId = movie.getGenres().nextSetBit(0); if (genreId >= 0 && clusters.containsKey(genreId)) { String clusterName = clusters.get(genreId); if (clusterName != null) itemType = "GENRE_" + clusterName.toUpperCase(); } } shopRecommenderService.rateItem(remoteTenant, "" + userId, "" + movieId, itemType, movie.getName() + " Genres: " + movie.getGenres(), movie.getImdbUrl(), movie.getGeneratedImageUrl(), rating, timestamp, session); // always also view with type item shopRecommenderService.viewItem(remoteTenant, "" + userId, "" + movieId, Item.DEFAULT_STRING_ITEM_TYPE, movie.getName() + " Genres: " + movie.getGenres(), movie.getImdbUrl(), movie.getGeneratedImageUrl(), timestamp, session); // use generic sendAction method for view actions for testing // shopRecommenderService.sendAction(remoteTenant, "" + userId, "" + movieId, Item.DEFAULT_STRING_ITEM_TYPE, // movie.getName() + " Genres: " + movie.getGenres(), movie.getImdbUrl(), movie.getGeneratedImageUrl(), // "VIEW", rating, timestamp, session); } while (ratings.hasNextInt()); if (useClusters) { System.out.println("Creating Clusters for tenant!\n"); for (String clusterName : clusters.values()) { try { clusterService.addCluster(remoteTenant.getId(), clusterName, "The Genre " + clusterName, clusterService.getClustersForTenant(remoteTenant.getId()).getRoot().getName()); } catch (ClusterException ce) { System.out .println("An error occured creating the clusters for tenant " + remoteTenant.getStringId() + ": " + ce.getMessage()); } } System.out.println("Done!"); System.out.println("Adding movies to clusters:\n"); for (Movie movie : movies.values()) { for (int i = movie.getGenres().nextSetBit(0); i >= 0; i = movie.getGenres().nextSetBit(i + 1)) { // operate on index i here try { if ((i < 0) || (i > 18)) { System.out.println("Unknown Genre: " + i + " " + movie.getName()); } else { clusterService.addItemToCluster(remoteTenant.getId(), clusters.get(i), idMappingDAO.lookup(Integer.toString(movie.getId())), typeMappingService.getIdOfItemType(remoteTenant.getId(), Item.DEFAULT_STRING_ITEM_TYPE)); } } catch (ClusterException ce) { System.out .println("An error occured adding item " + movie.getName() + " to cluster " + clusters.get(i) + ": " + ce.getMessage()); } } } System.out.println("Done!"); } } private Map<Integer, Movie> parseMovies(File moviesFile) throws FileNotFoundException { Map<Integer, Movie> movies = new TreeMap<Integer, Movie>(); FileInputStream fsi = new FileInputStream(moviesFile); Scanner movieScanner = new Scanner(fsi, "UTF-8"); movieScanner.useDelimiter("\\||\\r?\\n"); System.out.println("Loading movies ..."); int line = 0; int lastPerc = 0; do { line++; double percentage = (line * 100.0) / MOVIE_COUNT; if (((Math.floor(percentage) % 10) == 0) && ((int) percentage != lastPerc)) { lastPerc = (int) percentage; System.out.print(lastPerc + "% "); } int id = movieScanner.nextInt(); String name = movieScanner.next(); String releaseDate = movieScanner.next(); String videoReleaseDate = movieScanner.next(); String imdbUrl = movieScanner.next(); try { imdbUrl = URLDecoder.decode(imdbUrl, "UTF-8").replaceAll(" ", "+"); } catch (Exception ignored) { } BitSet genres = new BitSet(); int idx = 0; genres.set(idx++, movieScanner.nextInt() == 1); // unknown genres.set(idx++, movieScanner.nextInt() == 1); // action genres.set(idx++, movieScanner.nextInt() == 1); // adventure genres.set(idx++, movieScanner.nextInt() == 1); // animation genres.set(idx++, movieScanner.nextInt() == 1); // childrens genres.set(idx++, movieScanner.nextInt() == 1); // comedy genres.set(idx++, movieScanner.nextInt() == 1); // crime genres.set(idx++, movieScanner.nextInt() == 1); // documentary genres.set(idx++, movieScanner.nextInt() == 1); // drama genres.set(idx++, movieScanner.nextInt() == 1); // fantasy genres.set(idx++, movieScanner.nextInt() == 1); // filmnoir genres.set(idx++, movieScanner.nextInt() == 1); // horror genres.set(idx++, movieScanner.nextInt() == 1); // musical genres.set(idx++, movieScanner.nextInt() == 1); // mystery genres.set(idx++, movieScanner.nextInt() == 1); // romance genres.set(idx++, movieScanner.nextInt() == 1); // scifi genres.set(idx++, movieScanner.nextInt() == 1); // thriller genres.set(idx++, movieScanner.nextInt() == 1); // war genres.set(idx, movieScanner.nextInt() == 1); // western String imageUrl = ""; /* try { // TODO need better name handling, e.g. remove all braces and text inside braces, // mov "The" to the front etc. // strip the year String queryName = name.substring(0, name.length() - 7); if (queryName.endsWith(", The")) queryName = "The " + queryName.substring(0, queryName.length() - 6); //queryName = URLEncoder.encode(queryName, "UTF-8"); queryName = queryName.replaceAll(" ", "%20"); int queryYear = Integer.parseInt(releaseDate.substring(7)); // built using http://www.freebase.com/queryeditor String freebaseQuery = "http://www.freebase" + ".com/api/service/mqlread?query={%20%22query%22%3A%20%5B{%20%22%2Fcommon%2Ftopic%2Fimage%22" + "%3A%20{%20%22id%22%3A%20null%2C%20%22limit%22%3A%201%2C%20%22optional%22%3A%20true%20}%2C%" + "20%22FBID96%3Ainitial_release_date%22%3A%20%5B{%20%22type%22%3A%20%22%2Ftype%2Fdatetime%22" + "%2C%20%22value%3C%22%3A%20%22" + (queryYear + 1) + "%22%2C%20%22value%3E%3D%22%3A%20%22" + queryYear + "%22%20}%5D%2C%20%22id%22%3A%20null%2C%20%22limit%22%3A%201%2C%20%22name%22%3A" + "%20null%2C%20%22q0%3Aname~%3D%22%3A%20%22*" + queryName + "*%22%2C%20%22s0%3Atype%22%3A%20%5B{" + "%20%22id%22%3A%20%22%2Ffilm%2Ffilm%22%2C%20%22link%22%3A%20%5B{%20%22timestamp%22%3A%20%5B" + "{%20%22optional%22%3A%20true%2C%20%22type%22%3A%20%22%2Ftype%2Fdatetime%22%2C%20%22value%2" + "2%3A%20null%20}%5D%2C%20%22type%22%3A%20%22%2Ftype%2Flink%22%20}%5D%2C%20%22type%22%3A%20%" + "22%2Ftype%2Ftype%22%20}%5D%2C%20%22type%22%3A%20%22%2Ffilm%2Ffilm%22%20}%5D%20}"; URL url = new URL(freebaseQuery); URLConnection connection = url.openConnection(); InputStream textInputStream = connection.getInputStream(); StringBuilder content = new StringBuilder(); int curChar = -1; while ((curChar = textInputStream.read()) != -1) content.append((char) curChar); JSONObject response = new JSONObject(content.toString()); JSONArray resultArray = response.getJSONArray("result"); if (resultArray.length() > 0) { String imageId = resultArray.getJSONObject(0).getJSONObject( "/common/topic/image").getString("id"); imageUrl = "http://img.freebase.com/api/trans/image_thumb" + imageId + "?maxwidth=1024"; } } catch (Exception ignored) { System.out.println(ignored); } */ movies.put(id, new Movie(id, name, releaseDate, videoReleaseDate, imdbUrl, genres, imageUrl)); } while (movieScanner.hasNextInt()); movieScanner.close(); return movies; } private void processCommandLineCallEx(String[] args) throws Exception { String pathToDataset = "C://projects//easyrec//movielens//small//ml-data"; //"C:\\DATA\\datasets\\ml100k"; boolean useClusters = true; boolean useItemTypes = true; if (args.length != 1) { if (!new File(pathToDataset).exists()) usage(); } else pathToDataset = args[0]; File datasetFile = new File(pathToDataset); File moviesFile = new File(datasetFile.getAbsolutePath() + File.separator + "u.item"); File ratingsFile = new File(datasetFile.getAbsolutePath() + File.separator + "u.data"); if (!datasetFile.exists()) { System.err.println("Path \"" + pathToDataset + "\" doesn't exist."); return; } if (!moviesFile.exists() || !ratingsFile.exists()) { logger.info(moviesFile.toString()); logger.info(ratingsFile.toString()); System.err.println("movies.dat or ratings.dat not found."); return; } pluginRegistry.installPlugin(URI.create("http://www.easyrec.org/plugins/ARM"), new Version("0.98")); Map<Integer, Movie> movies = parseMovies(moviesFile); //noinspection ConstantConditions if (useClusters || useItemTypes) initClusters(); Operator operator = operatorDAO.get("easyrec"); Session session = new Session("ml100k-import-session", "127.0.0.1"); parseData(operator, ratingsFile, movies, "Movielens_100k", session, useClusters, useItemTypes); for (int i = 0; i < 5; i++) { ratingsFile = new File(datasetFile.getAbsolutePath() + File.separator + "u" + (i + 1) + ".base"); parseData(operator, ratingsFile, movies, "Movielens_100k_" + (i + 1), session, useClusters, useItemTypes); } } private static class Movie { private BitSet genres; private String imdbUrl; private String name; private String releaseDate; private String videoReleaseDate; private String generatedImageUrl; private int id; public Movie(int id, String name, String releaseDate, String videoReleaseDate, String imdbUrl, BitSet genres, String generatedImageUrl) { this.id = id; this.name = name; this.releaseDate = releaseDate; this.videoReleaseDate = videoReleaseDate; this.imdbUrl = imdbUrl; this.genres = genres; this.generatedImageUrl = generatedImageUrl; } public BitSet getGenres() { return genres; } public int getId() { return id; } public String getImdbUrl() { return imdbUrl; } public String getName() { return name; } public String getReleaseDate() { return releaseDate; } public String getVideoReleaseDate() { return videoReleaseDate; } public String getGeneratedImageUrl() { return generatedImageUrl; } } }