package happy.research.data;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.hp.hpl.jena.util.FileManager;
public class Crawler
{
/**
* Crawl user profiles
*
* @throws Exception
*/
public static void crawlUsers() throws Exception
{
String source = "filmTrustSources.txt";
String prefix = "http://trust.mindswap.org/cgi-bin/FilmTrust/foaf.cgi?user=";
String basePath = "C:/Users/guoguibing/Desktop/";
/* Read Sources */
BufferedReader br = new BufferedReader(new FileReader(new File(source)));
String line = null;
Map<String, String> userHrefs = new HashMap<>();
while ((line = br.readLine()) != null)
{
if (line.startsWith("<li>"))
{
int index1 = line.indexOf("user=") + 5;
int index2 = line.indexOf(">", index1);
String user = line.substring(index1, index2).replaceAll(" ", "");
if (user.contains("&"))
{
int index = user.indexOf("&");
user = user.substring(0, index);
}
String href = prefix + user;
if (!userHrefs.containsKey(user)) userHrefs.put(user, href);
}
}
br.close();
/* Read & Write User Models */
String dirPath = basePath + "userProfiles/";
File dir = new File(dirPath);
if (!dir.exists()) dir.mkdirs();
int count = 0;
for (Entry<String, String> entry : userHrefs.entrySet())
{
System.out.println("Count = " + ++count + "/" + userHrefs.size());
String user = entry.getKey();
String href = entry.getValue();
// user = user.replace('`', '-');
user = user.replace('|', '-');
String filePath = dirPath + user + ".rdf";
File file = new File(filePath);
if (file.exists() && file.length() > 0) continue;
String content = FileManager.get().readWholeFileAsUTF8(href);
BufferedWriter bw = new BufferedWriter(new FileWriter(file));
bw.write(content);
bw.close();
}
}
/**
* Crawl movies data, using rating data as a source of movie's ids
*
* @throws Exception
*/
public static void crawlMovies() throws Exception
{
String ratings = "ratings.txt";
String prefix = "http://trust.mindswap.org/cgi-bin/FilmTrust/filmRDF.cgi?movie=";
String basePath = "C:/Users/guoguibing/Desktop/";
List<String> movies = new ArrayList<>();
BufferedReader br = new BufferedReader(new FileReader(new File(ratings)));
String line = null;
while ((line = br.readLine()) != null)
{
String[] data = line.split("::");
String movie = data[1];
if (!movies.contains(movie)) movies.add(movie);
}
br.close();
/* Read and Write Movies data */
String dirPath = basePath + "movies/";
File dir = new File(dirPath);
if (!dir.exists()) dir.mkdirs();
int count = 0;
for (String movie : movies)
{
System.out.println("Current in progress = " + ++count + "/" + movies.size());
String filePath = dirPath + movie + ".rdf";
File file = new File(filePath);
if (file.exists() && file.length() > 0) continue;
String movieHref = prefix + movie;
String content = FileManager.get().readWholeFileAsUTF8(movieHref);
BufferedWriter bw = new BufferedWriter(new FileWriter(file));
bw.write(content);
bw.close();
}
}
public static void main(String[] args) throws Exception
{
Crawler.crawlUsers();
// Crawler.crawlMovies();
}
}