package edu.uncc.cs.watsonsim.scripts;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.zip.GZIPInputStream;
import edu.uncc.cs.watsonsim.Database;
import edu.uncc.cs.watsonsim.Environment;
/**
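* This script reads hourly page view files published by Wikimedia at
* http://dumps.wikimedia.org/other/pagecounts-raw/
* and, for every English Wikipedia ("en") entry, adds the reported view count
* to the pageviews column of the matching row (by title) in the meta table.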
*
* @author Sean Gallagher
*
*/
public class WikipediaViewCounter {
    /** Number of queued UPDATEs that are sent to the database as one batch. */
    private static final int BATCH_SIZE = 100000;

    /** Charset used both for reading the dump files and for decoding percent-escaped titles. */
    private static final String ENCODING = "UTF-8";

    /** Dump files that are processed when no file names are given on the command line. */
    private static final String[] DEFAULT_FILENAMES = {
        "pagecounts-20081005-130000.gz",
        "pagecounts-20081009-020001.gz",
        "pagecounts-20081010-190001.gz",
        "pagecounts-20081011-200000.gz",
        "pagecounts-20081017-220000.gz",
        "pagecounts-20081024-010000.gz",
        "pagecounts-20081025-070000.gz",
        "pagecounts-20081107-080000.gz",
        "pagecounts-20081109-130000.gz",
        "pagecounts-20081117-180000.gz",
        "pagecounts-20081119-120000.gz",
        "pagecounts-20081123-180001.gz",
        "pagecounts-20081201-090000.gz",
        "pagecounts-20081204-010000.gz",
        "pagecounts-20081213-000001.gz",
        "pagecounts-20081219-160000.gz",
        "pagecounts-20081222-050000.gz",
        "pagecounts-20081222-160000.gz",
        "pagecounts-20081223-190001.gz",
        "pagecounts-20081230-130000.gz",
        "pagecounts-20091011-160000.gz",
        "pagecounts-20091017-180000.gz",
        "pagecounts-20091017-190001.gz",
        "pagecounts-20091022-110000.gz",
        "pagecounts-20091024-090000.gz",
        "pagecounts-20091102-110000.gz",
        "pagecounts-20091113-080001.gz",
        "pagecounts-20091114-150000.gz",
        "pagecounts-20091120-210000.gz",
        "pagecounts-20091123-180001.gz",
        "pagecounts-20091204-000000.gz",
        "pagecounts-20091218-000000.gz",
        "pagecounts-20091223-050000.gz",
        "pagecounts-20091226-010000.gz",
        "pagecounts-20091228-120000.gz",
        "pagecounts-20101002-200000.gz",
        "pagecounts-20101004-220000.gz",
        "pagecounts-20101006-110000.gz",
        "pagecounts-20101006-220000.gz",
        "pagecounts-20101007-200000.gz",
        "pagecounts-20101008-120000.gz",
        "pagecounts-20101008-130001.gz",
        "pagecounts-20101008-180000.gz",
        "pagecounts-20101008-200000.gz",
        "pagecounts-20101010-100000.gz",
        "pagecounts-20101011-210000.gz",
        "pagecounts-20101025-120000.gz",
        "pagecounts-20101028-160000.gz",
        "pagecounts-20101110-010001.gz",
        "pagecounts-20101113-200000.gz",
        "pagecounts-20101208-070000.gz",
        "pagecounts-20101212-060000.gz",
        "pagecounts-20101217-190000.gz",
        "pagecounts-20101224-200000.gz",
        "pagecounts-20101225-120000.gz",
        "pagecounts-20101227-130000.gz",
        "pagecounts-20101230-120000.gz",
        "pagecounts-20111003-090000.gz",
        "pagecounts-20111009-040000.gz",
        "pagecounts-20111011-230000.gz",
        "pagecounts-20111013-030000.gz",
        "pagecounts-20111017-060000.gz",
        "pagecounts-20111030-150000.gz",
        "pagecounts-20111112-010000.gz",
        "pagecounts-20111116-090000.gz",
        "pagecounts-20111126-000000.gz",
        "pagecounts-20111203-140000.gz",
        "pagecounts-20111208-000001.gz",
        "pagecounts-20111209-030000.gz",
        "pagecounts-20111218-090000.gz",
        "pagecounts-20111223-140000.gz",
        "pagecounts-20121003-140000.gz",
        "pagecounts-20121007-080000.gz",
        "pagecounts-20121017-060001.gz",
        "pagecounts-20121023-200000.gz",
        "pagecounts-20121026-000000.gz",
        "pagecounts-20121030-160000.gz",
        "pagecounts-20121102-040000.gz",
        "pagecounts-20121124-110000.gz",
        "pagecounts-20121129-160000.gz",
        "pagecounts-20121207-150000.gz",
        "pagecounts-20121208-000000.gz",
        "pagecounts-20121209-230000.gz",
        "pagecounts-20121215-010000.gz",
        "pagecounts-20121217-230000.gz",
        "pagecounts-20121220-020001.gz",
        "pagecounts-20131001-170000.gz",
        "pagecounts-20131001-220001.gz",
        "pagecounts-20131005-150014.gz",
        "pagecounts-20131015-140000.gz",
        "pagecounts-20131016-170005.gz",
        "pagecounts-20131109-150010.gz",
        "pagecounts-20131120-210001.gz",
        "pagecounts-20131122-020005.gz",
        "pagecounts-20131125-210002.gz",
        "pagecounts-20131126-050002.gz",
        "pagecounts-20131128-040000.gz",
        "pagecounts-20131130-220002.gz",
        "pagecounts-20131210-090000.gz",
        "pagecounts-20131210-200000.gz"
    };

    /**
     * Applies the English Wikipedia view counts of each dump file to the
     * {@code meta} table, one file after another.
     *
     * <p>A missing file, a read error or an SQL error is reported on stderr
     * and terminates the process with exit status 1.
     *
     * @param args optional paths of gzip-compressed pagecounts files; when
     *             none are given the built-in list of file names is used
     * @throws IOException kept from the original signature; errors while
     *                     reading a dump file are handled here and do not
     *                     propagate
     */
    public static void main(String[] args) throws IOException {
        Database db = new Database(new Environment());
        PreparedStatement statement = db.prep("UPDATE meta SET pageviews = pageviews + ? WHERE title = ?;");
        String[] filenames = (args != null && args.length > 0) ? args : DEFAULT_FILENAMES;
        for (String filename : filenames) {
            try {
                applyFile(statement, filename);
            } catch (FileNotFoundException e) {
                System.err.println("Could not find " + filename);
                System.exit(1);
            } catch (IOException e) {
                System.err.println("Error reading from " + filename);
                e.printStackTrace();
                System.exit(1);
            } catch (SQLException e) {
                System.err.println("Error running SQL while applying " + filename);
                e.printStackTrace();
                System.exit(1);
            }
        }
    }

    /**
     * Reads one gzip-compressed pagecounts file and queues an UPDATE for every
     * well-formed "en" line, flushing the batch every {@link #BATCH_SIZE} rows
     * and once more at the end of the file.
     *
     * <p>Lines have the form
     * {@code project_code page_title_no_quotes_no_spaces pagecount datainbytes},
     * such as {@code en Animal 390 10989083}. Lines with too few fields, an
     * undecodable title or a non-numeric count are skipped; their number is
     * reported on stderr once the file has been read.
     *
     * @param statement prepared UPDATE taking (view count, title)
     * @param filename  path of the dump file
     * @throws IOException  if the file cannot be opened or read
     * @throws SQLException if queuing or executing the batch fails
     */
    private static void applyFile(PreparedStatement statement, String filename)
            throws IOException, SQLException {
        int pending = 0;
        int skipped = 0;
        // Read with an explicit charset so the result does not depend on the platform default.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(new FileInputStream(filename)), ENCODING))) {
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                if (!line.startsWith("en ")) {
                    // Not an English Wikipedia page
                    continue;
                }
                String[] fields = line.split(" ");
                if (fields.length < 3) {
                    skipped++;
                    continue;
                }
                String title = decodeTitle(fields[1]);
                if (title == null) {
                    skipped++;
                    continue;
                }
                int views;
                try {
                    views = Integer.parseInt(fields[2]);
                } catch (NumberFormatException ignored) {
                    // A corrupt count must not abort the whole import.
                    skipped++;
                    continue;
                }
                // Page count
                statement.setInt(1, views);
                // Title of the row to update
                statement.setString(2, title);
                statement.addBatch();
                pending++;
                if (pending == BATCH_SIZE) {
                    System.out.print(".");
                    statement.executeBatch();
                    pending = 0;
                }
            }
            // Flush whatever is left over from the last partial batch.
            statement.executeBatch();
        }
        if (skipped > 0) {
            System.err.println("Skipped " + skipped + " malformed line(s) in " + filename);
        }
    }

    /**
     * Turns the title field of a pagecounts line into a page title with
     * spaces instead of underscores.
     *
     * @param rawTitle percent-encoded title as found in the dump
     * @return the decoded title, or {@code null} if it contains a malformed
     *         percent escape
     * @throws IOException if the UTF-8 charset is reported as unsupported
     */
    private static String decodeTitle(String rawTitle) throws IOException {
        try {
            // URLDecoder follows form-encoding rules and would turn '+' into a
            // space; escape it first so titles such as "C++" stay intact.
            String decoded = URLDecoder.decode(rawTitle.replace("+", "%2B"), ENCODING);
            // Replace after decoding so an escaped underscore (%5F) is covered too.
            return decoded.replace('_', ' ');
        } catch (IllegalArgumentException ignored) {
            // Malformed percent escape: the caller skips this line.
            return null;
        }
    }
}