package org.wikibrain.wikidata; import edu.emory.mathcs.backport.java.util.Collections; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang.StringUtils; import org.wikibrain.parser.DumpSplitter; import org.wikibrain.utils.ParallelForEach; import org.wikibrain.utils.Procedure; import org.wikibrain.utils.WpIOUtils; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Creates a test wikidata dump that extracts some records from a dump file: * - The first 400 items * - All properties * * @author Shilad Sen */ public class CreateTestDump { private static Logger LOG = LoggerFactory.getLogger(CreateTestDump.class); public static void main(String args[]) throws IOException { if (args.length != 2) { System.err.println( "Usage: java " + CreateTestDump.class + " all_wikidata_input.bz2 test_extract_output.bz2\n"); System.exit(1); } final AtomicInteger i = new AtomicInteger(); final AtomicInteger articles = new AtomicInteger(); final List<String> filtered = Collections.synchronizedList(new ArrayList<String>()); BufferedReader reader = WpIOUtils.openBufferedReader(new File(args[0])); while (true) { String line = reader.readLine(); if (line == null) { break; } line = line.trim(); if (line.endsWith(",")) { line = line.substring(0, line.length()-1); } if (articles.incrementAndGet() % 100000 == 0) { LOG.info("processing entry " + articles); } if (line.contains("\"type\":\"property\"")) { filtered.add(line); } else if (line.contains("\"type\":\"item\"") && i.incrementAndGet() < 400) { filtered.add(line); } } final BufferedWriter writer = WpIOUtils.openBZ2Writer(new File(args[1])); writer.write("[\n"); writer.write(StringUtils.join(filtered, ",\n")); writer.write("\n]\n"); writer.close(); } }