// NOTE(review): this entire file is dead code — the whole class was commented out
// line-by-line, and at some point the file was additionally collapsed onto two huge
// physical lines. Restored the original one-comment-per-line layout below so the
// history is readable; no code has been re-enabled (it depends on third-party
// libraries: com.spbsu.commons, se.lth.cs.nlp mediawiki/wikipedia pipeline, and
// Apache Commons Compress). Consider deleting this file outright and relying on
// VCS history instead of keeping commented-out code.
//
// Purpose of the disabled class (as far as the code shows): read a bzip2-compressed
// Wikipedia XML dump (args[0]), convert wiki markup to plain text, split articles
// into sentences, and feed them to a character-level DictExpansion to build a
// 50000-entry dictionary, periodically dumping pair statistics to a ".dict" file
// next to the input.
//
//package com.spbsu.wiki;
//
//import com.spbsu.commons.io.codec.seq.DictExpansion;
//import com.spbsu.commons.io.codec.seq.Dictionary;
//import com.spbsu.commons.seq.CharSeqAdapter;
//import com.spbsu.commons.util.Holder;
//import com.spbsu.commons.util.ThreadTools;
//import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
//import se.lth.cs.nlp.mediawiki.model.WikipediaPage;
//import se.lth.cs.nlp.mediawiki.parser.SinglestreamXmlDumpParser;
//import se.lth.cs.nlp.pipeline.Filter;
//import se.lth.cs.nlp.pipeline.PipelineBuilder;
//import se.lth.cs.nlp.wikipedia.lang.RuConfig;
//import se.lth.cs.nlp.wikipedia.parser.SwebleWikimarkupToText;
//
//import javax.xml.parsers.SAXParserFactory;
//import java.io.File;
//import java.io.FileInputStream;
//import java.io.FileWriter;
//import java.util.concurrent.BlockingQueue;
//import java.util.concurrent.ThreadPoolExecutor;
//
///**
// * User: solar
// * Date: 01.10.15
// * Time: 12:52
// */
//public class CreateWikiCharDict {
//    public static void main(String[] args) throws Exception {
//        //noinspection unchecked
//        final DictExpansion<Character> expansion = new DictExpansion<>((Dictionary<Character>) Dictionary.EMPTY, 50000, System.out);
//        final String fileName = args[0];
//        final SAXParserFactory factory = SAXParserFactory.newInstance();
//        factory.setValidating(false);
//        // output dictionary goes next to the input, with a ".dict" extension
//        final File output = new File(fileName.substring(0, fileName.lastIndexOf(".")) + ".dict");
////        final DictExpansion<Character> expansion = new DictExpansion<>(new HashSet<>(Arrays.asList('a')), 1000, System.out);
////        for (int i = 0; i < 1000; i++)
//
//        //final SinglestreamXmlDumpParser parser = new SinglestreamXmlDumpParser(new GZIPInputStream(new FileInputStream(fileName)));
//
//        final SinglestreamXmlDumpParser parser = new SinglestreamXmlDumpParser(new BZip2CompressorInputStream(new FileInputStream(fileName)));
//
//        final ThreadPoolExecutor executor = ThreadTools.createBGExecutor("Creating DictExpansion", 1000000);
//
//        PipelineBuilder.input(parser).pipe(new SwebleWikimarkupToText(new RuConfig())).pipe(new Filter<WikipediaPage>() {
//            int index = 0;
//            final Holder<Dictionary<Character>> dumped = new Holder<>();
//
//            @Override
//            protected boolean accept(WikipediaPage wikipediaPage) {
//                String text = wikipediaPage.getText();
//                // normalize whitespace and guillemets before sentence splitting
//                text = text.replaceAll("\\s+", " ");
//                text = text.replaceAll("\u00ab", "\"");
//                text = text.replaceAll("\u00bb", "\"");
//                String[] sentences = text.split("\\.\\s");
//                for (int i = 0; i < sentences.length; i++) {
//                    final String sentence = sentences[i];
//                    if (sentence.length() < 100)
//                        continue;
////                    System.out.println(sentence);
//                    final Runnable item = () -> {
//                        expansion.accept(new CharSeqAdapter(sentence));
//                        // every 10000 sentences, dump the dictionary if it changed
//                        if ((++index) % 10000 == 0 && dumped.getValue() != expansion.result()) {
//                            try {
//                                dumped.setValue(expansion.result());
//                                expansion.printPairs(new FileWriter(output.getAbsolutePath()));
//                            } catch (Exception e) {
//                                e.printStackTrace();
//                            }
//                        }
//                    };
//                    // backpressure: block on the queue when the executor is saturated
//                    final BlockingQueue<Runnable> queue = executor.getQueue();
//                    if (queue.remainingCapacity() == 0) {
//                        try {
//                            queue.put(item);
//                        } catch (InterruptedException e) {
//                            throw new RuntimeException(e);
//                        }
//                    }
//                    else executor.execute(item);
//                }
//                return false;
//            }
//        }).build().run();
//    }
//}