/* * Copyright (C) ${year} Omry Yadan <${email}> * All rights reserved. * * See https://github.com/omry/banana/blob/master/BSD-LICENSE for licensing information */ package net.yadan.banana.tools; import net.yadan.banana.map.IVarKeyHashMap; import net.yadan.banana.map.VarKeyHashMapVisitorAdapter; import net.yadan.banana.memory.Buffer; import net.yadan.banana.memory.IBuffer; import net.yadan.banana.utils.TextIndex; import net.yadan.utils.Histogram; import net.yadan.utils.RateCounter; import net.yadan.utils.Util; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.File; import java.io.IOException; import java.util.Comparator; import java.util.PriorityQueue; import java.util.StringTokenizer; public class WikipediaIndexer { private static char SEPS[] = " \t,.;:/\\{}[]()<>'\"=|\n!-_*?&0123456789–".toCharArray(); // http://en.wikipedia.org/wiki/Stop_words private static String m_stopWords; static { m_stopWords = "a,able,about,across,after,all,almost,also,am,among,an," + "and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either," + "else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how," + "however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might," + "most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own," + "rather,said,say,says,she,should,since,so,some,than,that,the,their," + "them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were," + "what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"; m_stopWords += "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z," + "one,two,three,four,five,six,seven,eight,nine,ten," + "www,http,html,com,org,net,url,jpg,gif,thumb,history,page,image," + "see,ref,references,name,date,external,well,used,being,use,later,man,over,last,such," + "new,year,title,right,left,called,many,category,first,between," + "cite,reflist,links,known,publisher,more,people,place,same,id,end,number,states,several,state," + "book,early,made,both,book,isbn,world,web,during,up,file,including,th,part,work,accessdate," + "years,time,under,"; } static long total_time = 0; static RateCounter m_documentsRate = new RateCounter(10); static RateCounter m_wordsRate = new RateCounter(10); static int m_numIndexed = 0; private static TextIndex s_index; public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { String xmlFile = args[0]; // TODO : change initial index size s_index = new TextIndex(100, 30); s_index.setMaxDocListSize(Integer.MAX_VALUE); s_index.setDebug(true); StringTokenizer t = new StringTokenizer(m_stopWords, ","); while (t.hasMoreTokens()) { s_index.addStopWord(t.nextToken()); } // if (true) { // indexText(1, test2); // System.exit(0); // } System.out.println("Indexing " + xmlFile); SAXParserFactory factory = SAXParserFactory.newInstance(); SAXParser saxParser = factory.newSAXParser(); DefaultHandler handler = new DefaultHandler() { boolean inPage = false; boolean inPageId = false; boolean inPageTitle = false; boolean inPageRevision = false; boolean inPageRevisionText = false; @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { if (!inPage && "page".equalsIgnoreCase(qName)) { inPage = true; } if (inPage) { if (!inPageTitle && "title".equalsIgnoreCase(qName)) { inPageTitle = true; } if (!inPageId && "id".equalsIgnoreCase(qName)) { inPageId = true; } if (!inPageRevision && "revision".equalsIgnoreCase(qName)) { inPageRevision = true; } } if (inPageRevision) { if (!inPageRevisionText && "text".equalsIgnoreCase(qName)) { inPageRevisionText = true; } } } StringBuilder m_currentText = new StringBuilder(); private int m_currentID; // private String m_currentTitle; @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (inPageRevision) { if (inPageRevisionText && "text".equalsIgnoreCase(qName)) { inPageRevisionText = false; } } if (inPage) { if (inPageRevision && "revision".equalsIgnoreCase(qName)) { String text = m_currentText.toString(); if (!text.startsWith("#REDIRECT") && !text.startsWith("{{Redirect")) { indexText(m_currentID, text); } m_currentText.setLength(0); inPageRevision = false; } if (inPageTitle && "title".equalsIgnoreCase(qName)) { inPageTitle = false; } if (inPageId && "id".equalsIgnoreCase(qName)) { inPageId = false; } } if (inPage && "page".equalsIgnoreCase(qName)) { inPage = false; } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (inPageTitle) { // m_currentTitle = new String(ch, start, length); } if (inPageRevisionText) { m_currentText.append(ch, start, length); } if (inPageId) { String string = new String(ch, start, length); m_currentID = Integer.parseInt(string); } } }; saxParser.parse(new File(xmlFile), handler); } public static void indexText(int documentId, String text) { m_documentsRate.tick(); long t = System.currentTimeMillis(); int numWords = s_index.index(documentId, text, SEPS); m_wordsRate.tick(numWords); total_time += (System.currentTimeMillis() - t); if (m_numIndexed++ % 10000 == 0 && m_documentsRate.getTicksPerSecond() != -1) { printStats(s_index); } } public static void printStats(TextIndex index) { System.out.println("Indexing " + m_documentsRate.getTicksPerSecond() + "/sec, " + m_wordsRate.getTicksPerSecond() + " words/sec"); System.out.println(String.format("Indexed %s documents (with %s words), total of %s", Util.formatNum(index.getNumDocumentsIndexed()), Util.formatNum(index.getNumWordsTokenized()), Util.formatSize(index.getTotalIndexedTextSize()))); long memoryUsage = index.computeMemoryUsage(); IVarKeyHashMap word2Index = index.getWord2DocList(); MaxDocListFinder visitor = new MaxDocListFinder(); word2Index.visitRecords(visitor); final Histogram docListLengthHistogram = Histogram.createStatic(10, visitor.min, visitor.max); IndexStatsCollector collector = new IndexStatsCollector(docListLengthHistogram); word2Index.visitRecords(collector); System.out .println(String.format("Index size %s, index/text size=%.2f%%", Util.formatSize(memoryUsage), 100 * (memoryUsage / (float) index.getTotalIndexedTextSize()))); System.out.println(String.format("\tdoclists : used+free=total : %s+%s=%s", Util.formatSize(collector.totalUsedForDocLists * 4), Util.formatSize(collector.totalFreeForDocLists * 4), Util.formatSize(collector.totalUsedForDocLists * 4 + collector.totalFreeForDocLists * 4))); System.out.println(String.format("\twords : used+free=total : %s+%s=%s", Util.formatSize(collector.totalUsedForWords * 4), Util.formatSize(collector.totalFreeForWords * 4), Util.formatSize(collector.totalUsedForWords * 4 + collector.totalFreeForWords * 4))); System.out.println("Doc list sizes histogram " + docListLengthHistogram); System.out.println(); int n = 0; if (n > 0) { TopXWordsCollector topX = new TopXWordsCollector(n); word2Index.visitRecords(topX); for (WordAndCount w : topX.m_bestWords) { System.out.println("'" + w.word + "' : " + w.count); } } } private static final class IndexStatsCollector extends VarKeyHashMapVisitorAdapter { int totalUsedForWords = 0; int totalFreeForWords = 0; int totalUsedForDocLists = 0; int totalFreeForDocLists = 0; private Histogram m_docListLengthHistogram; public IndexStatsCollector(Histogram docListLengthHistogram) { m_docListLengthHistogram = docListLengthHistogram; } @Override public void visit(IVarKeyHashMap map, int keyPtr, int valuesPtr, long num, long total) { int docListSize = map.getInt(valuesPtr, TextIndex.DOC_LIST_SIZE_OFFSET); m_docListLengthHistogram.addToHistogram(docListSize); int docListAllocation = map.valueMemory().maximumCapacityFor(valuesPtr); totalUsedForDocLists += docListSize; assert docListAllocation >= docListSize; totalFreeForDocLists += (docListAllocation - docListSize); } } private static class WordAndCount implements Comparable<WordAndCount> { String word; long count; public WordAndCount(String w, int c) { word = w; count = c; } @Override public String toString() { return word + ":" + count; } @Override public int compareTo(WordAndCount o) { return (int) (count - o.count); // return count > o.count ? 1 : count < o.count ? -1 : 0; } } private static final class TopXWordsCollector extends VarKeyHashMapVisitorAdapter { PriorityQueue<WordAndCount> m_bestWords; private int m_topX; IBuffer m_tmpWord = new Buffer(50); char chars[] = new char[50]; public TopXWordsCollector(int topX) { if (topX > 0) { m_topX = topX; m_bestWords = new PriorityQueue<WordAndCount>(topX); } else { m_topX = -topX; m_bestWords = new PriorityQueue<WordAndCount>(-topX, new Comparator<WordAndCount>() { @Override public int compare(WordAndCount o1, WordAndCount o2) { return -o1.compareTo(o2); } }); } } @Override public void visit(IVarKeyHashMap map, int keyPtr, int valuesPtr, long num, long total) { int wordSize = map.keysMemory().getInt(keyPtr, 0) * 2; map.keysMemory().getBuffer(keyPtr, 1, m_tmpWord, wordSize); m_tmpWord.getChars(0, chars, 0, wordSize); if (chars[wordSize - 1] == 0) { wordSize--; } int docListSize = map.getInt(valuesPtr, TextIndex.DOC_LIST_SIZE_OFFSET); String word = new String(chars, 0, wordSize); m_bestWords.add(new WordAndCount(word, docListSize)); if (m_bestWords.size() > m_topX) { m_bestWords.remove(); } m_tmpWord.reset(); } } private static final class MaxDocListFinder extends VarKeyHashMapVisitorAdapter { int max = Integer.MIN_VALUE; int min = Integer.MAX_VALUE; @Override public void visit(IVarKeyHashMap map, int keyPtr, int valuePtr, long num, long total) { int size = map.getInt(valuePtr, TextIndex.DOC_LIST_SIZE_OFFSET); max = Math.max(size, max); min = Math.min(size, min); } } static String test = "[[File:Autistic-sweetiepie-boy-with-ducksinarow.jpg|thumb|alt=Young boy asleep on a bed, facing the camera, with only the head visible and the body off-camera. On the bed behind the boy's head is a dozen or so toys carefully arranged in a line." + "|A young boy with autism who has arranged his toys in a row]] '''[[Stereotypy]]''' is repetitive movement, such as hand flapping, head rolling, or body rocking [[Compulsive behavior]]''' is intended and appears to follow rules, such as arranging objects in " + "stacks or lines.'''Sameness''' is resistance to change; for example, insisting that the furniture not be moved or refusing to be interrupted. '''[[Ritual#Psychology|Ritualistic behavior]]''' involves an unvarying pattern of daily activities, such as an " + "unchanging menu or a dressing ritual. This is closely associated with sameness and an independent validation has suggested combining the two factors.<ref name=Lam-Aman>{{vcite journal |journal=J Autism Dev Disord |year=2007 |volume=37 |issue=5 " + "|pages=855–66 |title=The Repetitive Behavior Scale-Revised: independent validation in individuals with autism spectrum disorders |author=Lam KSL, Aman MG |doi=10.1007/s10803-006-0213-z |pmid=17048092 }}</ref> '''Restricted behavior''' is limited " + "in focus, interest, or activity, such as preoccupation with a single television program, toy, or game. '''[[Self-injury]]''' includes movements that injure or can injure the person, such as eye poking, [[skin picking]], hand biting, " + "and head banging.<ref name=Johnson/> A 2007 study reported that self-injury at some point affected about 30% of children with ASD.<ref name=Dominick/> No single repetitive or self-injurious behavior seems to be specific to autism, but only" + " autism appears to have an elevated pattern of occurrence and severity of these behaviors.<ref>{{vcite journal |journal=J Autism Dev Disord |year=2000 |volume=30 |issue=3 |pages=237–43 |title=Varieties of repetitive behavior " + "in autism: comparisons to mental retardation |author=Bodfish JW, Symons FJ, Parker DE, Lewis MH |doi=10.1023/A:1005596502855 |pmid=11055459 }}</ref>"; static String test2 = "[[File:Autistic-sweetiepie-boy-with-ducksinarow.jpg|thumb|alt=Young boy asleep on a bed, facing the camera, with only the head visible and the body off-camera. On the bed behind the boy's head is a dozen or so toys carefully arranged in a line." + "|A young boy with autism who has arranged his toys in a row]] '''[[Stereotypy]]''' is repetitive movement, such as hand flapping, head rolling, or body rocking [[Compulsive behavior]]''' is intended and appears to follow rules, such as arranging objects in " + "stacks or lines.'''Sameness''' is resistance to change; for example, insisting that the furniture not be moved or refusing to be interrupted. '''[[Ritual#Psychology|Ritualistic behavior]]''' involves an unvarying pattern of daily activities, such as an " + "unchanging menu or a dressing ritual. This is closely associated with sameness and an independent validation has suggested combining the two factors.<ref name=Lam-Aman>{{vcite journal |journal=J Autism Dev Disord |year=2007 |volume=37 |issue=5 " + "|pages=855–66 |title=The Repetitive Behavior Scale-Revised: independent validation in individuals with autism spectrum disorders |author=Lam KSL, Aman MG |doi=10.1007/s10803-006-0213-z |pmid=17048092 }}</ref> '''Restricted behavior''' is limited " + "in focus, interest, or activity"; }