Statistics.java example

Explorer

Mint-Athena-master
- WEB-INF
  - src
    - java
      - gr
        ntua
        ivml
        athena
        XSDDocumentationGeneration.java
        actions
        DoMapping.java
        Download.java
        GeneralAction.java
        Home.java
        Import.java
        ImportStatus.java
        ImportSummary.java
        ImportsPanel.java
        ItemPanel.java
        LockRelease.java
        LockSummary.java
        LoggedInInterceptor.java
        Login.java
        Logout.java
        ManagementAction.java
        Mapselection.java
        OAIHandler.java
        PreviewError.java
        PreviewReport.java
        Profile.java
        Publish.java
        PublishStatus.java
        Register.java
        Reminder.java
        ReportSummary.java
        ScriptTester.java
        Stats.java
        ThesaurusAction.java
        Transform.java
        TransformStatus.java
        XMLPreview.java
        package-info.java
        concurrent
        PublicationProcessor.java
        Queues.java
        Ticker.java
        UploadIndexer.java
        XSLTransform.java
        db
        AsyncNodeStore.java
        DAO.java
        DB.java
        DataUploadDAO.java
        GlobalPrefixStore.java
        LockManager.java
        MappingDAO.java
        OrganizationDAO.java
        PublicationDAO.java
        StatisticsDAO.java
        StatisticsRandomDAO.java
        TestSetup.java
        ThesaurusAssignmentDAO.java
        ThesaurusDAO.java
        TransformationDAO.java
        UserDAO.java
        XMLNodeDAO.java
        XmlObjectDAO.java
        XpathHolderDAO.java
        harvesting
        Harvester.java
        RepositoryValidator.java
        SingleHarvester.java
        concurrent
        HarvestingExecutor.java
        ScheduledHarvestingExecutor.java
        io
        FileImporter.java
        MultipleRecordsFileImporter.java
        MultipleRecordsZipImporter.java
        util
        Test.java
        XMLDbHandler.java
        xml
        schema
        AboutType.java
        DeletedRecordType.java
        DescriptionType.java
        GetRecordType.java
        GranularityType.java
        HeaderType.java
        IdentifyType.java
        ListIdentifiersType.java
        ListMetadataFormatsType.java
        ListRecordsType.java
        ListSetsType.java
        MetadataFormatType.java
        MetadataType.java
        OAIPMHerrorType.java
        OAIPMHerrorcodeType.java
        OAIPMHtype.java
        ObjectFactory.java
        RecordType.java
        RequestType.java
        ResumptionTokenType.java
        SetType.java
        StatusType.java
        VerbType.java
        package-info.java
        mapping
        JSONMappingHandler.java
        MappingElement.java
        MappingManager.java
        MappingSummary.java
        MappingVersionControl.java
        Schema.java
        persistent
        AccessAuthenticationLogic.java
        BlobWrap.java
        DataUpload.java
        EsePublication.java
        Lock.java
        Lockable.java
        Mapping.java
        Organization.java
        Publication.java
        ReportI.java
        SecurityEnabled.java
        Thesaurus.java
        ThesaurusAssignment.java
        Transformation.java
        User.java
        XMLNode.java
        XmlObject.java
        XpathHolder.java
        test
        ExportXmlTest.java
        IndexTest.java
        MappingTest.java
        NodeTest.java
        OrganizationTests.java
        PublicationTest.java
        StatisticTests.java
        TestDbConnection.java
        TestUploadIndexer.java
        TransformationTest.java
        Upload.java
        UploadDeleteTest.java
        UserTest.java
        util
        Config.java
        ConnectionCheckoutLog.java
        EncodingFilter.java
        ExternalSort.java
        HibernateSessionFilter.java
        Import.java
        MailSender.java
        NodeReader.java
        NodeStoreI.java
        OAIStats.java
        Publish.java
        SessionClose.java
        StringUtils.java
        TabbedStringComparator.java
        Transform.java
        TraversableI.java
        VisitorI.java
        xml
        Handler.java
        IndexingHandler.java
        SchemaValidator.java
        StandardIOHandler.java
        Statistics.java
        TreeGenerationParser.java
        UniqueXPathHandler.java
        transform
        XMLFormatter.java
        XMLNodeTransform.java
        XSLTGenerator.java
        XSLTransform.java
        util
        ElementValueMap.java
        Namespaces.java
        XPathUtils.java
        xsd
        XSDParser.java

package gr.ntua.ivml.athena.xml;

import gr.ntua.ivml.athena.db.DB;
import gr.ntua.ivml.athena.persistent.XmlObject;
import gr.ntua.ivml.athena.persistent.XpathHolder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;

/**
 * Read-only statistics facade over a single {@link XmlObject}.
 *
 * Every method delegates to the DAOs obtained from {@link DB} and passes
 * the wrapped {@link #xmlObject} along; no results are cached here.
 */
public class Statistics {
	public static final Logger log = Logger.getLogger(Statistics.class);

	/** Number of entries requested by {@link #getElementValues(String)}. */
	private static final int DEFAULT_VALUE_LIMIT = 1000;

	/**
	 * Hard-coded sample numbers handed out by
	 * {@link #getValueDistribution(String)}; they are not derived from any data.
	 */
	private static final int[] PLACEHOLDER_DISTRIBUTION = { 20, 34, 5, 43, 8, 12, 19, 32, 47, 29 };

	/** The XML object (upload) that every query of this instance is run against. */
	public XmlObject xmlObject;

	/**
	 * @param xml the XML object whose statistics are to be queried
	 */
	public Statistics( XmlObject xml ) {
		this.xmlObject = xml;
	}

	/**
	 * Returns the namespaces of the upload keyed by prefix, e.g.
	 * key {@code oai}, value {@code http://openarchives.org/oai}.
	 * The map keeps the order in which the DAO returned the rows.
	 *
	 * ARNE: namespaces without a prefix are lost here, because rows whose
	 * prefix is null or blank are skipped.
	 *
	 * @return prefix to namespace-uri map, never null
	 */
	public LinkedHashMap<String, String> getNameSpaces() {
		List<Object[]> rows = DB.getXpathHolderDAO().listNamespaces(xmlObject);
		LinkedHashMap<String, String> namespaces = new LinkedHashMap<String, String>();
		for( Object[] row: rows ) {
			Object prefix = row[0];
			Object uri = row[1];
			if( prefix == null || prefix.toString().trim().length() == 0 ) {
				continue;
			}
			// A row with a prefix but no uri used to end in a NullPointerException;
			// there is nothing useful to report for it, so it is skipped.
			if( uri == null ) {
				continue;
			}
			namespaces.put( prefix.toString(), uri.toString());
		}
		return namespaces;
	}

	/**
	 * Returns the elements of one namespace, as reported by the XpathHolder DAO.
	 *
	 * ARNE: namespace prefixes are not mandatory, so elements of namespaces
	 * without a prefix cannot be listed through this method.
	 *
	 * @param namespacePrefix prefix of the namespace to list
	 * @return whatever {@code getElementsByNamespace} returns for the prefix
	 */
	public List<String> getElements(String namespacePrefix){
		return DB.getXpathHolderDAO().getElementsByNamespace(xmlObject, namespacePrefix);
	}

	/**
	 * Returns the distinct attribute names of all xpaths found for an
	 * element name.
	 *
	 * ARNE: attributes of elements that share the name but are of different
	 * types are merged into one list.
	 *
	 * @param elementName name of the element to look up
	 * @return distinct attribute names; the order is that of a HashSet and
	 *         therefore unspecified
	 */
	public List<String> getAttributes(String elementName){
		List<XpathHolder> holders = DB.getXpathHolderDAO().getByName(xmlObject, elementName);
		// The set removes names that occur on more than one matching xpath.
		HashSet<String> names = new HashSet<String>();
		for( XpathHolder holder: holders ) {
			for( XpathHolder attribute: holder.getAttributes()) {
				names.add( attribute.getName());
			}
		}
		return new ArrayList<String>(names);
	}

	/**
	 * Returns frequency and uniqueness for an element/attribute name as
	 * returned by getElements() and getAttributes().
	 *
	 * ARNE: this only really works for attributes and for elements that
	 * contain just text and no subelements.
	 *
	 * @param elementName element or attribute name
	 * @return array of length 2: index 0 is the sum of {@code getCount()} over
	 *         all xpaths with that name, index 1 is the result of the XMLNode
	 *         DAO's {@code countDistinct} for the name
	 */
	public long[] getElementAttrFreqs(String elementName){
		long[] res = new long[2];

		List<XpathHolder> holders = DB.getXpathHolderDAO().getByName(xmlObject, elementName);
		for( XpathHolder holder: holders ) {
			res[0] += holder.getCount();
		}
		res[1] = DB.getXMLNodeDAO().countDistinct(xmlObject, elementName);
		return res;
	}

	/**
	 * Returns the values of an element/attribute together with how often
	 * each one occurs. For free-text elements (e.g. a description) this
	 * statistic has little meaning.
	 *
	 * ARNE: without a limit this can return hundreds of thousands of values
	 * and blow the system.
	 *
	 * @param elementName element or attribute name
	 * @param limit limit handed to the DAO's {@code getCountByValue}
	 * @return value to frequency map as produced by the DAO
	 */
	public Map<String, Integer> getElementValues(String elementName, int limit){
		return DB.getXMLNodeDAO().getCountByValue(xmlObject, elementName, limit);
	}

	/**
	 * Same as {@link #getElementValues(String, int)} with the default limit
	 * of 1000 entries.
	 *
	 * @param elementName element or attribute name
	 * @return value to frequency map as produced by the DAO
	 */
	public Map<String, Integer> getElementValues(String elementName){
		log.debug( "Element values for " + elementName + " on " + xmlObject.getDbID() );
		return getElementValues(elementName, DEFAULT_VALUE_LIMIT);
	}

	/**
	 * Returns the average length of the values of an element/attribute.
	 *
	 * @param elementName element or attribute name
	 * @return result of the XMLNode DAO's {@code getAvgLength}
	 */
	public float getAverageLength(String elementName){
		return DB.getXMLNodeDAO().getAvgLength(xmlObject, elementName);
	}

	/**
	 * Best-effort variant of {@link #getAverageLength(String)}.
	 *
	 * NOTE(review): despite the name no median is computed; this runs the
	 * same {@code getAvgLength} query as getAverageLength. A real median
	 * needs a DAO query that is not visible from this class.
	 *
	 * @param elementName element or attribute name
	 * @return the average length, or 0 when the query throws
	 */
	public float getMedianLength(String elementName) {
		float length = 0;

		try {
			length = DB.getXMLNodeDAO().getAvgLength(xmlObject, elementName);
		} catch(Exception e) {
			// Still best effort (0 is returned), but the failure is no
			// longer swallowed without a trace.
			log.warn( "Length statistic failed for " + elementName, e );
		}
		return length;
	}

	/**
	 * Intended to return the value distribution of an element for the
	 * sparklines: an arbitrary key mapped to a number of occurrences.
	 *
	 * NOTE(review): this is a placeholder. The argument is ignored and the
	 * same ten hard-coded numbers (keys 1..10) are returned on every call.
	 *
	 * ARNE: I dont really get this one, if the keys are irrelevant, what
	 * are the values?
	 *
	 * @param elementName currently unused
	 * @return a new map with the placeholder distribution
	 */
	public HashMap<Integer, Integer> getValueDistribution(String elementName){
		HashMap<Integer, Integer> res = new HashMap<Integer, Integer>();
		for( int i = 0; i < PLACEHOLDER_DISTRIBUTION.length; i++ ) {
			res.put( i + 1, PLACEHOLDER_DISTRIBUTION[i] );
		}
		return res;
	}
}