/********************************************************************************** * $URL: https://source.sakaiproject.org/svn/citations/trunk/citations-osid/xserver/src/java/org/sakaibrary/osid/repository/xserver/AssetIterator.java $ * $Id: AssetIterator.java 105079 2012-02-24 23:08:11Z ottenhoff@longsight.com $ *********************************************************************************** * * Copyright (c) 2006, 2007, 2008 The Sakai Foundation * * Licensed under the Educational Community License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.opensource.org/licenses/ECL-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **********************************************************************************/ package org.sakaibrary.osid.repository.xserver; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import org.sakaibrary.xserver.session.MetasearchSession; import org.sakaibrary.xserver.session.MetasearchSessionManager; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; /** * @author gbhatnag * @version */ public class AssetIterator extends org.xml.sax.helpers.DefaultHandler implements org.osid.repository.AssetIterator { /* * Xserver error codes */ public static final int XSERVER_ERROR_MERGE_LIMIT = 134; public static final int XSERVER_ERROR_ALL_MERGED = 137; private static final long serialVersionUID = 1L; private static final String REGULAR_EXPRESSION_FILE = "/data/citationRegex.txt"; private static final org.apache.commons.logging.Log LOG = org.apache.commons.logging.LogFactory.getLog( "org.sakaibrary.osid.repository.xserver.AssetIterator" ); private java.util.LinkedList assetQueue; private java.util.ArrayList regexArray; private String guid; private int totalRecordsCursor = 0; private int numRecordsReturned = 0; private org.osid.shared.Id repositoryId; private org.osid.shared.Id recordStructureId; private org.osid.repository.Asset asset; private org.osid.repository.Record record; /* * Preferred URL handling */ private String preferredUrl; private String preferredUrlFormat; // for SAX parsing private StringBuilder textBuffer; // session private MetasearchSessionManager msm; org.osid.shared.Properties statusProperties; /** * Constructs an empty AssetIterator * * @param guid globally unique identifier for this session * @throws org.osid.repository.RepositoryException */ protected AssetIterator( String guid ) throws org.osid.repository.RepositoryException { this.guid = guid; // get session cache manager msm = MetasearchSessionManager.getInstance(); // create assetQueue assetQueue = new java.util.LinkedList(); // load citation regular expressions try { regexArray = loadCitationRegularExpressions( REGULAR_EXPRESSION_FILE ); } catch( java.io.IOException ioe ) { LOG.warn( "AssetIterator() failed reading citation regular " + "expressions - regex file: " + REGULAR_EXPRESSION_FILE, ioe ); } } private java.util.ArrayList loadCitationRegularExpressions( String filename ) throws java.io.IOException { java.util.ArrayList regexArray = new java.util.ArrayList(); java.io.InputStream is = this.getClass().getResourceAsStream( filename ); try { java.io.BufferedReader regexes = new java.io.BufferedReader( new java.io.InputStreamReader( is ) ); try { // read the regex file and add regexes to array String regex; while( ( regex = regexes.readLine() ) != null ) { String [] nameRegex = regex.split( "=" ); CitationRegex citationRegex = new CitationRegex(); citationRegex.setName( nameRegex[ 0 ].trim() ); citationRegex.setRegex( nameRegex[ 1 ].trim() ); regexArray.add( citationRegex ); } } finally { regexes.close(); } } finally { is.close(); } return regexArray; } public boolean hasNextAsset() throws org.osid.repository.RepositoryException { MetasearchSession metasearchSession = msm.getMetasearchSession( guid ); // get an XServer to check status and update number of records found org.sakaibrary.xserver.XServer xserver = null; statusProperties = null; try { xserver = new org.sakaibrary.xserver.XServer( guid ); xserver.updateSearchStatusProperties(); statusProperties = xserver.getSearchStatusProperties(); } catch( org.sakaibrary.xserver.XServerException xse ) { LOG.warn( "X-Server error: " + xse.getErrorCode() + " - " + xse.getErrorText() ); // throw exception now that status has been updated throw new org.osid.repository.RepositoryException( org.sakaibrary.osid.repository.xserver.MetasearchException. METASEARCH_ERROR ); } // check status for error/timeout String status = null; try { status = ( String ) statusProperties.getProperty( "status" ); } catch( org.osid.shared.SharedException se ) { LOG.warn( "hasNextAsset() failed getting status " + "property", se ); } if( status != null ) { // status and statusMessage are set by XServer.updateSearchStatusProperties if( status.equals( "error" ) ) { throw new org.osid.repository.RepositoryException( org.sakaibrary.osid.repository.xserver.MetasearchException. METASEARCH_ERROR ); } else if( status.equals( "timeout" ) ) { throw new org.osid.repository.RepositoryException( org.sakaibrary.osid.repository.xserver.MetasearchException. SESSION_TIMED_OUT ); } else if( status.equals( "empty" ) ) { // no records found return false; } } else { LOG.warn( "hasNextAsset() - status property is null" ); } // get updated metasearchSession metasearchSession = msm.getMetasearchSession( guid ); Integer numRecordsFound = metasearchSession.getNumRecordsFound(); if( numRecordsFound == null || numRecordsFound.intValue() == 0 ) { // still searching for records, return true return true; } // check if passed max number of attainable records int maxAttainable; boolean gotMergeError = metasearchSession.isGotMergeError(); if( gotMergeError ) { maxAttainable = 300; } else { maxAttainable = numRecordsFound.intValue(); } return ( numRecordsReturned < maxAttainable ); } public org.osid.repository.Asset nextAsset() throws org.osid.repository.RepositoryException { LOG.debug( "nextAsset() [entry] - returned: " + numRecordsReturned + "; total: " + totalRecordsCursor + "; in queue: " + assetQueue.size() ); // return Asset, if ready if( assetQueue.size() > 0 ) { numRecordsReturned++; return ( org.osid.repository.Asset ) assetQueue.removeFirst(); } // assetQueue is empty - check whether we should get more records // or throw an Exception if( hasNextAsset() ) { // hasNextAsset() will throw timeout/error Exceptions if any String status = null; try { status = ( String ) statusProperties.getProperty( "status" ); } catch( org.osid.shared.SharedException se ) { LOG.warn( "nextAsset() failed getting status property", se ); } if( status == null || !status.equals( "ready" ) ) { // the X-Server is still searching/fetching - try again later throw new org.osid.repository.RepositoryException( org.sakaibrary.osid.repository.xserver. MetasearchException.ASSET_NOT_FETCHED ); } // get records from the X-Server MetasearchSession metasearchSession = msm.getMetasearchSession( guid ); org.osid.shared.Id repositoryId = metasearchSession.getRepositoryId(); try { org.sakaibrary.xserver.XServer xserver = new org.sakaibrary.xserver.XServer( guid ); LOG.debug( "nextAsset() calling XServer.getRecordsXML() - assets in " + "queue: " + assetQueue.size() ); createAssets( xserver.getRecordsXML( totalRecordsCursor ), repositoryId ); } catch( org.sakaibrary.xserver.XServerException xse ) { LOG.warn( "X-Server error: " + xse.getErrorCode() + " - " + xse.getErrorText() ); // // Have all (or too many) records been merged? If so, indicate // we've fetched everything we can (end-of-file) // if ((xse.getErrorCodeIntValue() == XSERVER_ERROR_MERGE_LIMIT) || (xse.getErrorCodeIntValue() == XSERVER_ERROR_ALL_MERGED)) { LOG.debug("nextAsset(), Xserver Error " + xse.getErrorCodeIntValue() + ", throwing NO_MORE_ITERATOR_ELEMENTS"); throw new org.osid.repository.RepositoryException( org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS); } // // Search error // throw new org.osid.repository.RepositoryException( org.sakaibrary.osid.repository.xserver.MetasearchException. METASEARCH_ERROR ); } LOG.debug( "nextAsset(), XServer.getRecordsXML() returns - assets in " + "queue: " + assetQueue.size() ); // // Make sure there really is an asset available - if not, signal "end-of-file" // // Note: this issue can come up if a database provides an estimate but // no actual results // if (assetQueue.size() == 0) { LOG.debug("nextAsset(), An asset is expected, but the asset queue is enpty"); throw new org.osid.repository.RepositoryException( org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS); } // // records have been fetched and Assets queued // totalRecordsCursor += assetQueue.size(); numRecordsReturned++; return ( org.osid.repository.Asset ) assetQueue.removeFirst(); } else { // no assets available throw new org.osid.repository.RepositoryException( org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS ); } } /** * This method parses the xml StringBuilder and creates Assets, Records * and Parts in the Repository with the given repositoryId. * * @param xml input xml in "sakaibrary" format * @param log the log being used by the Repository * @param repositoryId the Id of the Repository in which to create Assets, * Records and Parts. * * @throws org.osid.repository.RepositoryException */ private void createAssets( java.io.ByteArrayInputStream xml, org.osid.shared.Id repositoryId ) throws org.osid.repository.RepositoryException { this.repositoryId = repositoryId; recordStructureId = RecordStructure.getInstance().getId(); textBuffer = new StringBuilder(); // use a SAX parser javax.xml.parsers.SAXParserFactory factory; javax.xml.parsers.SAXParser saxParser; // set up the parser factory = javax.xml.parsers.SAXParserFactory.newInstance(); factory.setNamespaceAware( true ); // start parsing try { saxParser = factory.newSAXParser(); saxParser.parse( xml, this ); xml.close(); } catch (SAXParseException spe) { // Use the contained exception, if any Exception x = spe; if (spe.getException() != null) { x = spe.getException(); } // Error generated by the parser LOG.warn("createAssets() parsing exception: " + spe.getMessage() + " - xml line " + spe.getLineNumber() + ", uri " + spe.getSystemId(), x ); } catch (SAXException sxe) { // Error generated by this application // (or a parser-initialization error) Exception x = sxe; if (sxe.getException() != null) { x = sxe.getException(); } LOG.warn( "createAssets() SAX exception: " + sxe.getMessage(), x ); } catch (ParserConfigurationException pce) { // Parser with specified options can't be built LOG.warn( "createAssets() SAX parser cannot be built with " + "specified options" ); } catch (IOException ioe) { // I/O error LOG.warn( "createAssets() IO exception", ioe ); } } //---------------------------------- // SAX DEFAULT HANDLER IMPLEMENTATIONS - //---------------------------------- /** * Receive notification of the beginning of an element. * * @see DefaultHandler */ public void startElement( String namespaceURI, String sName, String qName, org.xml.sax.Attributes attrs ) throws org.xml.sax.SAXException { if( qName.equals( "record" ) ) { populateAssetFromText( "record_start" ); /* * No preferred URL seen (yet) */ preferredUrl = null; preferredUrlFormat = null; } } /** * Receive notification of the end of an element. * * @see DefaultHandler */ public void endElement( String namespaceURI, String sName, String qName ) throws org.xml.sax.SAXException { populateAssetFromText( qName ); } /** * Receive notification of character data inside an element. * * @see DefaultHandler */ public void characters( char[] buf, int offset, int len ) throws org.xml.sax.SAXException { // store character data String text = new String( buf, offset, len ); if( textBuffer == null ) { textBuffer = new StringBuilder( text ); } else { textBuffer.append( text ); } } private void populateAssetFromText( String elementName ) { // new record if( elementName.equals( "record_start" ) ) { try { // create a new asset... need title, description, assetId asset = new Asset( null, null, getId(), repositoryId ); // create a new record record = asset.createRecord( recordStructureId ); } catch( org.osid.repository.RepositoryException re ) { LOG.warn( "populateAssetFromText() failed to " + "create new Asset/Record pair.", re ); } } else if( elementName.equals( "record" ) ) { // a record has ended: do post-processing // // set dateRetrieved setDateRetrieved(); // use inLineCitation to fill in other fields, if possible org.osid.repository.Part inLineCitation; try { if( ( inLineCitation = recordHasPart( InLineCitationPartStructure.getInstance().getType() ) ) != null ) { doRegexParse( ( String )inLineCitation.getValue() ); } } catch( org.osid.repository.RepositoryException re ) { LOG.warn( "populateAssetFromText() failed to " + "gracefully process inLineCitation value.", re ); } // create a preferred URL (if we found all the parts) try { if (preferredUrl != null) { if ((preferredUrlFormat != null) && !(preferredUrlFormat.equalsIgnoreCase("HTML"))) { LOG.debug("Unexpected URL format: " + preferredUrlFormat); } if ((preferredUrlFormat == null) || (preferredUrlFormat.equalsIgnoreCase("HTML"))) { record.createPart(PreferredUrlPartStructure.getInstance().getId(), preferredUrl); } } } catch( org.osid.repository.RepositoryException exception) { LOG.warn("Failed to create preferred URL Part", exception); } finally { preferredUrl = null; preferredUrlFormat = null; } // All done with this asset assetQueue.add( asset ); return; } if( textBuffer == null ) { return; } String text = textBuffer.toString().trim(); if( text.equals( "" ) ) { return; } try { if( elementName.equals( "title" ) ) { asset.updateDisplayName( text ); } else if( elementName.equals( "abstract" ) ) { asset.updateDescription( text ); } else if( elementName.equals( "author" ) ) { record.createPart( CreatorPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "date" ) ) { record.createPart( DatePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "doi" ) ) { record.createPart( DOIPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "edition" ) ) { record.createPart( EditionPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "inLineCitation" ) ) { record.createPart( InLineCitationPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "isnIdentifier" ) ) { record.createPart( IsnIdentifierPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "issue" ) ) { record.createPart( IssuePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "language" ) ) { record.createPart( LanguagePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "note" ) ) { record.createPart( NotePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "openUrl" ) ) { record.createPart( OpenUrlPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "pages" ) ) { createPagesPart( text ); } else if( elementName.equals( "publisherInfo" ) ) { record.createPart( PublisherPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "rights" ) ) { record.createPart( RightsPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "sourceTitle" ) ) { record.createPart( SourceTitlePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "subject" ) ) { record.createPart( SubjectPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "type" ) ) { record.createPart( TypePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "url" ) ) { record.createPart( URLPartStructure.getInstance().getId(), text ); preferredUrl = text; } else if( elementName.equals( "urlLabel" ) ) { record.createPart( URLLabelPartStructure.getInstance().getId(), text ); } else if( elementName.equals( "urlFormat" ) ) { record.createPart( URLFormatPartStructure.getInstance().getId(), text ); preferredUrlFormat = text; } else if( elementName.equals( "volume" ) ) { record.createPart( VolumePartStructure.getInstance().getId(), text ); } else if( elementName.equals( "volumeIssue" ) ) { doRegexParse( text ); } else if( elementName.equals( "year" ) ) { record.createPart( YearPartStructure.getInstance().getId(), text ); } } catch( org.osid.repository.RepositoryException re ) { LOG.warn( "populateAssetFromText() failed to " + "create new Part.", re ); } textBuffer = null; } private void setDateRetrieved() { java.util.GregorianCalendar now = new java.util.GregorianCalendar(); int month = now.get( java.util.Calendar.MONTH ) + 1; int date = now.get( java.util.Calendar.DATE ); String monthStr, dateStr; if( month < 10 ) { monthStr = "0" + month; } else { monthStr = String.valueOf( month ); } if( date < 10 ) { dateStr = "0" + date; } else { dateStr = String.valueOf( date ); } String dateRetrieved = now.get( java.util.Calendar.YEAR ) + "-" + monthStr + "-" + dateStr; try { record.createPart( DateRetrievedPartStructure.getInstance().getId(), dateRetrieved ); } catch( org.osid.repository.RepositoryException re ) { LOG.warn( "setDateRetrieved() failed " + "creating new dateRetrieved Part.", re ); } } /** * This method searches the current record for a Part using its * PartStructure Type. * * @param partStructureType PartStructure Type of Part you need. * @return the Part if it exists in the current record, null if it does not. */ private org.osid.repository.Part recordHasPart( org.osid.shared.Type partStructureType ) { try { org.osid.repository.PartIterator pit = record.getParts(); while( pit.hasNextPart() ) { org.osid.repository.Part part = pit.nextPart(); if( part.getPartStructure().getType().isEqual( partStructureType ) ) { return part; } } } catch( org.osid.repository.RepositoryException re ) { LOG.warn( "recordHasPart() failed getting Parts.", re ); } // did not find the Part return null; } /** * This method does its best to map data contained in an inLineCitation to * other fields such as volume, issue, etc. in the case that they are empty. * It compares the citation to a known set of regular expressions contained * in REGULAR_EXPRESSION_FILE. Adding a new regular expression entails * adding a new case for parsing in this method. * * @param citation inLineCitation to be parsed */ private void doRegexParse( String citation ) { String regexName = null; Pattern pattern; Matcher matcher; boolean hasVolume = false; boolean hasIssue = false; boolean hasDate = false; boolean hasPages = false; boolean hasSourceTitle = false; for( int i = 0; i < regexArray.size(); i++ ) { CitationRegex citationRegex = ( CitationRegex ) regexArray.get( i ); pattern = Pattern.compile( citationRegex.getRegex() ); matcher = pattern.matcher( citation ); if( matcher.find() ) { regexName = citationRegex.getName(); break; } } if( regexName != null ) { // determine which fields are necessary try { hasVolume = recordHasPart( VolumePartStructure.getInstance().getType() ) == null ? false : true; hasIssue = recordHasPart( IssuePartStructure.getInstance().getType() ) == null ? false : true; hasDate = recordHasPart( DatePartStructure.getInstance().getType() ) == null ? false : true; hasPages = recordHasPart( PagesPartStructure.getInstance().getType() ) == null ? false : true; hasSourceTitle = recordHasPart( SourceTitlePartStructure.getInstance().getType() ) == null ? false : true; // if all true, no need to go further if( hasVolume && hasIssue && hasDate && hasPages && hasSourceTitle ) { return; } // check for matching regex if( regexName.equals( "zooRec" ) ) { // .+ \d+(\(\d+\))?, (.*)? \d{4}: \d+-\d+ if( !hasVolume ) { pattern = Pattern.compile( "\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group() ); } } if( !hasIssue ) { pattern = Pattern.compile( "\\(\\d+\\)" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasDate ) { pattern = Pattern.compile( ", (.*)? \\d{4}:" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String date = matcher.group().substring( 2, matcher.group().length()-1 ); record.createPart( DatePartStructure.getInstance().getId(), date ); } } if( !hasPages ) { pattern = Pattern.compile( "\\d+-\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { createPagesPart( matcher.group() ); } } if( !hasSourceTitle ) { pattern = Pattern.compile( "\\D+\\d" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String sourceTitle = matcher.group().substring( 0, matcher.group().length()-2 ); record.createPart( SourceTitlePartStructure.getInstance().getId(), sourceTitle ); } } } else if( regexName.equals( "animBehavAbs" ) ) { // .+ Vol\. \d+, no\. \d+, (\d+)? pp\.|p\. \d+(-\d+.)? (.*)? \d{4}\.$ if( !hasVolume ) { pattern = Pattern.compile( "Vol\\. \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasIssue ) { pattern = Pattern.compile( "no\\. \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasDate ) { pattern = Pattern.compile( "(pp\\.|p\\.) \\d+(-\\d+\\.)? (.*)? \\d{4}\\.$" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String date = matcher.group().substring( matcher.group().indexOf( " ", 4 ) + 1, matcher.group().length()-1 ); record.createPart( DatePartStructure.getInstance().getId(), date ); } } if( !hasPages ) { pattern = Pattern.compile( "(pp\\.|p\\.) \\d+(-\\d+\\.)?" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { createPagesPart( matcher.group() ); } } if( !hasSourceTitle ) { pattern = Pattern.compile( ".+ \\[" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String sourceTitle = matcher.group().substring( 0, matcher.group().length() - 2 ); record.createPart( SourceTitlePartStructure.getInstance().getId(), sourceTitle ); } } } else if( regexName.equals( "pubMed" ) ) { // .+ (Volume: \\d+, )?Issue: ((\\d+)|(\\w+)), Date: \\d{4} \\d+ \\d+,( Pages: \\d+-\\d+)? if( !hasVolume ) { pattern = Pattern.compile( "Volume: \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasIssue ) { pattern = Pattern.compile( "Issue: ((\\d+)|(\\w+))" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String issue = matcher.group().substring( 7, matcher.group().length() ); record.createPart( IssuePartStructure.getInstance().getId(), issue ); } } if( !hasDate ) { pattern = Pattern.compile( "Date: \\d{4} \\d+ \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String date = matcher.group().substring( 6, matcher.group().length() ); date = date.replaceAll( "\\s", "-" ); record.createPart( DatePartStructure.getInstance().getId(), date ); } } if( !hasPages ) { pattern = Pattern.compile( "\\d+-\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { createPagesPart( matcher.group() ); } } if( !hasSourceTitle ) { pattern = Pattern.compile( ".+\\. Vol" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String sourceTitle = matcher.group().substring( 0, matcher.group().length()-5 ); record.createPart( SourceTitlePartStructure.getInstance().getId(), sourceTitle ); } } } else if( regexName.equals( "isiWos" ) ) { // ^\d+( \(\d+\))?: \w+-.+(.+)?( \w{3})?( \w{3}-\w{3})?( \d+)? \d{4}$ if( !hasVolume ) { pattern = Pattern.compile( "^\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group() ); } } if( !hasIssue ) { pattern = Pattern.compile( "\\(\\d+\\)" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasDate ) { pattern = Pattern.compile( "( \\w{3})?( \\w{3}-\\w{3})?( \\d+)? \\d{4}$" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( DatePartStructure.getInstance().getId(), matcher.group().trim() ); } } if( !hasPages ) { pattern = Pattern.compile( " \\w+(-\\w+)?" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { createPagesPart( matcher.group().trim() ); } } } else if( regexName.equals( "jstor" ) ) { // .+, Vol\. \d+(, No\. \d+)? if( !hasVolume ) { pattern = Pattern.compile( "Vol\\. \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasIssue ) { pattern = Pattern.compile( "No\\. \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasSourceTitle ) { pattern = Pattern.compile( ".+, Vol" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { String sourceTitle = matcher.group().substring( 0, matcher.group().length() - 5 ); record.createPart( SourceTitlePartStructure.getInstance().getId(), sourceTitle ); } } } else if( regexName.equals( "eric" ) ) { // ^v\d+ n|v\d+ p\d+-\d+( \w{3})?( \w{3}-\w{3})?( \d+)? \d{4}$ if( !hasVolume ) { pattern = Pattern.compile( "^v\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasIssue ) { pattern = Pattern.compile( " (n|v)\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().trim().replaceAll( "\\D", "" ) ); } } if( !hasDate ) { pattern = Pattern.compile( "( \\w{3})?( \\w{3}-\\w{3})?( \\d+)? \\d{4}$" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( DatePartStructure.getInstance().getId(), matcher.group().trim() ); } } if( !hasPages ) { pattern = Pattern.compile( "\\d+-\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { createPagesPart( matcher.group() ); } } } else if( regexName.equals( "proquest" ) ) { // ^\d+; \d+(; .+)? if( !hasVolume ) { pattern = Pattern.compile( "^\\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group() ); } } if( !hasIssue ) { pattern = Pattern.compile( "; \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasSourceTitle ) { pattern = Pattern.compile( "; \\D+$" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( SourceTitlePartStructure.getInstance().getId(), matcher.group().substring( 2, matcher.group().length() ) ); } } } else if( regexName.equals( "psycInfo" ) ) { // ^Vol \d+\([\w\p{Punct}]+\)) if( !hasVolume ) { pattern = Pattern.compile( "^Vol \\d+" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( VolumePartStructure.getInstance().getId(), matcher.group().replaceAll( "\\D", "" ) ); } } if( !hasIssue ) { pattern = Pattern.compile( "\\(.+\\)" ); matcher = pattern.matcher( citation ); if( matcher.find() ) { record.createPart( IssuePartStructure.getInstance().getId(), matcher.group().substring( 1, matcher.group().length() - 1 ) ); } } } } catch( org.osid.repository.RepositoryException re ) { LOG.warn( "doRegexParse() failed getting " + "PartStructure Types.", re ); } } } private void createPagesPart( String text ) throws org.osid.repository.RepositoryException { if( text.charAt( 0 ) == ',' ) { // getting a poorly formatted field return; } record.createPart( PagesPartStructure.getInstance().getId(), text ); // get start and end page if possible String [] pages = text.split( "-" ); if( pages.length == 0 ) { // cannot create start/end page. return; } String spage = pages[ 0 ].trim(); // delete all non-digit chars (ie: p., pp., etc.) spage = spage.replaceAll( "\\D", "" ); // create startPage part record.createPart( StartPagePartStructure.getInstance().getId(), spage ); // end page if( pages.length == 2 ) { String epage = pages[ 1 ].trim(); epage = epage.replaceAll( "\\D", "" ); record.createPart( EndPagePartStructure.getInstance().getId(), epage ); } } private String getId() { return "asset" + Math.random() * 1000 + System.currentTimeMillis(); } }