/**********************************************************************************
* $URL: https://source.sakaiproject.org/svn/citations/trunk/citations-osid/xserver/src/java/org/sakaibrary/osid/repository/xserver/AssetIterator.java $
* $Id: AssetIterator.java 105079 2012-02-24 23:08:11Z ottenhoff@longsight.com $
***********************************************************************************
*
* Copyright (c) 2006, 2007, 2008 The Sakai Foundation
*
* Licensed under the Educational Community License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.opensource.org/licenses/ECL-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************************/
package org.sakaibrary.osid.repository.xserver;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import org.sakaibrary.xserver.session.MetasearchSession;
import org.sakaibrary.xserver.session.MetasearchSessionManager;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
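/**
 * Iterates over the Assets produced by an X-Server metasearch session.
 * Record XML is fetched through {@code org.sakaibrary.xserver.XServer} and
 * parsed with SAX (this class is its own handler) into Assets, Records and
 * Parts, which are queued and handed out by {@link #nextAsset()}.
 *
 * @author gbhatnag
 */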
public class AssetIterator extends org.xml.sax.helpers.DefaultHandler
implements org.osid.repository.AssetIterator {
/*
* Xserver error codes
*
* nextAsset() treats either code as "all (or too many) records have been
* merged" and reports it as end-of-iteration rather than as a search error.
*/
public static final int XSERVER_ERROR_MERGE_LIMIT = 134;
public static final int XSERVER_ERROR_ALL_MERGED = 137;
private static final long serialVersionUID = 1L;
// classpath resource (read via getResourceAsStream) holding one
// "name=regex" entry per line
private static final String REGULAR_EXPRESSION_FILE = "/data/citationRegex.txt";
private static final org.apache.commons.logging.Log LOG =
org.apache.commons.logging.LogFactory.getLog(
"org.sakaibrary.osid.repository.xserver.AssetIterator" );
// Assets that have been parsed but not yet returned by nextAsset()
private java.util.LinkedList assetQueue;
// CitationRegex entries loaded by the constructor from REGULAR_EXPRESSION_FILE
private java.util.ArrayList regexArray;
// identifier used to look up the MetasearchSession and to construct XServer
private String guid;
// offset passed to XServer.getRecordsXML(); advanced by the number of
// Assets queued after each fetch
private int totalRecordsCursor = 0;
// number of Assets handed out by nextAsset() so far
private int numRecordsReturned = 0;
// set by createAssets() and used when building each Asset/Record
private org.osid.shared.Id repositoryId;
private org.osid.shared.Id recordStructureId;
// Asset and Record currently being populated by the SAX callbacks
private org.osid.repository.Asset asset;
private org.osid.repository.Record record;
/*
* Preferred URL handling
*
* Hold the text of the most recent "url" / "urlFormat" elements seen in the
* current record; cleared when a record starts and when it ends.
*/
private String preferredUrl;
private String preferredUrlFormat;
// for SAX parsing: accumulates character data between element callbacks
private StringBuilder textBuffer;
// session
private MetasearchSessionManager msm;
// search status most recently fetched by hasNextAsset(); also read by
// nextAsset(). Package-private (no access modifier).
org.osid.shared.Properties statusProperties;
/**
 * Constructs an empty AssetIterator bound to the metasearch session
 * identified by {@code guid}.
 *
 * The citation regular expressions are loaded from
 * {@code REGULAR_EXPRESSION_FILE}. A failure to read them is logged and the
 * iterator continues with an empty expression list, so in-line citations
 * are simply not parsed further.
 *
 * @param guid globally unique identifier for this session
 * @throws org.osid.repository.RepositoryException
 */
protected AssetIterator( String guid )
        throws org.osid.repository.RepositoryException {
    this.guid = guid;

    // get session cache manager
    msm = MetasearchSessionManager.getInstance();

    // create assetQueue
    assetQueue = new java.util.LinkedList();

    // Start from an empty list so regexArray is never null: previously a
    // load failure left it unset and the first citation parse threw a
    // NullPointerException.
    regexArray = new java.util.ArrayList();

    // load citation regular expressions
    try {
        regexArray = loadCitationRegularExpressions( REGULAR_EXPRESSION_FILE );
    } catch( java.io.IOException ioe ) {
        LOG.warn( "AssetIterator() failed reading citation regular " +
                "expressions - regex file: " + REGULAR_EXPRESSION_FILE, ioe );
    }
}
/**
 * Reads citation regular expressions from a classpath resource. Each usable
 * line has the form {@code name=regex}; the text before the first '=' is
 * the name and everything after it is the expression (so an expression may
 * itself contain '='). Blank lines are ignored and lines without a name or
 * an expression are skipped with a warning.
 *
 * @param filename classpath resource to read
 * @return list of CitationRegex objects, in file order
 * @throws java.io.IOException if the resource cannot be found or read
 */
private java.util.ArrayList loadCitationRegularExpressions( String filename )
        throws java.io.IOException {
    java.io.InputStream is = this.getClass().getResourceAsStream( filename );
    if( is == null ) {
        // getResourceAsStream() returns null for a missing resource; report
        // that as an I/O failure instead of a NullPointerException
        throw new java.io.FileNotFoundException(
                "citation regular expression resource not found: " + filename );
    }

    java.util.ArrayList loaded = new java.util.ArrayList();
    java.io.BufferedReader reader = new java.io.BufferedReader(
            new java.io.InputStreamReader( is ) );
    try {
        String line;
        while( ( line = reader.readLine() ) != null ) {
            if( line.trim().length() == 0 ) {
                continue;
            }

            // split on the FIRST '=' only
            int separator = line.indexOf( '=' );
            String name = ( separator < 0 ) ? "" :
                    line.substring( 0, separator ).trim();
            String regex = ( separator < 0 ) ? "" :
                    line.substring( separator + 1 ).trim();
            if( name.length() == 0 || regex.length() == 0 ) {
                LOG.warn( "loadCitationRegularExpressions() skipping " +
                        "malformed line in " + filename + ": " + line );
                continue;
            }

            CitationRegex citationRegex = new CitationRegex();
            citationRegex.setName( name );
            citationRegex.setRegex( regex );
            loaded.add( citationRegex );
        }
    } finally {
        // closing the reader also closes the underlying stream
        reader.close();
    }
    return loaded;
}
/**
 * Refreshes the search status from the X-Server and reports whether more
 * Assets may still be returned.
 *
 * Returns true while the session has no record count yet (null or zero),
 * false when the status is "empty", and otherwise compares the number of
 * Assets already returned with the attainable maximum (300 when the session
 * has flagged a merge error, else the number of records found).
 *
 * @return true if another Asset may become available
 * @throws org.osid.repository.RepositoryException METASEARCH_ERROR on an
 *         X-Server failure or "error" status, SESSION_TIMED_OUT on a
 *         "timeout" status
 */
public boolean hasNextAsset()
        throws org.osid.repository.RepositoryException {
    MetasearchSession session = msm.getMetasearchSession( guid );

    // ask the X-Server for the current search status
    statusProperties = null;
    org.sakaibrary.xserver.XServer server = null;
    try {
        server = new org.sakaibrary.xserver.XServer( guid );
        server.updateSearchStatusProperties();
        statusProperties = server.getSearchStatusProperties();
    } catch( org.sakaibrary.xserver.XServerException failure ) {
        LOG.warn( "X-Server error: " + failure.getErrorCode() +
                " - " + failure.getErrorText() );

        // the status has been updated; surface the failure to the caller
        throw new org.osid.repository.RepositoryException(
                org.sakaibrary.osid.repository.xserver.MetasearchException.
                METASEARCH_ERROR );
    }

    // read the "status" property (set by XServer.updateSearchStatusProperties)
    String searchStatus = null;
    try {
        searchStatus = ( String ) statusProperties.getProperty( "status" );
    } catch( org.osid.shared.SharedException failure ) {
        LOG.warn( "hasNextAsset() failed getting status property", failure );
    }

    if( searchStatus == null ) {
        LOG.warn( "hasNextAsset() - status property is null" );
    } else if( searchStatus.equals( "error" ) ) {
        throw new org.osid.repository.RepositoryException(
                org.sakaibrary.osid.repository.xserver.MetasearchException.
                METASEARCH_ERROR );
    } else if( searchStatus.equals( "timeout" ) ) {
        throw new org.osid.repository.RepositoryException(
                org.sakaibrary.osid.repository.xserver.MetasearchException.
                SESSION_TIMED_OUT );
    } else if( searchStatus.equals( "empty" ) ) {
        // the search found nothing
        return false;
    }

    // re-read the session now that the status has been refreshed
    session = msm.getMetasearchSession( guid );
    Integer found = session.getNumRecordsFound();
    if( found == null || found.intValue() == 0 ) {
        // no count yet: the search is still running
        return true;
    }

    // after a merge error only a fixed number of records is attainable
    int ceiling = session.isGotMergeError() ? 300 : found.intValue();
    return numRecordsReturned < ceiling;
}
/**
 * Returns the next Asset, fetching and parsing another batch of records
 * from the X-Server when the local queue is empty.
 *
 * @return the next queued Asset
 * @throws org.osid.repository.RepositoryException
 *         NO_MORE_ITERATOR_ELEMENTS when nothing further can be returned,
 *         ASSET_NOT_FETCHED when the search status is not yet "ready",
 *         METASEARCH_ERROR on any other X-Server failure, plus whatever
 *         hasNextAsset() throws
 */
public org.osid.repository.Asset nextAsset()
        throws org.osid.repository.RepositoryException {
    LOG.debug( "nextAsset() [entry] - returned: " + numRecordsReturned +
            "; total: " + totalRecordsCursor +
            "; in queue: " + assetQueue.size() );

    // fast path: something is already queued
    if( !assetQueue.isEmpty() ) {
        numRecordsReturned++;
        return ( org.osid.repository.Asset ) assetQueue.removeFirst();
    }

    // queue is empty - hasNextAsset() refreshes the status and throws
    // the timeout/error exceptions itself
    if( !hasNextAsset() ) {
        throw new org.osid.repository.RepositoryException(
                org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS );
    }

    String searchStatus = null;
    try {
        searchStatus = ( String ) statusProperties.getProperty( "status" );
    } catch( org.osid.shared.SharedException failure ) {
        LOG.warn( "nextAsset() failed getting status property", failure );
    }

    if( !"ready".equals( searchStatus ) ) {
        // still searching/fetching on the X-Server side - caller retries
        throw new org.osid.repository.RepositoryException(
                org.sakaibrary.osid.repository.xserver.
                MetasearchException.ASSET_NOT_FETCHED );
    }

    // fetch the next batch of records and turn them into Assets
    MetasearchSession session = msm.getMetasearchSession( guid );
    org.osid.shared.Id sessionRepositoryId = session.getRepositoryId();
    try {
        org.sakaibrary.xserver.XServer server =
                new org.sakaibrary.xserver.XServer( guid );

        LOG.debug( "nextAsset() calling XServer.getRecordsXML() - assets in queue: " +
                assetQueue.size() );
        createAssets( server.getRecordsXML( totalRecordsCursor ),
                sessionRepositoryId );
    } catch( org.sakaibrary.xserver.XServerException failure ) {
        LOG.warn( "X-Server error: " + failure.getErrorCode() + " - " +
                failure.getErrorText() );

        // all (or too many) records merged: nothing more can be fetched,
        // so report end-of-iteration rather than an error
        int errorCode = failure.getErrorCodeIntValue();
        boolean mergeExhausted = errorCode == XSERVER_ERROR_MERGE_LIMIT
                || errorCode == XSERVER_ERROR_ALL_MERGED;
        if( mergeExhausted ) {
            LOG.debug( "nextAsset(), Xserver Error " + errorCode +
                    ", throwing NO_MORE_ITERATOR_ELEMENTS" );
            throw new org.osid.repository.RepositoryException(
                    org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS );
        }

        // anything else is a search error
        throw new org.osid.repository.RepositoryException(
                org.sakaibrary.osid.repository.xserver.MetasearchException.
                METASEARCH_ERROR );
    }

    LOG.debug( "nextAsset(), XServer.getRecordsXML() returns - assets in queue: " +
            assetQueue.size() );

    // A database may report an estimate but deliver no actual results;
    // in that case signal end-of-iteration.
    if( assetQueue.isEmpty() ) {
        LOG.debug( "nextAsset(), An asset is expected, but the asset queue is enpty" );
        throw new org.osid.repository.RepositoryException(
                org.osid.shared.SharedException.NO_MORE_ITERATOR_ELEMENTS );
    }

    // records were fetched and queued: advance the cursor past them
    totalRecordsCursor += assetQueue.size();
    numRecordsReturned++;
    return ( org.osid.repository.Asset ) assetQueue.removeFirst();
}
/**
 * Parses the given record XML with a namespace-aware SAX parser, using this
 * object as the handler. The handler callbacks create Assets, Records and
 * Parts and add each finished Asset to the asset queue.
 *
 * Parsing problems are logged and otherwise ignored (best effort): Assets
 * completed before the failure remain queued.
 *
 * @param xml input xml in "sakaibrary" format
 * @param repositoryId the Id of the Repository in which to create Assets,
 * Records and Parts.
 *
 * @throws org.osid.repository.RepositoryException
 */
private void createAssets( java.io.ByteArrayInputStream xml,
        org.osid.shared.Id repositoryId )
        throws org.osid.repository.RepositoryException {
    this.repositoryId = repositoryId;
    recordStructureId = RecordStructure.getInstance().getId();
    textBuffer = new StringBuilder();

    // set up the parser
    // NOTE(review): DTD / external-entity processing is left at the parser
    // defaults. If the record XML can ever carry untrusted content, consider
    // disabling it on the factory - confirm the X-Server output has no DOCTYPE.
    javax.xml.parsers.SAXParserFactory factory =
            javax.xml.parsers.SAXParserFactory.newInstance();
    factory.setNamespaceAware( true );

    // start parsing
    try {
        javax.xml.parsers.SAXParser saxParser = factory.newSAXParser();
        saxParser.parse( xml, this );
        xml.close();
    } catch( SAXParseException spe ) {
        // Error generated by the parser; prefer the wrapped exception, if any
        Exception cause = spe;
        if( spe.getException() != null ) {
            cause = spe.getException();
        }
        LOG.warn( "createAssets() parsing exception: " +
                spe.getMessage() + " - xml line " + spe.getLineNumber() +
                ", uri " + spe.getSystemId(), cause );
    } catch( SAXException sxe ) {
        // Error generated by this application
        // (or a parser-initialization error)
        Exception cause = sxe;
        if( sxe.getException() != null ) {
            cause = sxe.getException();
        }
        LOG.warn( "createAssets() SAX exception: " + sxe.getMessage(), cause );
    } catch( ParserConfigurationException pce ) {
        // Parser with specified options can't be built; keep the cause in
        // the log so the misconfiguration can be diagnosed
        LOG.warn( "createAssets() SAX parser cannot be built with " +
                "specified options", pce );
    } catch( IOException ioe ) {
        // I/O error
        LOG.warn( "createAssets() IO exception", ioe );
    }
}
//----------------------------------
// SAX DEFAULT HANDLER IMPLEMENTATIONS -
//----------------------------------
/**
 * SAX callback for an opening tag. Only the opening of a "record" element
 * matters: it starts a new Asset/Record pair and forgets any preferred URL
 * information left over from the previous record.
 *
 * @see DefaultHandler
 */
public void startElement( String namespaceURI, String sName,
        String qName, org.xml.sax.Attributes attrs ) throws
        org.xml.sax.SAXException {
    if( !qName.equals( "record" ) ) {
        return;
    }

    populateAssetFromText( "record_start" );

    // no preferred URL seen (yet) for this record
    preferredUrl = null;
    preferredUrlFormat = null;
}
/**
 * SAX callback for a closing tag. The qualified name of the element that
 * just closed is handed to populateAssetFromText(), which processes the
 * character data collected so far.
 *
 * @see DefaultHandler
 */
public void endElement( final String namespaceURI, final String sName,
        final String qName )
        throws org.xml.sax.SAXException {
    populateAssetFromText( qName );
}
/**
 * SAX callback for character data. The reported slice is appended to the
 * text buffer, which is created on demand.
 *
 * @see DefaultHandler
 */
public void characters( char[] buf, int offset, int len )
        throws org.xml.sax.SAXException {
    String chunk = new String( buf, offset, len );

    if( textBuffer == null ) {
        textBuffer = new StringBuilder();
    }
    textBuffer.append( chunk );
}
/**
 * Applies one SAX event to the Asset/Record currently being built.
 *
 * "record_start" (sent by startElement) begins a new Asset/Record pair,
 * "record" (the closing tag) finishes and queues it, and any other element
 * name stores the buffered text as the matching Asset field or Part.
 * Elements seen while no record is open - including every element of a
 * record whose Asset could not be created - are ignored.
 *
 * @param elementName "record_start" or the qualified name of the element
 *        that just ended
 */
private void populateAssetFromText( String elementName ) {
    if( elementName.equals( "record_start" ) ) {
        beginRecord();
        return;
    }
    if( elementName.equals( "record" ) ) {
        finishRecord();
        return;
    }

    if( textBuffer == null ) {
        return;
    }
    String text = textBuffer.toString().trim();
    if( text.equals( "" ) ) {
        // only whitespace so far; keep buffering
        return;
    }

    // the buffered text belongs to this element and is now consumed
    textBuffer = null;

    if( asset == null || record == null ) {
        // no record is open, so there is nothing to attach the text to
        return;
    }

    try {
        createPartForElement( elementName, text );
    } catch( org.osid.repository.RepositoryException re ) {
        LOG.warn( "populateAssetFromText() failed to " +
                "create new Part.", re );
    }
}

/**
 * Starts a new Asset/Record pair. On failure both are left null so that the
 * rest of the record is skipped instead of being merged into the previous
 * Asset.
 */
private void beginRecord() {
    asset = null;
    record = null;
    // text seen before the record opened does not belong to it
    textBuffer = null;

    try {
        // create a new asset... need title, description, assetId
        asset = new Asset( null, null, getId(), repositoryId );

        // create a new record
        record = asset.createRecord( recordStructureId );
    } catch( org.osid.repository.RepositoryException re ) {
        LOG.warn( "populateAssetFromText() failed to " +
                "create new Asset/Record pair.", re );
        asset = null;
        record = null;
    }
}

/**
 * Post-processes the record that just ended (retrieval date, fields derived
 * from the in-line citation, preferred URL) and queues its Asset.
 */
private void finishRecord() {
    textBuffer = null;

    if( asset == null || record == null ) {
        // creation failed at record start; nothing to finish or queue
        asset = null;
        record = null;
        preferredUrl = null;
        preferredUrlFormat = null;
        return;
    }

    // set dateRetrieved
    setDateRetrieved();

    // use inLineCitation to fill in other fields, if possible
    try {
        org.osid.repository.Part inLineCitation = recordHasPart(
                InLineCitationPartStructure.getInstance().getType() );
        if( inLineCitation != null ) {
            doRegexParse( ( String ) inLineCitation.getValue() );
        }
    } catch( org.osid.repository.RepositoryException re ) {
        LOG.warn( "populateAssetFromText() failed to " +
                "gracefully process inLineCitation value.", re );
    }

    // create a preferred URL: the URL must be present and its format must
    // be HTML or unspecified
    try {
        if( preferredUrl != null ) {
            if( preferredUrlFormat == null ||
                    preferredUrlFormat.equalsIgnoreCase( "HTML" ) ) {
                record.createPart(
                        PreferredUrlPartStructure.getInstance().getId(),
                        preferredUrl );
            } else {
                LOG.debug( "Unexpected URL format: " + preferredUrlFormat );
            }
        }
    } catch( org.osid.repository.RepositoryException exception ) {
        LOG.warn( "Failed to create preferred URL Part", exception );
    } finally {
        preferredUrl = null;
        preferredUrlFormat = null;
    }

    // All done with this asset
    assetQueue.add( asset );

    // no record is open until the next "record" element starts
    asset = null;
    record = null;
}

/**
 * Stores the text of one element on the current Asset or Record. Element
 * names that are not recognised are ignored.
 */
private void createPartForElement( String elementName, String text )
        throws org.osid.repository.RepositoryException {
    if( elementName.equals( "title" ) ) {
        asset.updateDisplayName( text );
    } else if( elementName.equals( "abstract" ) ) {
        asset.updateDescription( text );
    } else if( elementName.equals( "author" ) ) {
        record.createPart( CreatorPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "date" ) ) {
        record.createPart( DatePartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "doi" ) ) {
        record.createPart( DOIPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "edition" ) ) {
        record.createPart( EditionPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "inLineCitation" ) ) {
        record.createPart( InLineCitationPartStructure.getInstance().getId(),
                text );
    } else if( elementName.equals( "isnIdentifier" ) ) {
        record.createPart( IsnIdentifierPartStructure.getInstance().getId(),
                text );
    } else if( elementName.equals( "issue" ) ) {
        record.createPart( IssuePartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "language" ) ) {
        record.createPart( LanguagePartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "note" ) ) {
        record.createPart( NotePartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "openUrl" ) ) {
        record.createPart( OpenUrlPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "pages" ) ) {
        createPagesPart( text );
    } else if( elementName.equals( "publisherInfo" ) ) {
        record.createPart( PublisherPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "rights" ) ) {
        record.createPart( RightsPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "sourceTitle" ) ) {
        record.createPart( SourceTitlePartStructure.getInstance().getId(),
                text );
    } else if( elementName.equals( "subject" ) ) {
        record.createPart( SubjectPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "type" ) ) {
        record.createPart( TypePartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "url" ) ) {
        record.createPart( URLPartStructure.getInstance().getId(), text );
        // remembered for the preferred URL decision when the record ends
        preferredUrl = text;
    } else if( elementName.equals( "urlLabel" ) ) {
        record.createPart( URLLabelPartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "urlFormat" ) ) {
        record.createPart( URLFormatPartStructure.getInstance().getId(), text );
        preferredUrlFormat = text;
    } else if( elementName.equals( "volume" ) ) {
        record.createPart( VolumePartStructure.getInstance().getId(), text );
    } else if( elementName.equals( "volumeIssue" ) ) {
        // combined field: let the citation expressions pick it apart
        doRegexParse( text );
    } else if( elementName.equals( "year" ) ) {
        record.createPart( YearPartStructure.getInstance().getId(), text );
    }
}
/**
 * Adds a dateRetrieved Part holding today's date as year-month-day, with
 * month and day zero-padded to two digits. A failure to create the Part is
 * logged and otherwise ignored.
 */
private void setDateRetrieved() {
    java.util.Calendar today = new java.util.GregorianCalendar();
    int year = today.get( java.util.Calendar.YEAR );
    int month = today.get( java.util.Calendar.MONTH ) + 1;  // MONTH is 0-based
    int day = today.get( java.util.Calendar.DATE );

    StringBuilder stamp = new StringBuilder();
    stamp.append( year ).append( "-" );
    if( month < 10 ) {
        stamp.append( "0" );
    }
    stamp.append( month ).append( "-" );
    if( day < 10 ) {
        stamp.append( "0" );
    }
    stamp.append( day );

    try {
        record.createPart( DateRetrievedPartStructure.getInstance().getId(),
                stamp.toString() );
    } catch( org.osid.repository.RepositoryException failure ) {
        LOG.warn( "setDateRetrieved() failed creating new dateRetrieved Part.",
                failure );
    }
}
/**
 * Looks through the Parts of the current record for the first one whose
 * PartStructure has the given Type.
 *
 * @param partStructureType PartStructure Type of the Part being sought
 * @return the first matching Part, or null when the record has none or its
 *         Parts could not be read (the failure is logged)
 */
private org.osid.repository.Part recordHasPart(
        org.osid.shared.Type partStructureType ) {
    org.osid.repository.Part found = null;

    try {
        org.osid.repository.PartIterator parts = record.getParts();
        // stop at the first match
        while( found == null && parts.hasNextPart() ) {
            org.osid.repository.Part candidate = parts.nextPart();
            if( candidate.getPartStructure().getType().isEqual(
                    partStructureType ) ) {
                found = candidate;
            }
        }
    } catch( org.osid.repository.RepositoryException failure ) {
        LOG.warn( "recordHasPart() failed getting Parts.", failure );
    }

    return found;
}
/**
 * This method does its best to map data contained in an inLineCitation to
 * other fields (volume, issue, date, pages, source title) in the case that
 * the current record does not have them yet.
 *
 * The citation is compared with the expressions loaded from
 * REGULAR_EXPRESSION_FILE; the name of the first expression that matches
 * selects the extraction rules below. Adding a new regular expression
 * entails adding a new case for parsing in this method. Nothing happens
 * when the citation is null, no expressions are loaded, or none matches.
 * Values that come out empty are not stored.
 *
 * @param citation inLineCitation to be parsed
 */
private void doRegexParse( String citation ) {
    if( citation == null || regexArray == null ) {
        return;
    }

    // identify the citation format: the first matching expression wins
    String regexName = null;
    for( int i = 0; i < regexArray.size(); i++ ) {
        CitationRegex citationRegex = ( CitationRegex ) regexArray.get( i );
        if( findFirst( citationRegex.getRegex(), citation ) != null ) {
            regexName = citationRegex.getName();
            break;
        }
    }
    if( regexName == null ) {
        return;
    }

    try {
        // determine which fields are still missing
        boolean hasVolume = recordHasPart(
                VolumePartStructure.getInstance().getType() ) != null;
        boolean hasIssue = recordHasPart(
                IssuePartStructure.getInstance().getType() ) != null;
        boolean hasDate = recordHasPart(
                DatePartStructure.getInstance().getType() ) != null;
        boolean hasPages = recordHasPart(
                PagesPartStructure.getInstance().getType() ) != null;
        boolean hasSourceTitle = recordHasPart(
                SourceTitlePartStructure.getInstance().getType() ) != null;

        // if all true, no need to go further
        if( hasVolume && hasIssue && hasDate && hasPages && hasSourceTitle ) {
            return;
        }

        if( regexName.equals( "zooRec" ) ) {
            // .+ \d+(\(\d+\))?, (.*)? \d{4}: \d+-\d+
            if( !hasVolume ) {
                addVolumePart( findFirst( "\\d+", citation ) );
            }
            if( !hasIssue ) {
                addIssuePart( digitsOnly( findFirst( "\\(\\d+\\)", citation ) ) );
            }
            if( !hasDate ) {
                // drop the leading ", " and the trailing ":"
                addDatePart( clip( findFirst( ", (.*)? \\d{4}:", citation ), 2, 1 ) );
            }
            if( !hasPages ) {
                addPagesPart( findFirst( "\\d+-\\d+", citation ) );
            }
            if( !hasSourceTitle ) {
                // drop the first digit and the character before it
                addSourceTitlePart( clip( findFirst( "\\D+\\d", citation ), 0, 2 ) );
            }
        } else if( regexName.equals( "animBehavAbs" ) ) {
            // .+ Vol\. \d+, no\. \d+, (\d+)? pp\.|p\. \d+(-\d+.)? (.*)? \d{4}\.$
            if( !hasVolume ) {
                addVolumePart( digitsOnly( findFirst( "Vol\\. \\d+", citation ) ) );
            }
            if( !hasIssue ) {
                addIssuePart( digitsOnly( findFirst( "no\\. \\d+", citation ) ) );
            }
            if( !hasDate ) {
                String tail = findFirst(
                        "(pp\\.|p\\.) \\d+(-\\d+\\.)? (.*)? \\d{4}\\.$", citation );
                if( tail != null ) {
                    // the date starts after the first space at or beyond
                    // index 4 and ends before the final "."
                    addDatePart( tail.substring( tail.indexOf( " ", 4 ) + 1,
                            tail.length() - 1 ) );
                }
            }
            if( !hasPages ) {
                addPagesPart( findFirst( "(pp\\.|p\\.) \\d+(-\\d+\\.)?", citation ) );
            }
            if( !hasSourceTitle ) {
                // drop the trailing " ["
                addSourceTitlePart( clip( findFirst( ".+ \\[", citation ), 0, 2 ) );
            }
        } else if( regexName.equals( "pubMed" ) ) {
            // .+ (Volume: \\d+, )?Issue: ((\\d+)|(\\w+)), Date: \\d{4} \\d+ \\d+,( Pages: \\d+-\\d+)?
            if( !hasVolume ) {
                addVolumePart( digitsOnly( findFirst( "Volume: \\d+", citation ) ) );
            }
            if( !hasIssue ) {
                // drop the leading "Issue: "
                addIssuePart( clip(
                        findFirst( "Issue: ((\\d+)|(\\w+))", citation ), 7, 0 ) );
            }
            if( !hasDate ) {
                // drop the leading "Date: " and join the pieces with "-"
                String date = clip(
                        findFirst( "Date: \\d{4} \\d+ \\d+", citation ), 6, 0 );
                if( date != null ) {
                    addDatePart( date.replaceAll( "\\s", "-" ) );
                }
            }
            if( !hasPages ) {
                addPagesPart( findFirst( "\\d+-\\d+", citation ) );
            }
            if( !hasSourceTitle ) {
                // drop the trailing ". Vol"
                addSourceTitlePart( clip( findFirst( ".+\\. Vol", citation ), 0, 5 ) );
            }
        } else if( regexName.equals( "isiWos" ) ) {
            // ^\d+( \(\d+\))?: \w+-.+(.+)?( \w{3})?( \w{3}-\w{3})?( \d+)? \d{4}$
            if( !hasVolume ) {
                addVolumePart( findFirst( "^\\d+", citation ) );
            }
            if( !hasIssue ) {
                addIssuePart( digitsOnly( findFirst( "\\(\\d+\\)", citation ) ) );
            }
            if( !hasDate ) {
                addDatePart( trimmed( findFirst(
                        "( \\w{3})?( \\w{3}-\\w{3})?( \\d+)? \\d{4}$", citation ) ) );
            }
            if( !hasPages ) {
                addPagesPart( trimmed( findFirst( " \\w+(-\\w+)?", citation ) ) );
            }
        } else if( regexName.equals( "jstor" ) ) {
            // .+, Vol\. \d+(, No\. \d+)?
            if( !hasVolume ) {
                addVolumePart( digitsOnly( findFirst( "Vol\\. \\d+", citation ) ) );
            }
            if( !hasIssue ) {
                addIssuePart( digitsOnly( findFirst( "No\\. \\d+", citation ) ) );
            }
            if( !hasSourceTitle ) {
                // drop the trailing ", Vol"
                addSourceTitlePart( clip( findFirst( ".+, Vol", citation ), 0, 5 ) );
            }
        } else if( regexName.equals( "eric" ) ) {
            // ^v\d+ n|v\d+ p\d+-\d+( \w{3})?( \w{3}-\w{3})?( \d+)? \d{4}$
            if( !hasVolume ) {
                addVolumePart( digitsOnly( findFirst( "^v\\d+", citation ) ) );
            }
            if( !hasIssue ) {
                addIssuePart( digitsOnly( findFirst( " (n|v)\\d+", citation ) ) );
            }
            if( !hasDate ) {
                addDatePart( trimmed( findFirst(
                        "( \\w{3})?( \\w{3}-\\w{3})?( \\d+)? \\d{4}$", citation ) ) );
            }
            if( !hasPages ) {
                addPagesPart( findFirst( "\\d+-\\d+", citation ) );
            }
        } else if( regexName.equals( "proquest" ) ) {
            // ^\d+; \d+(; .+)?
            if( !hasVolume ) {
                addVolumePart( findFirst( "^\\d+", citation ) );
            }
            if( !hasIssue ) {
                addIssuePart( digitsOnly( findFirst( "; \\d+", citation ) ) );
            }
            if( !hasSourceTitle ) {
                // drop the leading "; "
                addSourceTitlePart( clip( findFirst( "; \\D+$", citation ), 2, 0 ) );
            }
        } else if( regexName.equals( "psycInfo" ) ) {
            // ^Vol \d+\([\w\p{Punct}]+\))
            if( !hasVolume ) {
                addVolumePart( digitsOnly( findFirst( "^Vol \\d+", citation ) ) );
            }
            if( !hasIssue ) {
                // drop the surrounding parentheses
                addIssuePart( clip( findFirst( "\\(.+\\)", citation ), 1, 1 ) );
            }
        }
    } catch( org.osid.repository.RepositoryException re ) {
        LOG.warn( "doRegexParse() failed getting " +
                "PartStructure Types.", re );
    }
}

/** Returns the first substring of text matching regex, or null if none. */
private static String findFirst( String regex, String text ) {
    Matcher matcher = Pattern.compile( regex ).matcher( text );
    return matcher.find() ? matcher.group() : null;
}

/** Removes every non-digit character; null stays null. */
private static String digitsOnly( String value ) {
    return ( value == null ) ? null : value.replaceAll( "\\D", "" );
}

/** Trims surrounding whitespace; null stays null. */
private static String trimmed( String value ) {
    return ( value == null ) ? null : value.trim();
}

/**
 * Drops skipStart characters from the front and dropEnd characters from
 * the back of value; null stays null.
 */
private static String clip( String value, int skipStart, int dropEnd ) {
    return ( value == null ) ? null :
            value.substring( skipStart, value.length() - dropEnd );
}

/** Creates a volume Part on the current record unless value is null/empty. */
private void addVolumePart( String value )
        throws org.osid.repository.RepositoryException {
    if( value != null && value.length() > 0 ) {
        record.createPart( VolumePartStructure.getInstance().getId(), value );
    }
}

/** Creates an issue Part on the current record unless value is null/empty. */
private void addIssuePart( String value )
        throws org.osid.repository.RepositoryException {
    if( value != null && value.length() > 0 ) {
        record.createPart( IssuePartStructure.getInstance().getId(), value );
    }
}

/** Creates a date Part on the current record unless value is null/empty. */
private void addDatePart( String value )
        throws org.osid.repository.RepositoryException {
    if( value != null && value.length() > 0 ) {
        record.createPart( DatePartStructure.getInstance().getId(), value );
    }
}

/** Creates the pages Parts via createPagesPart unless value is null/empty. */
private void addPagesPart( String value )
        throws org.osid.repository.RepositoryException {
    if( value != null && value.length() > 0 ) {
        createPagesPart( value );
    }
}

/** Creates a source title Part unless value is null/empty. */
private void addSourceTitlePart( String value )
        throws org.osid.repository.RepositoryException {
    if( value != null && value.length() > 0 ) {
        record.createPart( SourceTitlePartStructure.getInstance().getId(),
                value );
    }
}
/**
 * Creates a pages Part from the given text and, where possible, start page
 * and end page Parts derived from it.
 *
 * The text is split on "-". The first piece, reduced to its digits, becomes
 * the start page; when there are exactly two pieces the second, reduced to
 * its digits, becomes the end page. A piece without any digit produces no
 * Part. Null or empty text, and text starting with a comma (a poorly
 * formatted field), is ignored altogether.
 *
 * @param text page information, e.g. "12-15" or "pp. 12-15."
 * @throws org.osid.repository.RepositoryException if a Part cannot be created
 */
private void createPagesPart( String text )
        throws org.osid.repository.RepositoryException {
    if( text == null || text.length() == 0 || text.charAt( 0 ) == ',' ) {
        // nothing usable, or a poorly formatted field
        return;
    }

    record.createPart( PagesPartStructure.getInstance().getId(), text );

    // get start and end page if possible
    String[] pages = text.split( "-" );
    if( pages.length == 0 ) {
        // text consisted of separators only: cannot create start/end page
        return;
    }

    // delete all non-digit chars (ie: p., pp., whitespace, etc.)
    String spage = pages[ 0 ].replaceAll( "\\D", "" );
    if( spage.length() > 0 ) {
        record.createPart( StartPagePartStructure.getInstance().getId(),
                spage );
    }

    // end page
    if( pages.length == 2 ) {
        String epage = pages[ 1 ].replaceAll( "\\D", "" );
        if( epage.length() > 0 ) {
            record.createPart( EndPagePartStructure.getInstance().getId(),
                    epage );
        }
    }
}
/**
 * Builds an identifier for a new Asset: the text "asset" followed by a
 * random value in [0, 1000) and the current time in milliseconds.
 */
private String getId() {
    StringBuilder id = new StringBuilder( "asset" );
    id.append( Math.random() * 1000 );
    id.append( System.currentTimeMillis() );
    return id.toString();
}
}