package org.solrmarc.mixin;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.marc4j.marc.DataField;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.marc.VariableField;
import org.solrmarc.index.SolrIndexerMixin;
import org.solrmarc.index.indexer.IndexerSpecException;
import org.solrmarc.tools.DataUtil;
public class VideoInfoMixin extends SolrIndexerMixin
{
Pattern releaseDatePattern = null;
boolean isVideo = true;
public void perRecordInit(Record record)
{
isVideo = (record.getLeader().getTypeOfRecord() == 'g');
}
/**
* Extract the runtime of a video item from characters 18-20 of the 008 field
*
* @param record the record being processed
* @return String representing the runtime in minutes for the video (or null)
*/
public String getVideoRunTime(Record record)
{
if (isVideo)
{
String runtime = indexer.getFirstFieldVal(record, null, "008[18-20]");
if (runtime != null && runtime.matches("[0-9][0-9][0-9]"))
{
return(runtime.replaceAll("^0*", ""));
}
}
return(null);
}
/**
* Extract the targetAudience of a video item from the 521a subfield (if present)
*
* @param record the record being processed
* @return String representing the targetAudience for the video (or null)
*/
public String getVideoTargetAudience(Record record)
{
if (isVideo)
{
Set<String> target = indexer.removeTrailingPunct(record, "521a");
if (target == null || target.size() == 0)
{
return (null);
}
return(target.iterator().next());
}
return(null);
}
/**
* Attempt to heuristically determine the Rating of a video item from the 521a subfield (if present)
*
* @param record the record being processed
* @return String a normalized String representing the "Rating" of the video item
*/
public String getVideoRating(Record record)
{
if (isVideo)
{
Set<String> target = indexer.removeTrailingPunct(record, "521a");
if (target == null || target.size() == 0)
{
return ("None Listed");
}
String ratingString = target.iterator().next();
String rating = getRating(ratingString);
return(rating);
}
return(null);
}
/**
* Routine that actually does the work to heuristically determine the Rating of a video item from the 521a subfield (if present)
*
* @param ratingString the value of the 521a subfield extracted from the record being processed
* @return String a normalized String representing the "Rating" of the video item
*/
private String getRating(String ratingString)
{
String rating = "Can't Determine";
if (ratingString.matches(".*PG[- ]?13.*")) rating = "Rated: PG-13";
else if (ratingString.matches(".*TV[- ]?14.*")) rating = "Rated: TV-14";
else if (ratingString.matches(".*TV[- ]?G.*")) rating = "Rated: TV-G";
else if (ratingString.matches(".*TV[- ]?PG.*")) rating = "Rated: TV-PG";
else if (ratingString.matches(".*TV[- ]?Y7.*")) rating = "Rated: TV-Y7";
else if (ratingString.matches(".*TV[- ]?Y.*")) rating = "Rated: TV-Y";
else if (ratingString.matches(".*TV[- ]?MA.*")) rating = "Rated: TV-MA";
else if (ratingString.matches(".*NC[- ]?17.*")) rating = "Rated: NC-17";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*UR.*")) rating = "Unrated";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*NR.*")) rating = "Not Rated";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*13([ ]?(and )?)[Uu][Pp].*")) rating = "Rated: PG-13";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*X[^A-Za-z].*")) rating = "Rated: X ";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*M[^A-Za-z].*")) rating = "Rated: M";
else if (ratingString.matches(".*[Rr]at(ed|ing)?[^A-Za-z]*R[^A-Za-z].*")) rating = "Rated: R";
else if (ratingString.matches(".*[Rr]at(ed|ing)?[^A-Za-z]*R")) rating = "Rated: R";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*PG.*")) rating = "Rated: PG";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*18.*")) rating = "Rated: 18+ years";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*T[^A-Za-z].*")) rating = "Rated: T";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*T")) rating = "Rated: T";
else if (ratingString.matches(".*R [Rr]at(ed|ing).*")) rating = "Rated: R";
else if (ratingString.matches(".*: R[^A-Za-z].*")) rating = "Rated: R";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*G[^A-Za-z].*")) rating = "Rated: G";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*U[^A-Za-z].*")) rating = "Rated: G";
else if (ratingString.matches(".*[Rr]at(ed|ing)[^A-Za-z]*G")) rating = "Rated: G";
else if (ratingString.matches(".*[Uu]n[-]?rated.*")) rating = "Unrated";
else if (ratingString.matches(".*PG.*")) rating = "Rated: PG";
else if (ratingString.matches(".*NR.*")) rating = "Not Rated";
else if (ratingString.matches(".*[Nn]ot [Rr]ated.*")) rating = "Not Rated";
else if (ratingString.matches(".*[Gg]rade[s]?( level)?[^0-9A-Za-z]*[K0-9]+.*"))
{
rating = ratingString.replaceAll(".*[Gg]rade[s]?( level)?[^0-9A-Za-z]*([K0-9]+).*", "Rated: $2+ grade" );
}
else if (ratingString.matches(".*[Nn]o[t]? (be )?[Rr]ecom[m]?[ea]nd[^0-9]*[0-9]+.*"))
{
rating = ratingString.replaceAll(".*[Nn]o[t]? (be )?[Rr]ecom[m]?[ea]nd[^0-9]*([0-9]+).*", "Rated: $2+ years" );
}
else if (ratingString.matches(".*([Ss]uitable|[Rr]ecommended|[Ii]tended|[Ss]uggested).*[Ff]or[^0-9]*[0-9]+.*"))
{
rating = ratingString.replaceAll(".*([Ss]uitable|[Rr]ecommended|[Ii]tended|[Ss]uggested).*[Ff]or[^0-9]*([0-9]+).*", "Rated: $2+ years" );
}
else if (ratingString.matches(".*[Rr]estricted.*[Tt]o[^0-9]*[0-9]+.*"))
{
rating = ratingString.replaceAll(".*[Rr]estricted.*[Tt]o[^0-9]*([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches(".*[Mm]ayores [Dd]e[^0-9]*[0-9]+.*"))
{
rating = ratingString.replaceAll(".*[Mm]ayores [Dd]e[^0-9]*([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches(".*[Ff]reigegeben [Aa]b[^0-9]*[0-9]+.*"))
{
rating = ratingString.replaceAll(".*[Ff]reigegeben [Aa]b[^0-9]*([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches(".*[Jj]unior.*([Hh]igh.*|[Aa]dult|[Cc]ollege).*")) rating = "Rated: Junior High+";
else if (ratingString.matches(".*[Hh]igh.*([Ss]chool|[Aa]dult|[Cc]ollege).*")) rating = "Rated: High School+";
else if (ratingString.matches(".*([Cc]ollege).*")) rating = "Rated: College+";
else if (ratingString.matches(".*[Oo]ver [0-9]+ [Yy]ears.*"))
{
rating = ratingString.replaceAll(".*[Oo]ver ([0-9]+) [Yy]ears.*", "Rated: $1+ years" );
}
else if (ratingString.matches(".*[Aa]ge[sd]? [0-9]+.*([Aa]bove|[Aa]dult|[Oo]lder|[Oo]ver|[Uu]p).*"))
{
rating = ratingString.replaceAll(".*[Aa]ge[sd]? ([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches("[0-9]+(years)?.*([Aa]dult|[Oo]lder|[Oo]ver|[Uu]p).*"))
{
rating = ratingString.replaceAll("([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches(".* [0-9]+(years)?.*([^A-Za-z])([Aa]dult|[Oo]lder|[Oo]ver|[Uu]p).*"))
{
rating = ratingString.replaceAll(".* ([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches(".*[Ss]uggested[^0-9]*[0-9]*[+]"))
{
rating = ratingString.replaceAll(".*[Ss]uggested[^0-9]*([0-9]+).*", "Rated: $1+ years" );
}
else if (ratingString.matches("^General.*")) rating = "Rated: G";
else if (ratingString.matches("^Ge.?ne.?ral.*")) rating = "Rated: G";
else if (ratingString.matches(".*: General.*")) rating = "Rated: G";
else if (ratingString.matches(".*Universal.*")) rating = "Rated: G";
else if (ratingString.matches(".*[Pp]er [Tt]utti.*")) rating = "Rated: G";
else if (ratingString.matches("^[\"]?Restricted.*")) rating = "Rated: R";
else if (ratingString.matches("^R[^A-Za-z].*")) rating = "Rated: R";
else if (ratingString.matches(".*(:|[Ff]or) [Aa]dult.*")) rating = "Rated: Adult";
else if (ratingString.matches("^Adult.*")) rating = "Rated: Adult";
else if (ratingString.matches("^Mature.*")) rating = "Rated: Mature";
else if (ratingString.matches(".*(:|[Ff]or) [Mm]ature.*")) rating = "Rated: Mature";
else if (ratingString.matches(".*14A.*")) rating = "Rated: 14A";
else if (ratingString.matches(".*15A.*")) rating = "Rated: 15A";
else if (ratingString.matches(".*18A.*")) rating = "Rated: 18A";
else if (ratingString.matches(".*[0-9]+.*([Uu]p|[Oo]lder|[+])A.*"))
{
rating = ratingString.replaceAll(".*([0-9]+).*75k", "Rated: $1+ years" );
}
else if (ratingString.matches(".*[Vv]iewer [Dd]iscretion.*")) rating = "Discretion Advised";
else if (ratingString.matches(".*[Pp]arental ([Dd]iscretion|[Gg]uidance).*")) rating = "Discretion Advised";
else if (ratingString.matches(".*[Ss]uitable.*[Ff]or.*([Gg]eneral|[Aa]ll).*") &&
!ratingString.matches("[Nn]ot.*[Ss]uitable.*[Ff]or.*([Gg]eneral|[Aa]ll).*")) rating = "Rated: G";
else if (ratingString.matches(".*[Aa]ll [Aa]ge[s]?.*")) rating = "Rated: G";
else if (ratingString.matches(".*G [Rr]at(ed|ing).*")) rating = "Rated: G";
return(rating);
}
/**
* Attempt to heuristically determine the Director of a video item based on looking in the 245c, subfield, the 508a subfield and 700 fields
*
* @param record the record being processed
* @return Set<String> a set of Strings representing the name(s) of the director(s) of the video (or an empty set, if none can be found)
*/
public Set<String> getVideoDirector(Record record)
{
Set<String> result = new LinkedHashSet<String>();
if (isVideo)
{
String responsibility = indexer.getFirstFieldVal(record, null, "245c");
if (responsibility != null)
{
Set<String> directors = getVideoDirectorsFromTextField(responsibility, true);
result.addAll(directors);
}
Set<String> credits = indexer.getFieldList(record, "508a");
for (String credit : credits)
{
Set<String> directors = getVideoDirectorsFromTextField(credit, true);
result.addAll(directors);
}
if (result.size() == 0)
{
Set<String> notes = indexer.getFieldList(record, "500a:505a:505t");
for (String note : notes)
{
if (note.contains("direct") || note.contains("Direct"))
{
Set<String> directors = getVideoDirectorsFromTextField(note, false);
result.addAll(directors);
}
}
String subtitle = indexer.getFirstFieldVal(record, null, "245b");
if (subtitle != null && (subtitle.contains("direct") || subtitle.contains("Direct")))
{
addError(new IndexerSpecException("Director information erroneously included in the 245b subtitle field"));
Set<String> directors = getVideoDirectorsFromTextField(subtitle, false);
result.addAll(directors);
}
String medium = indexer.getFirstFieldVal(record, null, "245h");
if (medium != null && (medium.contains("direct") || medium.contains("Direct")))
{
addError(new IndexerSpecException("Director information erroneously included in the 245h medium field"));
Set<String> directors = getVideoDirectorsFromTextField(medium, false);
result.addAll(directors);
}
}
List<VariableField> personalNames = record.getVariableFields("700");
for (VariableField vf : personalNames)
{
DataField df = (DataField)vf;
// this could be overly broad, and could also grab music directors or other roles containing the word "director"
if (ChkSubfield(df, '4', "drt") || ChkSubfield(df, 'c', ".*director.*") || ChkSubfield(df, 'e', ".*direct.*"))
{
String name = df.getSubfield('a').getData();
name = DataUtil.cleanData(name);
name = name.replaceAll("([A-Z][^,]*),[ ]?(.*)", "$2 $1");
result.add(name);
}
}
}
return(result);
}
private boolean ChkSubfield(DataField df, char c, String pattern)
{
List<Subfield> sfs = df.getSubfields(c);
for (Subfield sf : sfs)
{
if (sf.getData().matches(pattern)) return(true);
}
return false;
}
/**
* Routine that actually does the work to heuristically determine the Director of a video based on the text of the string passed in
*
* @param ratingString the value of 245c subfield or a 508a subfield extracted from the record being processed
* @return Set<String> a set of Strings representing the name(s) of the director(s) of the video (or an empty set, if none can be found)
*/
public static Set<String> getVideoDirectorsFromTextField(String responsibility, boolean greedy)
{
// First do some initital processing on the passed in string
boolean reverseName = false;
Set<String> result = new LinkedHashSet<String>();
Set<String> squeezedresult = new LinkedHashSet<String>();
responsibility = responsibility.replaceAll("\\[sic[.]?[]]", "");
responsibility = responsibility.replaceAll("([a-z][a-z][a-z])[.]", "$1;");
responsibility = responsibility.replaceAll("([a-z][a-z])[.] ", "$1; ");
responsibility = responsibility.replaceAll("direção de produção", "producer");//porteguese
responsibility = responsibility.replaceAll("direct(or|ion|eur) de (la )?[ ]?produc[ct]i(ó|o)n", "producer");//french/spanish
responsibility = responsibility.replaceAll("direcci(ó|o)n art\\B*", "artguy");//spanish
responsibility = responsibility.replaceAll("produit par\\b", "produced by");
String responsibility1 = responsibility.replaceAll("[Rr]eg(i|í)[ea]?\\b", "didrector");//german/italian/swedish
responsibility1 = responsibility1.replaceAll("[Rr]eggia\\b", "didrector");//spanish
responsibility1 = responsibility1.replaceAll("[Rr]e(ž|z)ie", "didrector");//czech
responsibility1 = responsibility1.replaceAll("[Rr]e(ż|z|a)yseria", "didrector");//polish
responsibility1 = responsibility1.replaceAll("[Dd]irecci(ó|o)n", "didrector");//spanish
responsibility1 = responsibility1.replaceAll("[Rr](é|e)alisation", "didrection");//french
responsibility1 = responsibility1.replaceAll("[Rr]ealizaci(ó|o)n", "didrection");//spanish
responsibility1 = responsibility1.replaceAll("[Rr]éalisé( et [a-z]*)? (par|by)", "didrected$1 by");//french
responsibility1 = responsibility1.replaceAll("[Dd]irected por", "didrected by");//spanish
responsibility1 = responsibility1.replaceAll("[Dd]irigé par", "didrected by");//french
responsibility1 = responsibility1.replaceAll("[Dd]irreción", "didrection");//spanish
responsibility1 = responsibility1.replaceAll("[Dd]ireccíon", "didrection");//spanish
responsibility1 = responsibility1.replaceAll("[Dd]irector(a|es)", "didrector");//spanish
responsibility1 = responsibility1.replaceAll("[Dd]ire(ç|c)ão", "didrector");//porteguese
responsibility1 = responsibility1.replaceAll("[Dd]iretto da", "didrector");//italian
responsibility1 = responsibility1.replaceAll("[Dd]irecteur", "didrector");//french
responsibility1 = responsibility1.replaceAll("[Dd]irect(e|io)r", "didrector");//typo
responsibility1 = responsibility1.replaceAll("[Dd]irigid[oa]", "didrector");//porteguese
responsibility1 = responsibility1.replaceAll("[Tt]asriṭ u-vimui", "didrector");//hebrew
responsibility1 = responsibility1.replaceAll("[Ii]khr(ā|a)j", "didrector");//arabic
responsibility1 = responsibility1.replaceAll("[Rr]ezhisser[a]?", "didrector");//russian
responsibility1 = responsibility1.replaceAll("[Yy]öneten", "didrector");//turkish
responsibility1 = responsibility1.replaceAll("[Nn]irdeśaka", "didrector");//hindi
responsibility1 = responsibility1.replaceAll("[Pp]ostanovka", "didrector");//russian
responsibility1 = responsibility1.replaceAll("(un )?film[e]? d[ei]\\b", "a flim by");//french
responsibility1 = responsibility1.replaceAll("un film d'", "a flim by ");//french
responsibility1 = responsibility1.replaceAll("(an|en) film av", "a flim by"); //swedish
responsibility1 = responsibility1.replaceAll("[Ee]in [Ff]ilm von", "a flim by"); //german
responsibility1 = responsibility1.replaceAll("un(e|a) pel(í|i)cula de", "a flim by");//spanish
responsibility1 = responsibility1.replaceAll("[Mm]is[e]? en sc(e|è)ne( de)?", "a flim by");//french
responsibility1 = responsibility1.replaceAll("Film by", "a flim by");
//responsibility1 = responsibility1.replaceAll("\\bpar\\b", "maybe by");
//responsibility1 = responsibility1.replaceAll("^by", "maybe by");
//responsibility1 = responsibility1.replaceAll("^von", "maybe by");
// if (!responsibility1.equals(responsibility))
// {
// responsibility = responsibility1;
// }
responsibility1 = handleASoAndSoFilm(responsibility1);
responsibility1 = responsibility1.replaceAll("[Dd]ao yan", "didrector");//chinese
responsibility1 = responsibility1.replaceAll("[Kk]antoku", "didrector");//japanese
responsibility1 = responsibility1.replaceAll("[Kk]amdok", "didrector");//korean
responsibility1 = responsibility1.replaceAll("[Yy]ŏnchʻul", "didrector");//korean
if (!responsibility1.equals(responsibility))
{
responsibility = responsibility1;
}
// Now split the string into subparts separated by ; (or -- or : )
for (int loop = 0; loop < 5; loop++)
{
if (loop == 3)
{
responsibility = responsibility.replaceAll("didrect", "direct");
responsibility = responsibility.replaceAll("a flim by", "a film by");
if (responsibility.matches(".*filmed[,a-z ]*by\\b.*"))
{
responsibility = responsibility.replaceAll("filmed", "directed");
}
}
String semiparts[] = responsibility.split(";|--| : ");
for (String part0 : semiparts)
{
String part = part0;
part = part.trim();
if (((loop == 0 || loop == 3) && part.matches(".*[Dd]irect(ed|or[s]?|ion)\\b.*")) ||
((loop == 1 || loop == 4) && part.matches(".*a film by.*"))) // ||
// (loop == 5 && part.matches(".*maybe by.*")) )
{
String trimmed;
// part = part.replaceAll("\\[sic[.][]]", "");
if (!greedy && part.matches(".*[Dd]irector[']?s.*"))
{
continue;
}
if (!greedy && part.matches(".*\"[^,\"]*[Dd]irect(ed by|or[s]?|ion)[^,\"]*\".*"))
{
part = part.replaceAll("\"[^\",]*?\"", "XXX");
}
// Try to split apart "brothers" ie. the Hughes Brothers becomes Albert Hughes and Allen Hughes
if (part.matches(".*(the )?[A-Z][^ ]* [Bb]rothers.*"))
{
String name = part.replaceFirst(".*(the )?([A-Z][^ ]*) [Bb]rothers.*", "$2");
if (part.matches(".*(the )?[A-Z][^ ]* [Bb]rothers(, | \\()[A-Z][a-z]* and [A-Z][a-z]* "+name+".*"))
{
part = part.replaceFirst("((the )?([A-Z][^ ]*) [Bb]rothers)(, | \\()([A-Z][a-z]*) and ([A-Z][a-z]*) "+name+"\\)?", "$5 $3 and $6 $3");
}
else if (part.matches(".*(the )?[A-Z][^ ]* [Bb]rothers(, | \\()[A-Z][a-z]* and [A-Z][a-z]*.*"))
{
part = part.replaceFirst("((the )?([A-Z][^ ]*) [Bb]rothers)(, | \\()([A-Z][a-z]*) and ([A-Z][a-z]*)\\)?", "$5 $3 and $6 $3");
}
else
{
for (String otherPart : semiparts)
{
if (otherPart.equals(part0)) continue;
if (otherPart.matches(".*[A-Z][^ ]* "+name+".*[A-Z][^ ]* "+name+".*"))
{
String names = otherPart.replaceFirst(".*([A-Z]([a-z]*|[.]) )+"+name+"[, ].*([A-Z]([a-z]*|[.]) )+"+name+"[^A-Za-z].*",
"$1"+name+" and $3"+name);
part = part.replaceFirst("(.*?)([Tt]he )?([A-Z][^ ]*) [Bb]rothers(.*)", "$1"+names+"$4");
break;
}
}
}
}
String part1 = part.replaceAll("[ ]?[(][ ]*([a-z]*[/]|assistant )?[Dd]irector([/][a-z]*)?[ ]*[)]", " director"); // change (director) to director
part1 = part1.replaceAll("[ ]?[(][^)]*[)]", ""); // throw away parenthetical phrases
if (!part.equals(part1))
{
part = part1;
}
if (part.matches(".*[Dd]irector[s]? (of|de|de la) ([Pp]h|[Ff])otogra(ph|f)(y|(i|í)[ea]).*"))
{
part = part.replaceFirst("[Dd]irector[s]? (of|de|de la) ([Pp]h|[Ff])otogra(ph|f)(y|(i|í)[ea])", "cinematographer");
}
if (part.matches(".*[Dd]irector[s]? (of|de|de la) ([Aa]nimation).*"))
{
part = part.replaceFirst("[Dd]irector[s]? (of|de|de la) ([Aa]nimation)", "supervising animator");
}
else if (part.matches(".*, [Dd]irector[s]? (of|for) .*"))
{
part = part.replaceFirst("[Dd]irector[s]? (of|for)((( ([A-Z][a-z]*|the|of|and|a))+)[,]?)+", "director");
}
else if (part.matches(".*[Dd]irector[s]?( and writer)? (of|for) .*"))
{
part = part.replaceFirst("[Dd]irector[s]?( and writer)? (of|for) [A-Za-z' ]*", "director");
}
if (part.contains("collaboration") || part.contains("participation"))
{
part = part.replaceAll("(in collaboration with)|(with the collaboration of)|(with the participation of)", "with");
}
if (part.matches(".*[Dd]irector .* director.*"))
{
part = part.replaceAll(".*([Dd]irector .*)director.*", "$1");
}
// Pattern matching when the subpart is of the form: Some Name Director
if (part.matches(".*[Dd]irector[^A-Z]*") && !part.matches(".*of the [Dd]irector.*"))
{
if (part.matches(".*([Aa]rt(istic)?|[Mm]usic(al)?|[Ss]tage|[Pp]roduction|[Pp]roject|[Pp]hotography|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Dd]ance|[Ee]diting) [Dd]irector.*" ) ||
part.matches(".*[Dd]irector[s]? ((of[ ]?(([Pp]hotography)))|(de (la )?fotograf(i|í)a)|(de arte)).*"))
continue;
part = part.replaceAll(" *[\\[]", ", ");
part = part.replaceAll("^\"", "");
part = part.replaceAll("[\\]]", "");
part = part.replaceAll("director.*", "director");
part = part.replaceAll(" [-A-Za-z/]*director[-A-Za-z]*", " director");
part = part.replaceAll(" [a-z/]+/director", " director");
part = part.replaceAll(" co-director", " director");
part = part.replaceAll(" [a-z ,]+ director", " director");
if (greedy)
part = part.replaceAll(".*: (([A-Z][A-Za-z.]* )*[A-Z][A-Za-z.]*)(, |,| )director", "$1, director");
else
part = part.replaceAll(".*? (([A-Z][A-Za-z.]* )*[A-Z][A-Za-z.]*)(, )?director", "$1, director");
part = part.replaceFirst(".* (of|by)", "by");
part = part.replaceAll(" (and|und|/) ", " & ");
part = part.replaceFirst("by ", "");
part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
part = part.replaceFirst("[,]?[ ]?director", "");
part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
part = part.replaceAll("[#]", ",");
String commaparts[] = part.split("[|]+");
for (String subpart : commaparts)
{
addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
}
}
else if (!greedy && part.matches(".*, [Dd]irector (of|for|on).*"))
{
continue;
}
// Pattern matching when the subpart is of the form: Directed by Some Name
else if (part.matches(".*[Dd]irect(ed|ion).*?by.*")|| part.matches(".*a film by.*") || part.matches(".*maybe by.*"))
{
if (part.matches(".*([Aa]rt(istic)?|[Mm]usic(al)?|[Ss]tage|[Pp]roduction|[Pp]roject|[Pp]hotographic|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Dd]ance|[Ee]diting) [Dd]irection.*?by.*" ) ||
part.matches(".*[Dd]irector[s]? ((of[ ]?(([Pp]hotography)))|(de (la )?fotograf(i|í)a)|(de arte)).*"))
continue;
part = part.replaceFirst(".*[Dd]irect(ed|ion).*?by[]:,)]?[ ]?", "directified by ");
part = part.replaceFirst(".*a film by", "directified by ");
part = part.replaceFirst(".*maybe by", "directified by ");
part = part.replaceAll("/", " & ");
part = part.replaceAll("\\[|\\]", "");
part = part.replaceFirst("et al", "");
part = part.replaceAll(" (and|with|et|und) ", " & ");
part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
part = part.replaceAll("[.][.][.]", "");
part = part.replaceAll("([A-Z][^ .][^ .][^ .]+)[.].*", "$1");
part = part.replaceAll("brothers", "Brothers");
part = part.replaceAll("directified by[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*ʻ]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|zur |von |van |de[rl]?[ ]?|the |d[']|al-|da-)+).*", "$1");
part = part.replaceAll("^([A-Z][^ .]+) & ([A-Z][^ .,]+) ([A-Z][^ .,]+)", "$1 $3 & $2 $3");
part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
part = part.replaceAll("[#]", ",");
part = part.replaceAll("[ ][ ]+", " ");
part = part.replaceFirst("directified by", "");
String commaparts[] = part.split("[|]+");
for (String subpart : commaparts)
{
addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
}
}
else if (part.matches(".*[Dd]irected.*") || part.matches(".*[Dd]irected( (and|&) [-a-z]*)?,.*"))
{
part = part.replaceFirst(".*[Dd]irected( (and|&) [-a-z]*),","directified by");
part = part.replaceFirst(".*[Dd]irected[]:,)]?[ ]?", "directified by ");
part = part.replaceAll("/", " & ");
part = part.replaceAll("\\[|\\]", "");
part = part.replaceFirst("et al", "");
part = part.replaceAll(" (and|with|et|und) ", " & ");
part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
part = part.replaceAll("[.][.][.]", "");
part = part.replaceAll("([A-Z][^ .][^ .][^ .]+)[.].*", "$1");
part = part.replaceAll("brothers", "Brothers");
part = part.replaceAll("directified by[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*ʻ]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|zur |von |van |de[rl]?[ ]?|the |d[']|al-|da-)+).*", "$1");
part = part.replaceAll("^([A-Z][^ .]+) & ([A-Z][^ .,]+) ([A-Z][^ .,]+)", "$1 $3 & $2 $3");
part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
part = part.replaceAll("[#]", ",");
part = part.replaceAll("[ ][ ]+", " ");
part = part.replaceFirst("directified by", "");
String commaparts[] = part.split("[|]+");
for (String subpart : commaparts)
{
addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
}
}
// Pattern matching when the subpart is of the form: Director Some Name
else if (part.matches(".*[Dd]irector[^a-rt-z\'].*[A-Z].*"))
{
if (part.matches(".*([Aa]rt(istic)?|[Mm]usic(al)?|[Ss]tage|[Pp]roduction|[Pp]roject|[Pp]hotography|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Dd]ance|[Ee]diting) [Dd]irector.*" ) ||
part.matches(".*[Dd]irector[s]? ((of[ ]?(([Pp]hotography)))|(de (la )?fotograf(i|í)a)|(de arte)).*"))
continue;
part = part.replaceFirst("Executive", "executive");
part = part.replaceFirst("Writer", "writer");
part = part.replaceFirst("Story", "story");
part = part.replaceFirst("Producer", "producer");
part = part.replaceFirst("Produced", "produced");
part = part.replaceFirst(", English", ", english");
part = part.replaceFirst("Researcher", "researcher");
part = part.replaceFirst(".*?[Dd]irector", "director");
part = part.replaceAll("[ ]?([.][.][.])?[ ]?[\\[][^\\]]*[\\]]", "");
part = part.replaceAll("[]]", "");
part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
part = part.replaceAll("director[-A-Za-z/]*", "director");
part = part.replaceAll("director (for|and|of)( [A-Z][A-Za-z]*)+", "director");
if (!greedy)
part = part.replaceFirst("director( and [a-z]*)?, [a-z].*", "");
part = part.replaceFirst("director[^A-Z]* ([ʻ*]?[A-Z])", "director= $1");
part = part.replaceFirst("with the [a-z][A-Za-z ]*", "");
part = part.replaceFirst("et al", "");
if (!greedy)
part = part.replaceAll("[,]? and [a-z].*", "");
part = part.replaceAll(" (and|with|et) ", " & ");
part = part.replaceAll(",[ ]?[a-z].*", "");
part = part.replaceAll("= [^(]*[)], ", ": ");
if (greedy)
part = part.replaceAll("director=[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*ʻ]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|von |zur |van |de[rl]?[ ]?|the |in |d[']|al-|da-)+)[^|&]*", "$1");
else // strict
part = part.replaceAll("director=[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*ʻ]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|von |zur |van |de[rl]?[ ]?|the |in |d[']|al-|da-)+)( [a-z].*|$)", "$1");
part = part.replaceAll("^([A-Z][^ .]+) & ([A-Z][^ .]+) ([A-Z][^ .]+)", "$1 $3 & $2 $3");
part = part.replaceAll("([,][ ]?|[ ]?[|&][ ]?)", "|");
part = part.replaceAll("[#]", ",");
part = part.replaceAll("[ ][ ]+", " ");
String commaparts[] = part.split("[|]+");
for (String subpart : commaparts)
{
addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
}
}
// Pattern matching when the subpart is of the form: Direction Some Name
else if (part.matches(".*[Dd]irection.*"))
{
if (part.matches(".*([Aa]rt|[Mm]usic(al)?|[Ss]tage|[Pp]roject|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Oo]rchestra|[Ee]diting) [Dd]irection.*" )||
part.matches(".*[Dd]irection (of )?(de )?(la )?((f|ph)otogra(f|ph)ie|production|[Cc]in(é|e)matographie|artistique|art[e]?|musicale).*"))
continue;
part = part.replaceFirst(".*[Dd]irection[^A-Z]*", "direction: ");
part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
part = part.replaceAll(" (and|with|et) ", " & ");
part = part.replaceAll("[\\]]", "");
part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
part = part.replaceAll("[#]", ",");
part = part.replaceAll("[ ][ ]+", " ");
part = part.replaceFirst("direction: ", "");
String commaparts[] = part.split("[|]+");
for (String subpart : commaparts)
{
addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
}
}
}
}
if (result.size() > 0) break;
}
return(result);
}
private static String handleASoAndSoFilm(String responsibility)
{
String aOrAn = "\\ba[n]?[ ]+"; // matches a or an but has boundary marker so as to not match pa or man
String namePart = "(?:\\p{Lu}(?:\\p{L}|\\p{M}|[-'])*(?:\\p{Ll}|\\p{M}))"; // Example matches: Jadme-Lillo or Sánchez or SiCa or O'Malley
String initialOrNamePart = "(?:\\p{Lu}[.]|"+namePart+")"; // Example matches: B. or Jadme-Lillo or Sánchez or SiCa or O'Malley
String optionalSuffix = "(?:[,]? (?:Jr[.]?|Sr[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.]))?"; // Example matches: , Jr. or , III or , M.D.
String nameGap = "[- ]";
String film = " film\\b"; // matches film but not films uses boundary marker
String name = initialOrNamePart+nameGap+"(?:"+initialOrNamePart+nameGap+")?"+namePart+optionalSuffix;
String multiNameConnector = "(?:, | - | and | ?/ ?)";
String responsibility1;
responsibility1= responsibility.replaceAll(aOrAn+"("+name+")"+film, "a film by $1");
responsibility1= responsibility1.replaceAll(aOrAn+"("+name+")"+multiNameConnector+"("+name+")"+film, "a film by $1, $2");
responsibility1= responsibility1.replaceAll("("+name+")'s"+film, "a film by $1");
// responsibility1= responsibility.replaceAll("(^|[^a-z])a(n)?[ ]+(([A-Z]([.]|(?:[-a-zA-Z']|\\p{M})*(?:[a-z]|\\p{M})) ([A-Z]([.]|(?:[-a-zA-Z']|\\p{M})*(?:[a-z]|\\p{M}))[- ]?)?[A-Z](?:[-a-zA-Z']|\\p{M})*(?:[a-z]|\\p{M})(, (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.]))?) film([^a-z]|$)", "$1a film by $3$10");
// responsibility1 = responsibility1.replaceAll("(^|[^a-z])a(n)?[ ]+((([A-Z]([.]|[-a-zA-Z']*[a-z])) (([A-Z]([.]|[-a-zA-Z']*[a-z])[- ]?)?[A-Z][-a-zA-Z']*[a-z](, (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.]))?))(, | - | and | ?/ ?))+(([A-Z]([.]|[-a-z]*[a-z])) ([A-Z]([.]|[-a-z]*[a-z])[- ]?)?[A-Z][-a-z]*[a-z](, (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.]))?) film([^a-z]|$)", "$1a film by $4, $13$20");
if (!responsibility1.equals(responsibility))
{
responsibility = responsibility1;
}
return(responsibility);
}
private static void addCleanedName(Set<String> result, Set<String> squeezedresult, String subpart, boolean reverseName, boolean greedy)
{
subpart = nameClean(subpart, greedy);
if (subpart == null) return;
if (reverseName)
subpart = subpart.replaceFirst("([^ ]+)[ ]+(.*)", "$2 $1");
String squeezedpart = subpart.replaceAll(" ", "");
if (!squeezedresult.contains(squeezedpart))
{
squeezedresult.add(squeezedpart);
result.add(subpart);
}
}
private static String nameClean(String subpart, boolean greedy)
{
if (subpart.matches(".*[(].*[)]"));
subpart = subpart.replaceAll("(.*)[(].*[)]", "$1");
if (subpart.matches("[^()]*[)]"));
subpart = subpart.replaceAll("([^()]*)[)]", "$1");
if (subpart.matches("\".*") || subpart.matches(".*\""))
subpart = subpart.replaceAll("\"?(.*?)\"?", "$1");
if (subpart.matches(".* for .*"))
subpart = subpart.replaceAll("(.*) for .*", "$1");
if (subpart.matches(".*, Inc[.]"))
return(null);
if (subpart.matches(".*( of | a | in ).*"))
return(null);
if (subpart.matches(".*'s"))
subpart = subpart.replaceFirst("'s$", "");
if (subpart.matches(".*? [a-z][a-z ]*"))
subpart = subpart.replaceAll("(.*?) [a-z][a-z ]*", "$1");
if (subpart.matches("[a-z]*"))
return(null);
if (subpart.contains("didrector"))
subpart = subpart.replaceAll("didrector", "");
if (subpart.matches(".*[ .]+$"))
subpart = subpart.replaceAll("(.*?)[ .][ .]+$", "$1");
if (!greedy && subpart.matches(".*[0-9]+.*"))
return(null);
if (!greedy && subpart.matches("[Tt]he( .*|$)"))
return(null);
if (!greedy && subpart.matches("[A-Z][a-z]*"))
return(null);
if (!greedy && subpart.matches("^[\"]?[a-z].*"))
return(null);
if (subpart.matches("[Tt]he( .*|$)"))
subpart = subpart.replaceFirst("[Tt]he[ ]?", "");
if (subpart.contains("Group")) return(null);
if (subpart.contains("Studio[s]?")) return(null);
if (subpart.contains("Entertainment")) return(null);
if (subpart.contains("Department")) return(null);
if (subpart.contains("National")) return(null);
if (subpart.contains("Museum")) return(null);
if (subpart.contains("Films")) return(null);
if (subpart.contains("TV")) return(null);
if (subpart.contains("Response")) return(null);
if (subpart.contains("Cities")) return(null);
if (subpart.contains("High")) return(null);
if (subpart.endsWith("Productions")) return(null);
if (subpart.startsWith("Written")) return(null);
if (subpart.startsWith("Writer")) return(null);
if (subpart.equalsIgnoreCase("Various")) return(null);
if (subpart.equalsIgnoreCase("Editor")) return(null);
if (subpart.equalsIgnoreCase("Executive")) return(null);
if (subpart.equalsIgnoreCase("Story")) return(null);
if (subpart.startsWith("English")) return(null);
if (subpart.startsWith("Company")) return(null);
if (subpart.matches(".*[Pp]roducer.*")) return(null);
if (subpart.matches("[Dd]irector[s]?")) return(null);
if (subpart.equalsIgnoreCase("Screenplay")) return(null);
if (subpart.contains(":")|| subpart.replaceAll("[^ ]", "").length() > 5)
return(null);
subpart = DataUtil.cleanData(subpart);
if (subpart.length() == 0) return(null);
return(subpart);
}
/**
* Attempt to heuristically determine the original release of a video item based on 500a, subfield
*
* @param record the record being processed
* @return String a String representing the original of the video (or an empty set, if none can be found)
*/
public String getOriginalReleaseDate(Record record)
{
if (releaseDatePattern == null)
{
releaseDatePattern = Pattern.compile(".*?([Rr]eleased|[Rr]elease [Oo]f|[Vv]ideorecording|[Vv]ideocassette|[Ii]ssued|[Rr]ecorded|[Bb]roadcast|[Ff]ilmed|[Ee]dited|[Pp]roduced|[Mm]ade|[Dd]elivered).*?[^0-9]([0-9][0-9][0-9][0-9])([^0-9].*)?$");
}
if (isVideo)
{
String date008 = indexer.getFirstFieldVal(record, null, "008[11-14]");
Set<String> notesFields = indexer.getFieldList(record, "500a");
String date500 = null;
for (String note : notesFields)
{
Matcher match = releaseDatePattern.matcher(note);
if (match.matches())
{
date500 = match.group(2);
break;
}
}
String datePub = indexer.getPublicationDate(record);
boolean validDatePub = false;
int iPub = 0;
if (datePub != null && datePub.matches("[1-2][0189][0-9][0-9]"))
{
validDatePub = true;
iPub = Integer.parseInt(datePub);
}
String dateReturn = null;
if (date008 != null && date500 != null)
{
boolean m008 = date008.matches("[1-2][0189][0-9][0-9]");
boolean m500 = date500.matches("[1-2][0189][0-9][0-9]");
if (m008 && m500)
{
int i008 = Integer.parseInt(date008);
int i500 = Integer.parseInt(date500);
if (i008 <= i500)
dateReturn = date008;
else if (i008 > i500)
dateReturn = date500;
}
else if (m008)
{
dateReturn = date008;
}
else if (m500)
{
dateReturn = date500;
}
}
else if (date008 != null && date008.matches("[1-2][0189][0-9][0-9]"))
{
dateReturn = date008;
}
else if (date500 != null && date500.matches("[1-2][0189][0-9][0-9]"))
{
dateReturn = date500;
}
if (dateReturn != null)
{
int iReturn = Integer.parseInt(dateReturn);
if (validDatePub && iPub < iReturn)
{
dateReturn = datePub;
}
}
else if (validDatePub)
{
dateReturn = datePub;
}
return(dateReturn);
}
return(null);
}
/**
* Attempt to heuristically determine the "genre" of a video item based on 650 and 655 fields
*
* @param record the record being processed
* @return String a String representing the original of the video (or an empty set, if none can be found)
*/
public static Pattern genreActionAdventure = Pattern.compile("(^|[^a-z])(action|adventure|espionage|martial arts|samurai|spies|spy thriller|bond, james)([^a-z]|$)");
public static Pattern genreAnimation = Pattern.compile("(^|[^a-z])(animated|animation)([^a-z]|$)");
public static Pattern genreBiography = Pattern.compile("(^|[^a-z])(biograph(ical|y))([^a-z]|$)");
public static Pattern genreChildren = Pattern.compile("(^|[^a-z])((children's) (stories|films|literature|poetry|songs|television programs))|(television programs|video recordings|dance) for children([^a-z]|$)");
public static Pattern genreComedy = Pattern.compile("(^|[^a-z])(comed(y|ies)|humor)([^a-z]|$)");
public static Pattern genreCrimeMystery = Pattern.compile("(^|[^a-z])(assassins|cop|crime|criminal[s]?|detective[s]?|fugitives|gangster[s]?|investigation|kidnapping|legal|murder(ers)?|mystery|police|prison[s]?|robbery|suspense|swindlers|thrillers|thieves)([^a-z]|$)");
public static Pattern genreDocumentary = Pattern.compile("(^|[^a-z])(documentary|newsreels)([^a-z]|$)");
public static Pattern genreDrama = Pattern.compile("(^|[^a-z])(drama|melodrama[s]?)([^a-z]|$)");
public static Pattern genreExperimental = Pattern.compile("(^|[^a-z])(experimental|performance art|video art)([^a-z]|$)");
public static Pattern genreFilmNoir = Pattern.compile("(^|[^a-z])(noir)([^a-z]|$)");
public static Pattern genreHistorical = Pattern.compile("(^|[^a-z])(apartheid|civil rights|historical|history|holocaust)([^a-z]|$)");
public static Pattern genreHorror = Pattern.compile("(^|[^a-z])(ghost[s]?|horror|monster[s]?|supernatural|vampire[s]?|zombie[s]?)([^a-z]|$)");
public static Pattern genreMusical = Pattern.compile("(^|[^a-z])(blues|concert[s]?|jazz|music|musical|musicals|operas|rock)([^a-z]|$)");
public static Pattern genreRomance = Pattern.compile("(^|[^a-z])(love|roman(tic|ce))([^a-z]|$)");
public static Pattern genreSciFiFantasy = Pattern.compile("(^|[^a-z])(alien|fantasy|interplanetary|planets|science fiction|time travel)([^a-z]|$)");
public static Pattern genreTelevision = Pattern.compile("(^|[^a-z])(television|tv)([^a-z]|$)");
public static Pattern genreWar = Pattern.compile("(^|[^a-z])(war|warfare|bomb|pearl harbor)([^a-z]|$)");
public static Pattern genreWestern = Pattern.compile("(^|[^a-z])(western)([^a-z]|$)");
public static Object[][] genreMap ={{genreActionAdventure, "Action/Adventure"}, {genreAnimation, "Animation"}, {genreBiography, "Biography"},
{genreChildren, "Children/Family"}, {genreComedy , "Comedy"}, {genreCrimeMystery, "Crime/Mystery"},
{genreDocumentary, "Documentary"}, {genreDrama, "Drama" }, { genreExperimental, "Experimental" },
{genreFilmNoir, "Film Noir"}, { genreHistorical, "Historical" }, {genreHorror, "Horror"},
{genreMusical, "Music/Musical"}, {genreRomance, "Romance"}, {genreSciFiFantasy, "SciFi/Fantasy"},
{genreTelevision, "Television"}, {genreWar, "War"}, {genreWestern, "Western" }};
public Set<String> getVideoGenre(Record record)
{
Set<String> result = new LinkedHashSet<String>();
if (isVideo)
{
Set<String> subjectFields = indexer.getFieldList(record, "650a:655a");
for (String subject : subjectFields)
{
subject = subject.toLowerCase();
for (Object[] mapEntry : genreMap)
{
Pattern patternToMatch = (Pattern) mapEntry[0];
String valueToAssign = (String) mapEntry[1];
if (patternToMatch.matcher(subject).find())
{
result.add(valueToAssign);
}
}
}
}
return(result);
}
}