package org.solrmarc.mixin;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.marc4j.marc.ControlField;
import org.marc4j.marc.DataField;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.marc.VariableField;
import org.solrmarc.index.SolrIndexerMixin;
import org.solrmarc.index.indexer.IndexerSpecException;
import org.solrmarc.tools.DataUtil;
public class DirectorMixin extends SolrIndexerMixin
{
// Pattern releaseDatePattern = null;
// boolean isVideo = false;
// SolrIndexerMixin formatmixin = null;
// public void perRecordInit(Record record)
// {
// ControlField cf008 = (ControlField)record.getVariableField("008");
// isVideo = (cf008 != null && cf008.getData().charAt(33) == 'v') && record.getLeader().getTypeOfRecord() == 'g';
//// Set<String> formats = indexer.getFormat(record);
//// isVideo = (Utils.setItemContains(formats, "Video"));
// }
private static final String[] FIELDS_ARRAY = {"500", "505"};
/**
 * Attempts to heuristically determine the director(s) of a video item.
 * <p>
 * A record is treated as a video when its leader type-of-record is 'g' and either position 33 of the 008
 * field is 'v' or a 007 field starting with 'v' is present. For such records the sources are consulted in
 * this order:
 * <ol>
 * <li>the 245$c statement of responsibility and every 508$a credits note (greedy matching);</li>
 * <li>only if nothing was found so far: 500/505 $a and $t notes that mention directing or "a film by",
 *     then 245$b and 245$h when they mention directing (strict matching; the 245$b / 245$h cases are
 *     additionally reported through {@code addError});</li>
 * <li>every 700 field whose $4 is "drt" or whose $c / $e looks like a director role; the $a name is
 *     cleaned and flipped from "Last, First" to "First Last".</li>
 * </ol>
 *
 * @param record the record being processed
 * @return Set<String> a set of Strings representing the name(s) of the director(s) of the video (or an empty set, if none can be found)
 */
public Set<String> getVideoDirector(Record record)
{
    ControlField cf008 = (ControlField)record.getVariableField("008");
    List<VariableField> cf007 = record.find("007","^v");
    boolean hasVideoMarker = (cf008 != null && cf008.getData().length() > 33 && cf008.getData().charAt(33) == 'v') || cf007.size() > 0;
    boolean isVideo = hasVideoMarker && record.getLeader().getTypeOfRecord() == 'g';
    Set<String> result = new LinkedHashSet<String>();
    if (!isVideo)
    {
        // Not a video recording: no director heuristics apply.
        return(result);
    }
    DataField f245 = ((DataField)record.getVariableField("245"));

    // Primary sources: statement of responsibility and credits notes.
    String responsibility = (f245 != null) ? f245.getSubfieldsAsString("c") : null;
    if (responsibility != null)
    {
        result.addAll(getVideoDirectorsFromTextField(responsibility, true));
    }
    for (VariableField df : record.getVariableFields("508"))
    {
        for (Subfield credit : ((DataField)df).getSubfields('a'))
        {
            result.addAll(getVideoDirectorsFromTextField(credit.getData(), true));
        }
    }

    // Fallback sources, consulted only when the primary ones yielded nothing; matched strictly (greedy == false).
    if (result.size() == 0)
    {
        for (VariableField df : record.getVariableFields(FIELDS_ARRAY) )
        {
            for (Subfield noteField : ((DataField)df).getSubfields("at"))
            {
                final String note = noteField.getData();
                if (note.contains("direct") || note.contains("Direct") || note.contains("A film by") || note.contains("a film by"))
                {
                    result.addAll(getVideoDirectorsFromTextField(note, false));
                }
            }
        }
        String subtitle = f245 != null ? f245.getSubfieldsAsString("b") : null;
        if (subtitle != null && (subtitle.contains("direct") || subtitle.contains("Direct")))
        {
            addError(new IndexerSpecException("Director information erroneously included in the 245b subtitle field"));
            result.addAll(getVideoDirectorsFromTextField(subtitle, false));
        }
        String medium = f245 != null ? f245.getSubfieldsAsString("h") : null;
        if (medium != null && (medium.contains("direct") || medium.contains("Direct")))
        {
            addError(new IndexerSpecException("Director information erroneously included in the 245h medium field"));
            result.addAll(getVideoDirectorsFromTextField(medium, false));
        }
    }

    // Added entries for personal names that carry a director role.
    List<VariableField> personalNames = record.getVariableFields("700");
    for (VariableField vf : personalNames)
    {
        DataField df = (DataField)vf;
        // this could be overly broad, and could also grab music directors or other roles containing the word "director"
        if (ChkSubfield(df, '4', "drt") || ChkSubfield(df, 'c', "[(]?([Ff]ilm )?[Dd]irector[)]?[^a-z]*") ||
                (ChkSubfield(df, 'e', "(co-|film )?[Dd]irect(or|er|ion|eur|ed by)[^a-z]*") ))
        {
            Subfield nameSubfield = df.getSubfield('a');
            // A 700 field may carry a role but no $a; skip it instead of failing the whole record.
            if (nameSubfield == null || nameSubfield.getData() == null)
            {
                continue;
            }
            String name = nameSubfield.getData();
            name = DataUtil.cleanData(name);
            // Flip "Last, First" into "First Last".
            name = name.replaceAll("([A-Z][^,]*),[ ]?(.*)", "$2 $1");
            result.add(name);
        }
    }
    return(result);
}
/**
 * Tells whether any occurrence of the given subfield in a data field fully matches a regular expression.
 *
 * @param df      the data field whose subfields are inspected
 * @param c       the subfield code to look at
 * @param pattern a regular expression that must match the entire subfield value
 * @return true as soon as one subfield value matches, false when none does
 */
private boolean ChkSubfield(DataField df, char c, String pattern)
{
    java.util.Iterator<Subfield> candidates = df.getSubfields(c).iterator();
    while (candidates.hasNext())
    {
        String value = candidates.next().getData();
        if (value.matches(pattern))
        {
            return true;
        }
    }
    return false;
}
/**
 * Routine that actually does the work to heuristically determine the Director of a video based on the text of the string passed in.
 * <p>
 * The text is first normalized: foreign-language and misspelled "director" terms are rewritten to the
 * deliberately misspelled placeholders "didrector" / "didrection" / "didrected", and foreign "a film by"
 * phrasings to "a flim by", so that they do not match the English patterns at first. The text is then split
 * on ";", "--" and " : " and scanned in up to five passes: passes 0 and 1 look for English "direct..." and
 * "a film by" wording; before pass 3 the placeholders are turned back into real words so that passes 3 and 4
 * also see the translated terms (pass 2 matches nothing). Scanning stops after the first pass that produced
 * at least one name.
 *
 * @param responsibility the free text to analyse, e.g. the value of a 245c or 508a subfield; null yields an empty set
 * @param greedy true to accept names liberally; false to apply stricter filtering (for example parts
 *               mentioning "director's" are skipped and quoted titles containing "directed by" are masked)
 * @return Set<String> a set of Strings representing the name(s) of the director(s) of the video (or an empty set, if none can be found)
 */
public static Set<String> getVideoDirectorsFromTextField(String responsibility, boolean greedy)
{
    // First do some initial processing on the passed in string
    boolean reverseName = false;
    Set<String> result = new LinkedHashSet<String>();
    Set<String> squeezedresult = new LinkedHashSet<String>();
    if (responsibility == null)
    {
        return(result);
    }
    responsibility = responsibility.replaceAll("\\[sic[.]?[]]", "");
    // Turn sentence-ending periods into ';' so they act as part separators below
    responsibility = responsibility.replaceAll("([a-z][a-z][a-z])[.]", "$1;");
    responsibility = responsibility.replaceAll("([a-z][a-z])[.] ", "$1; ");
    // Neutralize "director of production" / "art direction" phrasings before the director terms are mapped
    responsibility = responsibility.replaceAll("direc\u0327a\u0303o de produc\u0327a\u0303o", "producer");//portuguese
    responsibility = responsibility.replaceAll("direct(or|ion|eur) de (la )?[ ]?produc[ct]i(o\u0301|o)n", "producer");//french/spanish
    responsibility = responsibility.replaceAll("direcci(o\u0301|o)n art\\B*", "artguy");//spanish
    responsibility = responsibility.replaceAll("produit par\\b", "produced by");
    // Map foreign-language director terms onto misspelled placeholders (restored in pass 3 of the loop below)
    String responsibility1 = responsibility.replaceAll("[Rr]eg(i|i\u0301)[ea]?\\b", "didrector");//german/italian/swedish
    responsibility1 = responsibility1.replaceAll("[Rr]eggia\\b", "didrector");//spanish
    responsibility1 = responsibility1.replaceAll("[Rr]e(z\u030c|z)ie", "didrector");//czech
    responsibility1 = responsibility1.replaceAll("[Rr]e(z\u0307|z|a)yseria", "didrector");//polish
    responsibility1 = responsibility1.replaceAll("[Dd]irecci(o\u0301|o)n", "didrector");//spanish
    responsibility1 = responsibility1.replaceAll("[Rr](e\u0301|e)alisation", "didrection");//french
    responsibility1 = responsibility1.replaceAll("[Rr]ealizaci(o\u0301|o)n", "didrection");//spanish
    responsibility1 = responsibility1.replaceAll("[Rr]e\u0301alise\u0301( et [a-z]*)? (par|by)", "didrected$1 by");//french
    responsibility1 = responsibility1.replaceAll("[Dd]irected por", "didrected by");//spanish
    responsibility1 = responsibility1.replaceAll("[Dd]irige\u0301 par", "didrected by");//french
    responsibility1 = responsibility1.replaceAll("[Dd]irrecio\u0301n", "didrection");//spanish
    responsibility1 = responsibility1.replaceAll("[Dd]irecci\u0301on", "didrection");//spanish
    responsibility1 = responsibility1.replaceAll("[Dd]irector(a|es)", "didrector");//spanish
    responsibility1 = responsibility1.replaceAll("[Dd]ire(c\u0327|c)a\u0303o", "didrector");//portuguese
    responsibility1 = responsibility1.replaceAll("[Dd]iretto da", "didrector");//italian
    responsibility1 = responsibility1.replaceAll("[Dd]irecteur", "didrector");//french
    responsibility1 = responsibility1.replaceAll("[Dd]irect(e|io)r", "didrector");//typo
    responsibility1 = responsibility1.replaceAll("[Dd]irigid[oa]", "didrector");//portuguese
    responsibility1 = responsibility1.replaceAll("[Tt]asrit\u0323 u-vimui", "didrector");//hebrew
    responsibility1 = responsibility1.replaceAll("[Ii]khr(a\u0304|a)j", "didrector");//arabic
    responsibility1 = responsibility1.replaceAll("[Rr]ezhisser[a]?", "didrector");//russian
    responsibility1 = responsibility1.replaceAll("[Yy]o\u0308neten", "didrector");//turkish
    responsibility1 = responsibility1.replaceAll("[Nn]irdes\u0301aka", "didrector");//hindi
    responsibility1 = responsibility1.replaceAll("[Pp]ostanovka", "didrector");//russian
    responsibility1 = responsibility1.replaceAll("(un )?film[e]? d[ei]\\b", "a flim by");//french
    responsibility1 = responsibility1.replaceAll("un film d'", "a flim by ");//french
    responsibility1 = responsibility1.replaceAll("(an|en) film av", "a flim by"); //swedish
    responsibility1 = responsibility1.replaceAll("[Ee]in [Ff]ilm von", "a flim by"); //german
    responsibility1 = responsibility1.replaceAll("un(e|a) pel(i\u0301|i)cula de", "a flim by");//spanish
    responsibility1 = responsibility1.replaceAll("[Mm]is[e]? en sc(e|e\u0300)ne( de)?", "a flim by");//french
    responsibility1 = responsibility1.replaceAll("Film by", "a flim by");
    // "a Jane Doe film" / "Jane Doe's film" become "a film by Jane Doe" (real spelling, so pass 1 sees them)
    responsibility1 = handleASoAndSoFilm(responsibility1);
    responsibility1 = responsibility1.replaceAll("[Dd]ao yan", "didrector");//chinese
    responsibility1 = responsibility1.replaceAll("[Kk]antoku", "didrector");//japanese
    responsibility1 = responsibility1.replaceAll("[Kk]amdok", "didrector");//korean
    responsibility1 = responsibility1.replaceAll("[Yy]o\u0306nch\u02bbul", "didrector");//korean
    responsibility = responsibility1;
    // Now split the string into subparts separated by ; (or -- or : )
    for (int loop = 0; loop < 5; loop++)
    {
        if (loop == 3)
        {
            // English wording produced nothing: restore the placeholders so translated terms are considered too
            responsibility = responsibility.replaceAll("didrect", "direct");
            responsibility = responsibility.replaceAll("a flim by", "a film by");
            if (responsibility.matches(".*filmed[,a-z ]*by\\b.*"))
            {
                responsibility = responsibility.replaceAll("filmed", "directed");
            }
        }
        String semiparts[] = responsibility.split(";|--| : ");
        for (String part0 : semiparts)
        {
            String part = part0;
            part = part.trim();
            if (((loop == 0 || loop == 3) && part.matches(".*[Dd]irect(ed|or[s]?|ion)\\b.*")) ||
                    ((loop == 1 || loop == 4) && part.matches(".*a film by.*")))
            {
                if (!greedy && part.matches(".*[Dd]irector[']?s.*"))
                {
                    continue;
                }
                if (!greedy && part.matches(".*\"[^,\"]*[Dd]irect(ed by|or[s]?|ion)[^,\"]*\".*"))
                {
                    part = part.replaceAll("\"[^\",]*?\"", "XXX");
                }
                // Try to split apart "brothers" ie. the Hughes Brothers becomes Albert Hughes and Allen Hughes
                if (part.matches(".*(the )?[A-Z][^ ]* [Bb]rothers.*"))
                {
                    String name = part.replaceFirst(".*(the )?([A-Z][^ ]*) [Bb]rothers.*", "$2");
                    // The surname comes straight from catalog text, so it must be escaped before being
                    // embedded in a regex (e.g. "Coen)" would not compile) or in a replacement string
                    // (where '$' and '\' are special).
                    String nameRegex = java.util.regex.Pattern.quote(name);
                    String nameReplacement = java.util.regex.Matcher.quoteReplacement(name);
                    if (part.matches(".*(the )?[A-Z][^ ]* [Bb]rothers(, | \\()[A-Z][a-z]* and [A-Z][a-z]* "+nameRegex+".*"))
                    {
                        part = part.replaceFirst("((the )?([A-Z][^ ]*) [Bb]rothers)(, | \\()([A-Z][a-z]*) and ([A-Z][a-z]*) "+nameRegex+"\\)?", "$5 $3 and $6 $3");
                    }
                    else if (part.matches(".*(the )?[A-Z][^ ]* [Bb]rothers(, | \\()[A-Z][a-z]* and [A-Z][a-z]*.*"))
                    {
                        part = part.replaceFirst("((the )?([A-Z][^ ]*) [Bb]rothers)(, | \\()([A-Z][a-z]*) and ([A-Z][a-z]*)\\)?", "$5 $3 and $6 $3");
                    }
                    else
                    {
                        // Look in the other parts of the statement for two people sharing that surname
                        for (String otherPart : semiparts)
                        {
                            if (otherPart.equals(part0)) continue;
                            if (otherPart.matches(".*[A-Z][^ ]* "+nameRegex+".*[A-Z][^ ]* "+nameRegex+".*"))
                            {
                                String names = otherPart.replaceFirst(".*([A-Z]([a-z]*|[.]) )+"+nameRegex+"[, ].*([A-Z]([a-z]*|[.]) )+"+nameRegex+"[^A-Za-z].*",
                                        "$1"+nameReplacement+" and $3"+nameReplacement);
                                part = part.replaceFirst("(.*?)([Tt]he )?([A-Z][^ ]*) [Bb]rothers(.*)", "$1"+java.util.regex.Matcher.quoteReplacement(names)+"$4");
                                break;
                            }
                        }
                    }
                }
                String part1 = part.replaceAll("[ ]?[(][ ]*([a-z]*[/]|assistant )?[Dd]irector([/][a-z]*)?[ ]*[)]", " director"); // change (director) to director
                part1 = part1.replaceAll("[ ]?[(][^)]*[)]", ""); // throw away parenthetical phrases
                part = part1;
                if (part.matches(".*[Dd]irector[s]? (of|de|de la) ([Pp]h|[Ff])otogra(ph|f)(y|(i|i\u0301)[ea]).*"))
                {
                    part = part.replaceFirst("[Dd]irector[s]? (of|de|de la) ([Pp]h|[Ff])otogra(ph|f)(y|(i|i\u0301)[ea])", "cinematographer");
                }
                if (part.matches(".*[Dd]irector[s]? (of|de|de la) ([Aa]nimation).*"))
                {
                    part = part.replaceFirst("[Dd]irector[s]? (of|de|de la) ([Aa]nimation)", "supervising animator");
                }
                else if (part.matches(".*, [Dd]irector[s]? (of|for) .*"))
                {
                    part = part.replaceFirst("[Dd]irector[s]? (of|for)((( ([A-Z][a-z]*|the|of|and|a))+)[,]?)+", "director");
                }
                else if (part.matches(".*[Dd]irector[s]?( and writer)? (of|for) .*"))
                {
                    part = part.replaceFirst("[Dd]irector[s]?( and writer)? (of|for) [A-Za-z' ]*", "director");
                }
                if (part.contains("collaboration") || part.contains("participation"))
                {
                    part = part.replaceAll("(in collaboration with)|(with the collaboration of)|(with the participation of)", "with");
                }
                if (part.matches(".*[Dd]irector .* director.*"))
                {
                    part = part.replaceAll(".*([Dd]irector .*)director.*", "$1");
                }
                // Pattern matching when the subpart is of the form: Some Name Director
                if (part.matches(".*[Dd]irector[^A-Z]*") && !part.matches(".*of the [Dd]irector.*"))
                {
                    // Skip other kinds of director (art, music, casting, photography, ...)
                    if (part.matches(".*([Aa]rt(istic)?|[Mm]usic(al)?|[Ss]tage|[Pp]roduction|[Pp]roject|[Pp]hotography|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Dd]ance|[Ee]diting) [Dd]irector.*" ) ||
                            part.matches(".*[Dd]irector[s]? ((of[ ]?(([Pp]hotography)))|(de (la )?fotograf(i|i\u0301)a)|(de arte)).*"))
                        continue;
                    part = part.replaceAll(" *[\\[]", ", ");
                    part = part.replaceAll("^\"", "");
                    part = part.replaceAll("[\\]]", "");
                    part = part.replaceAll("director.*", "director");
                    part = part.replaceAll(" [-A-Za-z/]*director[-A-Za-z]*", " director");
                    part = part.replaceAll(" [a-z/]+/director", " director");
                    part = part.replaceAll(" co-director", " director");
                    part = part.replaceAll(" [a-z ,]+ director", " director");
                    if (greedy)
                        part = part.replaceAll(".*: (([A-Z][A-Za-z.]* )*[A-Z][A-Za-z.]*)(, |,| )director", "$1, director");
                    else
                        part = part.replaceAll(".*? (([A-Z][A-Za-z.]* )*[A-Z][A-Za-z.]*)(, )?director", "$1, director");
                    part = part.replaceFirst(".* (of|by)", "by");
                    part = part.replaceAll(" (and|und|/) ", " & ");
                    part = part.replaceFirst("by ", "");
                    // Protect ", Jr." style suffixes with '#' so the comma split below keeps them attached to the name
                    part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
                    part = part.replaceFirst("[,]?[ ]?director", "");
                    part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
                    part = part.replaceAll("[#]", ",");
                    String commaparts[] = part.split("[|]+");
                    for (String subpart : commaparts)
                    {
                        addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
                    }
                }
                else if (!greedy && part.matches(".*, [Dd]irector (of|for|on).*"))
                {
                    continue;
                }
                // Pattern matching when the subpart is of the form: Directed by Some Name
                else if (part.matches(".*[Dd]irect(ed|ion).*?by.*")|| part.matches(".*a film by.*") || part.matches(".*maybe by.*"))
                {
                    if (part.matches(".*([Aa]rt(istic)?|[Mm]usic(al)?|[Ss]tage|[Pp]roduction|[Pp]roject|[Pp]hotographic|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Dd]ance|[Ee]diting) [Dd]irection.*?by.*" ) ||
                            part.matches(".*[Dd]irector[s]? ((of[ ]?(([Pp]hotography)))|(de (la )?fotograf(i|i\u0301)a)|(de arte)).*"))
                        continue;
                    // "directified by" is an internal marker for the start of the name list
                    part = part.replaceFirst(".*[Dd]irect(ed|ion).*?by[]:,)]?[ ]?", "directified by ");
                    part = part.replaceFirst(".*a film by", "directified by ");
                    part = part.replaceFirst(".*maybe by", "directified by ");
                    part = part.replaceAll("/", " & ");
                    part = part.replaceAll("\\[|\\]", "");
                    part = part.replaceFirst("et al", "");
                    part = part.replaceAll(" (and|with|et|und) ", " & ");
                    part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
                    part = part.replaceAll("[.][.][.]", "");
                    part = part.replaceAll("([A-Z][^ .][^ .][^ .]+)[.].*", "$1");
                    part = part.replaceAll("brothers", "Brothers");
                    part = part.replaceAll("directified by[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*\u02bb]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|zur |von |van |de[rl]?[ ]?|the |d[']|al-|da-)+).*", "$1");
                    // "Joel & Ethan Coen" becomes "Joel Coen & Ethan Coen"
                    part = part.replaceAll("^([A-Z][^ .]+) & ([A-Z][^ .,]+) ([A-Z][^ .,]+)", "$1 $3 & $2 $3");
                    part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
                    part = part.replaceAll("[#]", ",");
                    part = part.replaceAll("[ ][ ]+", " ");
                    part = part.replaceFirst("directified by", "");
                    String commaparts[] = part.split("[|]+");
                    for (String subpart : commaparts)
                    {
                        addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
                    }
                }
                // Pattern matching when the subpart is of the form: Directed Some Name (no "by")
                else if (part.matches(".*[Dd]irected.*") || part.matches(".*[Dd]irected( (and|&) [-a-z]*)?,.*"))
                {
                    part = part.replaceFirst(".*[Dd]irected( (and|&) [-a-z]*),","directified by");
                    part = part.replaceFirst(".*[Dd]irected[]:,)]?[ ]?", "directified by ");
                    part = part.replaceAll("/", " & ");
                    part = part.replaceAll("\\[|\\]", "");
                    part = part.replaceFirst("et al", "");
                    part = part.replaceAll(" (and|with|et|und) ", " & ");
                    part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
                    part = part.replaceAll("[.][.][.]", "");
                    part = part.replaceAll("([A-Z][^ .][^ .][^ .]+)[.].*", "$1");
                    part = part.replaceAll("brothers", "Brothers");
                    part = part.replaceAll("directified by[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*\u02bb]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|zur |von |van |de[rl]?[ ]?|the |d[']|al-|da-)+).*", "$1");
                    part = part.replaceAll("^([A-Z][^ .]+) & ([A-Z][^ .,]+) ([A-Z][^ .,]+)", "$1 $3 & $2 $3");
                    part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
                    part = part.replaceAll("[#]", ",");
                    part = part.replaceAll("[ ][ ]+", " ");
                    part = part.replaceFirst("directified by", "");
                    String commaparts[] = part.split("[|]+");
                    for (String subpart : commaparts)
                    {
                        addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
                    }
                }
                // Pattern matching when the subpart is of the form: Director Some Name
                else if (part.matches(".*[Dd]irector[^a-rt-z\'].*[A-Z].*"))
                {
                    if (part.matches(".*([Aa]rt(istic)?|[Mm]usic(al)?|[Ss]tage|[Pp]roduction|[Pp]roject|[Pp]hotography|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Dd]ance|[Ee]diting) [Dd]irector.*" ) ||
                            part.matches(".*[Dd]irector[s]? ((of[ ]?(([Pp]hotography)))|(de (la )?fotograf(i|i\u0301)a)|(de arte)).*"))
                        continue;
                    // Lower-case common role words so they are not mistaken for capitalized names
                    part = part.replaceFirst("Executive", "executive");
                    part = part.replaceFirst("Writer", "writer");
                    part = part.replaceFirst("Story", "story");
                    part = part.replaceFirst("Producer", "producer");
                    part = part.replaceFirst("Produced", "produced");
                    part = part.replaceFirst(", English", ", english");
                    part = part.replaceFirst("Researcher", "researcher");
                    part = part.replaceFirst(".*?[Dd]irector", "director");
                    part = part.replaceAll("[ ]?([.][.][.])?[ ]?[\\[][^\\]]*[\\]]", "");
                    part = part.replaceAll("[]]", "");
                    part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
                    part = part.replaceAll("director[-A-Za-z/]*", "director");
                    part = part.replaceAll("director (for|and|of)( [A-Z][A-Za-z]*)+", "director");
                    if (!greedy)
                        part = part.replaceFirst("director( and [a-z]*)?, [a-z].*", "");
                    // "director=" is an internal marker for the start of the name list
                    part = part.replaceFirst("director[^A-Z]* ([\u02bb*]?[A-Z])", "director= $1");
                    part = part.replaceFirst("with the [a-z][A-Za-z ]*", "");
                    part = part.replaceFirst("et al", "");
                    if (!greedy)
                        part = part.replaceAll("[,]? and [a-z].*", "");
                    part = part.replaceAll(" (and|with|et) ", " & ");
                    part = part.replaceAll(",[ ]?[a-z].*", "");
                    part = part.replaceAll("= [^(]*[)], ", ": ");
                    if (greedy)
                        part = part.replaceAll("director=[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*\u02bb]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|von |zur |van |de[rl]?[ ]?|the |in |d[']|al-|da-)+)[^|&]*", "$1");
                    else // strict
                        part = part.replaceAll("director=[ a-z,]*(([\"]?([A-Z]|\\p{Lu}|[*\u02bb]|\\p{M})[^ ]*[\"]?[,]?[ ]*|[ ]?&[ ]|von |zur |van |de[rl]?[ ]?|the |in |d[']|al-|da-)+)( [a-z].*|$)", "$1");
                    part = part.replaceAll("^([A-Z][^ .]+) & ([A-Z][^ .]+) ([A-Z][^ .]+)", "$1 $3 & $2 $3");
                    part = part.replaceAll("([,][ ]?|[ ]?[|&][ ]?)", "|");
                    part = part.replaceAll("[#]", ",");
                    part = part.replaceAll("[ ][ ]+", " ");
                    String commaparts[] = part.split("[|]+");
                    for (String subpart : commaparts)
                    {
                        addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
                    }
                }
                // Pattern matching when the subpart is of the form: Direction Some Name
                else if (part.matches(".*[Dd]irection.*"))
                {
                    if (part.matches(".*([Aa]rt|[Mm]usic(al)?|[Ss]tage|[Pp]roject|[Aa]nimation|[Mm]edical|[Cc]asting|[Tt]echnical|[Oo]rchestra|[Ee]diting) [Dd]irection.*" )||
                            part.matches(".*[Dd]irection (of )?(de )?(la )?((f|ph)otogra(f|ph)ie|production|[Cc]in(e\u0301|e)matographie|artistique|art[e]?|musicale).*"))
                        continue;
                    part = part.replaceFirst(".*[Dd]irection[^A-Z]*", "direction: ");
                    part = part.replaceAll(", (Jr[.]?|Sr[.]?|Inc[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.])", "# $1");
                    part = part.replaceAll(" (and|with|et) ", " & ");
                    part = part.replaceAll("[\\]]", "");
                    part = part.replaceAll("([,][ ]?|[ ]?&[ ]?)", "|");
                    part = part.replaceAll("[#]", ",");
                    part = part.replaceAll("[ ][ ]+", " ");
                    part = part.replaceFirst("direction: ", "");
                    String commaparts[] = part.split("[|]+");
                    for (String subpart : commaparts)
                    {
                        addCleanedName(result, squeezedresult, subpart, reverseName, greedy);
                    }
                }
            }
        }
        // Stop at the first pass that found anything; later passes are progressively looser
        if (result.size() > 0) break;
    }
    return(result);
}
/**
 * Rewrites possessive film credits into the canonical "a film by NAME" wording.
 * <p>
 * Three phrasings are recognized, in this order: "a/an NAME film", "a/an NAME1 [, | - | and | /] NAME2 film"
 * and "NAME's film". A NAME is two or three capitalized words or initials (joined by a space or hyphen),
 * optionally followed by a suffix such as ", Jr." or " III". The plural "films" is never matched.
 *
 * @param responsibility the text to rewrite
 * @return the rewritten text, or text equal to the input when no phrasing matched
 */
private static String handleASoAndSoFilm(String responsibility)
{
    // "a " or "an "; the word boundary keeps it from matching inside words such as "pa" or "man"
    final String article = "\\ba[n]?[ ]+";
    // a capitalized word that may contain letters, marks, '-' or '\'' and ends in a lower-case letter or mark
    final String word = "(?:\\p{Lu}(?:\\p{L}|\\p{M}|[-'])*(?:\\p{Ll}|\\p{M}))";
    // either a single initial such as "B." or a capitalized word
    final String initialOrWord = "(?:\\p{Lu}[.]|"+word+")";
    // optional generational / professional suffix, e.g. ", Jr." or ", M.D."
    final String suffix = "(?:[,]? (?:Jr[.]?|Sr[.]?|II|III|IV|M[.]D[.]|B[.]S[.]N[.]))?";
    final String gap = "[- ]";
    // " film" as a whole word, so "films" does not qualify
    final String filmWord = " film\\b";
    final String person = initialOrWord+gap+"(?:"+initialOrWord+gap+")?"+word+suffix;
    final String connector = "(?:, | - | and | ?/ ?)";
    final String capturedPerson = "("+person+")";

    return responsibility
            .replaceAll(article+capturedPerson+filmWord, "a film by $1")
            .replaceAll(article+capturedPerson+connector+capturedPerson+filmWord, "a film by $1, $2")
            .replaceAll(capturedPerson+"'s"+filmWord, "a film by $1");
}
/**
 * Cleans one candidate name and appends it to the result unless an equivalent name is already there.
 * <p>
 * Two names count as equivalent when they are equal after all spaces are removed; the space-free forms
 * are tracked in {@code squeezedresult}.
 *
 * @param result         ordered set receiving accepted names
 * @param squeezedresult companion set of space-free forms used for duplicate detection
 * @param subpart        the raw candidate text
 * @param reverseName    when true, the first word is moved to the end of the name before it is stored
 * @param greedy         passed through to {@code nameClean}
 */
private static void addCleanedName(Set<String> result, Set<String> squeezedresult, String subpart, boolean reverseName, boolean greedy)
{
    String cleaned = nameClean(subpart, greedy);
    if (cleaned != null)
    {
        String candidate = reverseName ? cleaned.replaceFirst("([^ ]+)[ ]+(.*)", "$2 $1") : cleaned;
        String key = candidate.replaceAll(" ", "");
        // Set.add reports whether the key was new, which doubles as the duplicate check
        if (squeezedresult.add(key))
        {
            result.add(candidate);
        }
    }
}
/**
 * Normalizes one candidate name fragment and rejects fragments that do not look like a personal name.
 * <p>
 * Parenthetical remarks, stray closing parentheses, double quotes, trailing " for ..." clauses, a trailing
 * possessive "'s", trailing lower-case words and a leading "The" are stripped. The fragment is rejected
 * (null is returned) when it looks like a corporate body or a role word rather than a person (for example
 * it contains "Studio", "Films", "Productions", "producer"), when it is entirely lower-case, contains a
 * colon, or contains more than five spaces. In strict mode it is also rejected when it contains a digit,
 * starts with "The", is a single capitalized word, or starts with a lower-case letter.
 *
 * @param subpart the raw candidate text; null yields null
 * @param greedy  true for liberal acceptance, false for the additional strict-mode rejections
 * @return the cleaned name, or null when the fragment should be discarded
 */
private static String nameClean(String subpart, boolean greedy)
{
    if (subpart == null)
        return(null);
    // Remove a parenthetical remark and any unmatched ')' wherever they occur. These replacements are
    // deliberately unconditional: they are no-ops when nothing matches.
    subpart = subpart.replaceAll("(.*)[(].*[)]", "$1");
    subpart = subpart.replaceAll("([^()]*)[)]", "$1");
    // Remove double-quote characters when the fragment starts or ends with one
    if (subpart.matches("\".*") || subpart.matches(".*\""))
        subpart = subpart.replaceAll("\"?(.*?)\"?", "$1");
    if (subpart.matches(".* for .*"))
        subpart = subpart.replaceAll("(.*) for .*", "$1");
    if (subpart.matches(".*, Inc[.]"))
        return(null);
    if (subpart.matches(".*( of | a | in ).*"))
        return(null);
    if (subpart.matches(".*'s"))
        subpart = subpart.replaceFirst("'s$", "");
    // Drop trailing lower-case words (e.g. a role that followed the name)
    if (subpart.matches(".*? [a-z][a-z ]*"))
        subpart = subpart.replaceAll("(.*?) [a-z][a-z ]*", "$1");
    if (subpart.matches("[a-z]*"))
        return(null);
    if (subpart.contains("didrector"))
        subpart = subpart.replaceAll("didrector", "");
    if (subpart.matches(".*[ .]+$"))
        subpart = subpart.replaceAll("(.*?)[ .][ .]+$", "$1");
    // Strict-mode rejections
    if (!greedy && subpart.matches(".*[0-9]+.*"))
        return(null);
    if (!greedy && subpart.matches("[Tt]he( .*|$)"))
        return(null);
    if (!greedy && subpart.matches("[A-Z][a-z]*"))
        return(null);
    if (!greedy && subpart.matches("^[\"]?[a-z].*"))
        return(null);
    if (subpart.matches("[Tt]he( .*|$)"))
        subpart = subpart.replaceFirst("[Tt]he[ ]?", "");
    // Corporate bodies and role words are not director names
    if (subpart.contains("Group")) return(null);
    // String.contains takes a literal, not a regex; "Studio" also covers "Studios"
    if (subpart.contains("Studio")) return(null);
    if (subpart.contains("Entertainment")) return(null);
    if (subpart.contains("Department")) return(null);
    if (subpart.contains("National")) return(null);
    if (subpart.contains("Museum")) return(null);
    if (subpart.contains("Films")) return(null);
    if (subpart.contains("TV")) return(null);
    if (subpart.contains("Response")) return(null);
    if (subpart.contains("Cities")) return(null);
    if (subpart.contains("High")) return(null);
    if (subpart.endsWith("Productions")) return(null);
    if (subpart.startsWith("Written")) return(null);
    if (subpart.startsWith("Writer")) return(null);
    if (subpart.equalsIgnoreCase("Various")) return(null);
    if (subpart.equalsIgnoreCase("Editor")) return(null);
    if (subpart.equalsIgnoreCase("Executive")) return(null);
    if (subpart.equalsIgnoreCase("Story")) return(null);
    if (subpart.startsWith("English")) return(null);
    if (subpart.startsWith("Company")) return(null);
    if (subpart.matches(".*[Pp]roducer.*")) return(null);
    if (subpart.matches("[Dd]irector[s]?")) return(null);
    if (subpart.equalsIgnoreCase("Screenplay")) return(null);
    // A colon or more than five spaces means this is a phrase, not a name
    if (subpart.contains(":")|| subpart.replaceAll("[^ ]", "").length() > 5)
        return(null);
    subpart = DataUtil.cleanData(subpart);
    if (subpart.length() == 0) return(null);
    return(subpart);
}
}