mirror of
https://github.com/fhem/fhem-mirror.git
synced 2025-05-04 22:19:38 +00:00
1218 lines
47 KiB
Perl
1218 lines
47 KiB
Perl
package Normalize::Text::Music_Fields; # Music_Normalize_Fields
|
||
$VERSION = '0.02';
|
||
use strict;
|
||
use Config;
|
||
#use utf8; # Needed for 5.005...
|
||
|
||
my %tr;
|
||
my %short;
|
||
|
||
sub translate_dots ($) {
|
||
my $a = shift;
|
||
$a =~ s/^\s+//;
|
||
$a =~ s/\s+$//;
|
||
$a =~ s/\s+/ /g;
|
||
$a =~ s/\b(\w)\.\s*/$1 /g;
|
||
$a =~ s/(\w\.)\s*/$1 /g;
|
||
lc $a
|
||
}
|
||
|
||
sub translate_tr ($) {
|
||
my $a = shift;
|
||
$a = $tr{translate_dots $a} or return;
|
||
return $a;
|
||
}
|
||
|
||
sub strip_years ($) { # strip dates
|
||
my ($a) = (shift);
|
||
my @rest;
|
||
return $a unless $a =~ s/\s+((?:\([-\d,]+\)(\s+|$))+)$//;
|
||
@rest = split /\s+/, $1;
|
||
return $a, @rest;
|
||
}
|
||
|
||
sub strip_duplicate_dates { # Remove $d[0] if it matches $d_r
|
||
my ($d_r, @d) = @_;
|
||
return unless @d;
|
||
$d_r = substr $d_r, 1, length($d_r) - 2; # Parens
|
||
my $dd = substr $d[0], 1, length($d[0]) - 2; # Parens
|
||
my @dates_r = split /,|--|-(?=\d\d\d\d)/, $d_r;
|
||
my @dates = split /,|--|-(?=\d\d\d\d)/, $dd;
|
||
for my $d (@dates) {
|
||
return @d unless grep /^\Q$d\E(-|$)/, @dates_r;
|
||
}
|
||
return @d[1..$#d];
|
||
}
|
||
|
||
sub __split_person ($) {
|
||
# Non-conflicting ANDs (0x438 is cyrillic "i", word is cyrillic "per")
|
||
split /([,;:]\s+(?:\x{043f}\x{0435}\x{0440}\.\s+)?|\s+(?:[-&\x{0438}ei]|and|et)\s+|\x00)/, shift;
|
||
}
|
||
|
||
sub _translate_person ($$$);
|
||
sub _translate_person ($$$) {
|
||
my ($self, $aa, $with_year) = (shift, shift, shift);
|
||
my $fail = ($with_year & 2);
|
||
$with_year &= 1;
|
||
my $ini_a = $aa;
|
||
$aa = $aa->[0] if ref $aa; # [value, handler]
|
||
$aa =~ s/\s+$//;
|
||
load_lists() unless %tr;
|
||
# Try early fixing:
|
||
my $a1 = translate_tr $aa;
|
||
return ref $ini_a ? [$a1, $ini_a->[1]] : $a1 if $a1 and $with_year;
|
||
my ($a, @date) = strip_years($aa);
|
||
my $tr_a = translate_tr $a;
|
||
if (not defined $tr_a and $a =~ /(.*?)\s*,\s*(.*)/s) { # Schumann, Robert
|
||
$tr_a = translate_tr "$2 $1";
|
||
}
|
||
if (not defined $tr_a) {
|
||
return if $fail;
|
||
my $ini = $aa;
|
||
# Normalize "translated" to "transl."
|
||
# echo "¯¥à¥¢®¤" | perl -wnle 'BEGIN{binmode STDIN, q(encoding(cp866))}printf qq(\\x{%04x}), ord $_ for split //'
|
||
$aa =~ s/(\s\x{043f}\x{0435}\x{0440})\x{0435}\x{0432}\x{043e}\x{0434}\x{0435}?(\s)/$1.$2/g;
|
||
$aa =~ s/(\s+)\x{0432}\s+(?=\x{043f}\x{0435}\x{0440}\.)/;$1/g; # v per. ==> , per.
|
||
$aa =~ s/[,;.]\s+(\x{043f}\x{0435}\x{0440}\.)\s*/; $1 /g; # normalize space, punct
|
||
$aa =~ s/\b(transl)ated\b/$1./g;
|
||
|
||
my @parts = __split_person $aa;
|
||
if (@parts <= 1) { # At least normalize spacing:
|
||
# Add dots after initials
|
||
$aa =~ s/\b(\w)\s+(?=(\w))/
|
||
($1 ne lc $1 and $2 ne lc $2) ? "$1." : "$1 " /eg;
|
||
# Separate initials by spaces unless in a group of initials
|
||
$aa =~ s/\b(\w\.)(?!$|[-\s]|\w\.)/$1 /g;
|
||
return ref $ini_a ? [$aa, $ini_a->[1]] : $aa;
|
||
}
|
||
for my $i (0..$#parts) {
|
||
next if $i % 2; # Separator
|
||
my $val = _translate_person($self, $parts[$i], $with_year | 2); # fail
|
||
# Deal with cases (currently, in Russian only, after "transl.")
|
||
if (not defined $val and $i
|
||
and $parts[$i-1] =~ /^;\s+\x{043f}\x{0435}\x{0440}\.\s+$/ # per
|
||
and $parts[$i] =~ /(.*)\x{0430}$/s) {
|
||
$val = _translate_person($self, "$1", $with_year | 2); # fail
|
||
}
|
||
$val ||= _translate_person($self, $parts[$i], $with_year); # cosmetic too
|
||
$parts[$i] = $val if defined $val;
|
||
}
|
||
$tr_a = join '', @parts;
|
||
return $ini_a if $tr_a eq $ini;
|
||
@date = (); # Already taken into account...
|
||
}
|
||
my ($short, @date_r) = strip_years($tr_a); # Real date
|
||
@date = strip_duplicate_dates($date_r[0], @date) if @date_r == 1 and @date;
|
||
$tr_a = $short unless $with_year;
|
||
$a = join ' ', $tr_a, @date;
|
||
return ref $ini_a ? [$a, $ini_a->[1]] : $a;
|
||
}
|
||
|
||
sub normalize_person ($$) {
|
||
return _translate_person(shift, shift, 1);
|
||
}
|
||
|
||
for my $field (qw(artist artist_collection)) {
|
||
no strict 'refs';
|
||
*{"normalize_$field"} = \&normalize_person;
|
||
}
|
||
|
||
sub short_person ($$);
|
||
sub short_person ($$) {
|
||
my ($self, $a) = (shift, shift);
|
||
my $ini_a = $a;
|
||
$a = $a->[0] if ref $a; # [value, handler]
|
||
$a = _translate_person($self, $a, 0); # Normalize, no dates of life
|
||
$a =~ s/\s+$//;
|
||
($a, my @date) = strip_years($a);
|
||
my @parts;
|
||
if (exists $short{$a}) {
|
||
$a = $short{$a};
|
||
} elsif (@parts = __split_person $a and @parts > 1) {
|
||
for my $i (0..$#parts) {
|
||
next if $i % 2; # Separator
|
||
$parts[$i] = short_person($self, $parts[$i]);
|
||
}
|
||
$a = join '', @parts;
|
||
} else {
|
||
# Drop years of life
|
||
shift @date if @date and $date[0] =~ /^\(\d{4}-[-\d,]*\d{4,}[-\d,]*\)$/;
|
||
# Add dots after initials
|
||
$a =~ s/\b(\w)\s+(?=(\w))/
|
||
($1 ne lc $1 and $2 ne lc $2) ? "$1." : "$1 " /eg;
|
||
# Separate initials by spaces unless in a group of initials
|
||
$a =~ s/\b(\w\.)(?!$|[-\s]|\w\.)/$1 /g;
|
||
my @a = split /\s+/, $a;
|
||
# Skip shorting if there are strange non upcased parts (e.g., "-") or '()')
|
||
my @check = @a;
|
||
my $von = (@a > 2 and $a[-2] =~ /^[a-z]+$/);
|
||
splice @check, $#a - 1, 1 if $von;
|
||
# Ignore mid parts (skip if there are non upcased parts (e.g., "-") or '()')
|
||
unless (grep lc eq $_, @check or @a <= 1 or $a =~ /\(|[,;]\s/) {
|
||
my $i = substr($a[0], 0, 1);
|
||
$a[0] = "$i." if $a[0] =~ /^\w\w/ and lc($i) ne $i;
|
||
# Keep "from" in L. van Beethoven, M. di Falla, I. von Held, J. du Pre
|
||
@a = @a[0,($von ? -2 : ()),-1];
|
||
}
|
||
$a = join ' ', @a;
|
||
}
|
||
$a = join ' ', $a, @date;
|
||
return ref $ini_a ? [$a, $ini_a->[1]] : $a;
|
||
}
|
||
|
||
my %comp;
|
||
|
||
sub normalize_file_lines ($$) { # Normalizing speeds up load_composer()
|
||
my ($self, $fn) = @_;
|
||
open my $f, '<', $fn or die "Can't open file $fn for read";
|
||
local $_;
|
||
print "# normalized\n";
|
||
while (<$f>) {
|
||
next if /^#\s*normalized\s*$/;
|
||
chomp;
|
||
$_ = normalize_piece($self, $_) unless /^\s*#/;
|
||
print "$_\n";
|
||
}
|
||
close $f or die "Can't close file $fn for read";
|
||
}
|
||
|
||
sub _significant ($$$) { # Try to extract "actual name" of the piece
|
||
my ($tbl, $l, $r) = (shift, shift, shift);
|
||
my ($pre, $opus);
|
||
if ($tbl->{no_opus_no}) { # Remove year-like comment
|
||
($pre) = ($l =~ /^(.*\S)\s*\(\d{4}\b[^()]*\)$/s);
|
||
} else {
|
||
($pre, $opus) = ($l =~ /$r/);
|
||
}
|
||
$pre = $l unless $pre;
|
||
my ($significant) = ($pre =~ /^(.*?\bNo[.]?\s*\d+)/is); # Up to No. NN
|
||
($significant) = ($pre =~ /^(.*?);/s) unless $significant;
|
||
($significant) = $pre unless $significant;
|
||
(lc $significant, $opus);
|
||
}
|
||
|
||
my $def_opus_rx = qr/\b(?:Op(?:us\b|\.)|WoO)\s*\d+[a-d]?(?:[.,;\s]\s*No\.\s*\d+(?:\.\d+)*)?/;
|
||
|
||
sub _read_composer_file ($$*$$) {
|
||
my($self, $f, $fh, $tbl, $aka) = (shift,shift,shift,shift,shift);
|
||
my($normalized, $l, @works, %aka, $opened);
|
||
my $opus_rx = $tbl->{opus_rx} || $def_opus_rx;
|
||
my $opus_pref = $tbl->{opus_prefix} || 'Op.';
|
||
local $/ = "\n"; # allow customization
|
||
if (defined $fh) {
|
||
$f |= "composer's file" . (eval {' for ' . $self->name_for_field_normalization} || '');
|
||
} else {
|
||
open COMP, "< $f" or die "Can't read $f: $!";
|
||
$fh = \*COMP;
|
||
$f = "`$f'";
|
||
$opened = 1;
|
||
}
|
||
while (defined ($l = <$fh>)) {
|
||
next if $l =~ /^\s*(?:##|$)/;
|
||
if ($l =~ /^#\s*normalized\s*$/) {
|
||
$normalized++; # Very significant optimization (unless mail-header)
|
||
} elsif ($l =~ /^#\s*opus_rex\s(.*?)\s*$/) {
|
||
$opus_rx = $tbl->{opus_rx} = qr/$1/;
|
||
} elsif ($l =~ /^#\s*dup_opus_rex\s(.*?)\s*$/) {
|
||
$tbl->{dup_opus_rx} = qr/$1/;
|
||
} elsif ($l =~ /^#\s*opus_prefix\s(.*?)\s*$/) {
|
||
$opus_pref = $tbl->{opus_prefix} = $1;
|
||
} elsif ($l =~ /^#\s*no_opus_no\s*$/) {
|
||
$tbl->{no_opus_no} = 1;
|
||
} elsif ($l =~ /^#\s*opus_dup\s+(.*?)\s*$/) {
|
||
$tbl->{dup_opus}{lc $1} = 1;
|
||
} elsif ($l =~ /^#\s*prev_aka\s+(.*?)\s*$/) {
|
||
$aka->{$1} = $works[-1]; # recognize also alternative names
|
||
} elsif ($l =~ /^#\s*format\s*=\s*(line|mail-header)\s*$/) {
|
||
$/ = ($1 eq 'line' ? "\n" : '');
|
||
} elsif ($l =~ /^#[^#]/) {
|
||
warn "Unrecognized line of $f: $l"
|
||
} elsif ($l !~ /^##/) { # Recursive call to ourselves...
|
||
if ($normalized) {
|
||
$l =~ s/\s*$//; # chomp...
|
||
} elsif ($/) {
|
||
$l = normalize_piece($self, $l);
|
||
} else {
|
||
$l = normalize_piece_mail_header($self, $l, $opus_rx, $opus_pref);
|
||
}
|
||
push @works, $l;
|
||
}
|
||
}
|
||
not $opened or close $fh or die "Error reading $f: $!";
|
||
@works;
|
||
}
|
||
|
||
sub read_composer_file ($$;*) {
|
||
my($self, $f, $fh) = (shift,shift,shift);
|
||
$self = prepare_tag_object_comp($self) unless ref $self;
|
||
_read_composer_file($self, $f, $fh,{},{});
|
||
}
|
||
|
||
my @path;
|
||
@path = ("$ENV{HOME}/.music_fields")
|
||
if defined $ENV{HOME} and -d "$ENV{HOME}/.music_fields";
|
||
push @path, '-';
|
||
@path = split /\Q$Config{path_sep}/, $ENV{MUSIC_FIELDS_PATH}
|
||
if defined $ENV{MUSIC_FIELDS_PATH};
|
||
|
||
sub set_path {
|
||
@path = @_;
|
||
}
|
||
|
||
(my $myself = __PACKAGE__) =~ s,::,/,g; # 'Normalize/Text/Music_Fields.pm'
|
||
my @f = $INC{"$myself.pm"};
|
||
warn("panic: can't find myself"), @f = () unless -r $f[0];
|
||
s(\.pm$)()i or (@f=(), warn "panic: misformed myself") for @f;
|
||
|
||
sub get_path () {
|
||
map +($_ eq '-' ? @f : $_), @path;
|
||
}
|
||
|
||
sub load_composer ($$) {
|
||
my ($self, $c) = @_;
|
||
eval {$c = $self->shorten_person($c)};
|
||
my $ini = $c;
|
||
return $comp{$ini} if exists $comp{$ini};
|
||
$c =~ s/[^-\w]/_/g;
|
||
$c =~ s/__/_/g;
|
||
# XXX See Wikipedia "Opus number" for more complete logic
|
||
$comp{$ini}{opus_rx} = $def_opus_rx;
|
||
$comp{$ini}{opus_prefix} = 'Op.';
|
||
my @dirs = get_path();
|
||
my @files = grep -r $_, map "$_/$c.comp", @dirs or return 0;
|
||
my $f = $files[0];
|
||
# $f = $c =~ tr( ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ\x80-\x9F)
|
||
# ( !cLXY|S"Ca<__R~o+23'mP.,1o>...?AAAAAAACEEEEIIIIDNOOOOOx0UUUUYpbaaaaaaaceeeeiiiidnooooo:ouuuuyPy_)
|
||
# unless -r $f;
|
||
#warn "file looked up is $f";
|
||
return $comp{$ini} unless -r $f;
|
||
my $tbl = $comp{$ini};
|
||
my ($normalized);
|
||
my @works = _read_composer_file($self, $f, undef, $tbl, \my %aka);
|
||
return unless @works;
|
||
# Piano Trio No. 8 (Arrangement of the Septet; Op. 20)); Op. 38 (1820--1823)
|
||
# so can't m/.*?/
|
||
my $r = qr/^(.*($tbl->{opus_rx}))/s;
|
||
# Name "as in Wikipedia:Naming conventions (pieces of music)"
|
||
my (%opus, %name, %dup, %dupop);
|
||
for my $l (@works) {
|
||
my ($significant, $opus) = _significant($tbl, $l, $r);
|
||
if ($significant and $name{$significant}) {
|
||
$dup{$significant}++;
|
||
warn "Duplicate name `$significant': <$l> <$name{$significant}>"
|
||
if $ENV{MUSIC_DEBUG_TABLE};
|
||
}
|
||
$name{$significant} = $l if $significant;
|
||
$opus or next;
|
||
$opus = lc $opus;
|
||
if ($opus{$opus}) {
|
||
$dupop{$opus}++;
|
||
warn "Duplicate opus number `$opus': <$l> <$opus{$opus}>"
|
||
unless $tbl->{dup_opus_rx} and $opus =~ /$tbl->{dup_opus_rx}/
|
||
or $tbl->{dup_opus}{$opus};
|
||
}
|
||
$opus{$opus} = $l;
|
||
}
|
||
delete $name{$_} for keys %dup;
|
||
delete $opus{$_} for keys %dupop;
|
||
for my $s (keys %aka) {
|
||
my ($n) = _significant($tbl, $s, $r);
|
||
warn "Duplicate and/or unnecessary A.K.A. name `$s' for <$aka{$s}>"
|
||
if $name{$n};
|
||
$name{$n} = $aka{$s};
|
||
$name{"\0$s"} = "\0$n"; # put into values(), see normalize_person()
|
||
}
|
||
$tbl->{works} = \@works;
|
||
$tbl->{opus} = \%opus if %opus;
|
||
$tbl->{name} = \%name if %name;
|
||
$tbl;
|
||
}
|
||
|
||
sub translate_signature ($$$$) { # One should be able to override this
|
||
shift;
|
||
join '', @_;
|
||
}
|
||
$Normalize::Text::Music_Fields::translate_signature = \&translate_signature;
|
||
|
||
my %alteration = (dur => 'major', moll => 'minor');
|
||
my %mod = (is => 'sharp', es => 'flat', s => 'flat', # since Es means Ees
|
||
'#' => 'sharp', b => 'flat');
|
||
|
||
# XXXX German ==> English (nontrivial): H ==> B, His ==> B sharp, B ==> B flat
|
||
# XXXX Do not touch B (??? Check "Klavier" etc to detect German???)
|
||
my %key = (H => 'B');
|
||
|
||
sub normalize_signature ($$$$) {
|
||
my ($self, $key, $mod, $alteration) = @_;
|
||
$alteration ||= ($key =~ /[A-Z]/) ? ' major' : ' minor';
|
||
$alteration = lc $alteration;
|
||
$alteration =~ s/^-?\s*/ /;
|
||
$alteration =~ s/(\w+)/ $alteration{$1} || $1 /e;
|
||
$mod =~ s/^-?\s*/ / if $mod; # E-flat, Cb
|
||
$mod = lc $mod;
|
||
$mod =~ s/(\w+|#)/ $mod{$1} || $1 /e;
|
||
$key = uc $key;
|
||
$key = $key{$key} || $key;
|
||
&$Normalize::Text::Music_Fields::translate_signature($self,$key,$mod,$alteration);
|
||
}
|
||
|
||
my $post_opus_rex = qr/(?:[\-\/](?=\d)|(?:[,;.]?|\s)\s*(?:\bN(?:[or]|(?=\d))\.?|#|\x{2116}\.?))\s*(?=\d)/;
|
||
|
||
sub normalize_opus ($$$) {
|
||
my ($self, $op, $no) = (shift, shift, shift);
|
||
my $have_no = ( $op =~ s/\b(?:[,;.]?|\s)\s*(?=No\.\s*\d+)/, / );
|
||
$no = '' unless defined $no;
|
||
# nr12 n12 12 -12 #12 Numero_Sign 12 - but only if $op has no number already!
|
||
$no =~ s/^$post_opus_rex/, No. / unless $have_no;
|
||
# Now the tricky part: normalize the stuff in unknown format;
|
||
# XXXX Now support only "B. NNN" stuff
|
||
$op =~ s/^(\w)(\b|(?=\d))\.?\s*/\U$1. /;
|
||
"$op$no"
|
||
}
|
||
|
||
# 1: prefix ("in" etc.), 2: letter, 3: modifier ("b" etc), 4: alteration: minor etc.
|
||
my $signature_rex = qr/(\s*(?:\bin\b|[,;.:]|^|\((?:in\s+)?(?=[-a-zA-Z#\s]+\)))\s*)([a-h])(\s*[b#]|(?:\s+|-)(?:flat|sharp)|[ie]s|(?<=e)s|)((?:(?:\s+|-)(?:major|minor|dur|moll))?)\)?(?=\s*[-;":]|$)/i;
|
||
|
||
# All these should match in
|
||
# mp3info2 -D -a beethoven -t "# 28" ""
|
||
# (should give the same results): "wind in C" "tattoo" "WoO 20"
|
||
# "sonata in F#" "piano in F#" "op78" "Op. 10-2" "Op. 10, #2" "sonata #22" "WoO 205-1"
|
||
|
||
sub find_person ($) {
|
||
my $self = shift;
|
||
eval {$self->name_for_field_normalization} || eval {$self->composer}
|
||
|| $self->artist;
|
||
}
|
||
|
||
# See test_normalize_piece()
|
||
sub _normalize_piece ($$$$) {
|
||
my ($self, $n, $improve_opus, $by_opus) = (shift, shift, shift, shift);
|
||
my $ini_n = $n;
|
||
$n = $n->[0] if ref $n; # [value, handler]
|
||
return $ini_n unless $n;
|
||
$n =~ s/^\s+//;
|
||
$n =~ s/\s+$//;
|
||
return $ini_n unless $n;
|
||
$n =~ s/\s{2,}/ /g;
|
||
|
||
# Opus numbers
|
||
$n =~ s/\bOp(us\s+(?=\d)|[.\s]\s*|\.?(?=\d))/Op. /gi; # XXXX posth.???
|
||
$n =~ s/\bN(?:[or]|(?=\d))\.?\s*(?=\d)/No. /gi; # nr12 n12
|
||
$n =~ s/(?<!\w)[#\x{2116}]\s*(?=\d)/No. /gi; # #12, Numero Sign 12
|
||
|
||
my $c = find_person $self;
|
||
my $tbl = ($c and load_composer($self, $c)) || {};
|
||
my $opus_rx = $tbl->{opus_rx} || $def_opus_rx;
|
||
|
||
# XXXX Is this `?' for good?
|
||
$n =~ s/(?<=[^(.,;\s])(\s*[.,;])?\s*\b(?=$opus_rx)/; /gi
|
||
if $improve_opus; # punctuation before Op.
|
||
|
||
# punctuation between Op. and No (as in Wikipedia for most expanded listings)
|
||
# $n =~ s/\b((Op\.|WoO)\s+\d+[a-d]?)(?:[,;.]?|\s)\s*(?=No\.\s*\d+)/$1, /gi;
|
||
$n =~ s/($opus_rx)($post_opus_rex\d+)?/ normalize_opus($self, $1, $2) /gie;
|
||
|
||
# Tricky part: normalize "In b#"; allow just b# after punctuation too
|
||
$n =~ s/$signature_rex/
|
||
((not $1 or 'i' eq substr($1,0,1)) ? '' : ' ') . "in "
|
||
. normalize_signature($self,"$2","$3","$4")/ie;
|
||
my $canon;
|
||
{
|
||
$tbl or last;
|
||
# Convert Op. 23-3 to Op. and No
|
||
# my ($o, $no) = ($n =~ /\b(Op\.\s+\d+[a-d]?[-\/]\d+[a-d]?)((?:[,;.]?|\s)\s*(?:No\.\s*\d+))?/);
|
||
# $n =~ s/\b(Op\.\s+\d+[a-d]?)[-\/](\d+[a-d]?)/$1, No. $2/i
|
||
# if $o and not $no and $o !~ /^$opus_rx$/;
|
||
$tbl->{works} or last;
|
||
# XXX See Wikipedia "Opus number" for more complete logic
|
||
my ($opus) = ($n =~ /^.*($opus_rx)/); # at the end (one not in comments!)
|
||
if ($opus and $by_opus) {
|
||
$canon = $tbl->{opus}{lc $opus} or last;
|
||
} else { # $significant: Up to the first "No. NNN.N", or to the first ";"
|
||
my ($significant, $pre, $no, $post) =
|
||
($n =~ /^((.*?)\bNo\b[.]?\s*(\d+(?:\.\d+)*))\s*(.*)/is);
|
||
($significant) = ($n =~ /^(.*?);/s) unless $significant;
|
||
$significant ||= $n;
|
||
$canon = $tbl->{name}{lc $significant}; # Try exact match
|
||
if (not $canon) { # Try harder: match word-for-word
|
||
my ($ton, $rx_pre, $rx_post) = ('') x 3;
|
||
my $nn = $n;
|
||
if ($nn =~ s/\b(in\s+[A-H](?:\s+(?:flat|sharp))?\s+(?:minor|major))\b//) {
|
||
$ton = $1;
|
||
($significant, $pre, $no, $post) = # Redo with $nn
|
||
($nn =~ /^((.*?)\bNo\b[.]?\s*(\d+(?:\.\d+)*))\s*(.*)/is);
|
||
($significant) = ($nn =~ /^(.*?);/s) unless $significant;
|
||
$significant ||= $nn;
|
||
$ton = '.*\b' . (quotemeta $ton) . '\b';
|
||
}
|
||
$pre = $significant unless defined $pre; # Same with No removed
|
||
# my @parts2 = split '\W+', $post;
|
||
if ($pre and $pre =~ /\w/) {
|
||
$rx_pre = '\b' . join('\b.*\b', split /\W+/, $pre) . '\b';
|
||
}
|
||
if ($post and $post =~ /\w/) {
|
||
$rx_post = '.*' . join '\b.*\b', split /\W+/, $post;
|
||
}
|
||
# warn "<$no> <$n> <$nn> <$ton> <$rx_pre> <$rx_post>";
|
||
$no = '.*\bNo\.\s*' . (quotemeta $no) . '\b(?!\.\d)' if $no;
|
||
$no = '' unless defined $no;
|
||
last unless "$rx_pre$no$ton$rx_post";
|
||
my $sep = $tbl->{no_opus_no} ? '' : '.*;';
|
||
my $rx = qr/$rx_pre$no$ton$rx_post$sep/is;
|
||
my @matches = grep /$rx/, values %{$tbl->{name}};
|
||
if (@matches == 1) {
|
||
$canon = $matches[0];
|
||
} elsif (!@matches) {
|
||
last;
|
||
} else { # Many matches; maybe the shortest is substr of the rest?
|
||
my ($l, $s, $diff) = 1e100;
|
||
$l > length and ($s = $_, $l = length) for @matches;
|
||
$s eq substr $_, 0, $l or ($diff = 1, last) for @matches;
|
||
last if $diff;
|
||
$canon = $s;
|
||
}
|
||
$canon = $tbl->{name}{$canon} if $canon =~ s/^\0//s; # short name
|
||
}
|
||
}
|
||
# if ($canon) {
|
||
# my (%w, %w1);
|
||
# for my $w (split /[-.,;\s]+/, $canon) {
|
||
# $w{lc $w}++;
|
||
# }
|
||
# for my $w (split /[-.,;\s]+/, $n) {
|
||
# $w1{lc $w}++ unless $w{lc $w};
|
||
# }
|
||
# if (%w1) {
|
||
# warn "Unknown words in title: `", join("` '", sort keys %w1), "'"
|
||
# unless $ENV{MUSIC_TRANSLATE_FIELDS_SKIP_WARNINGS};
|
||
# last
|
||
# }
|
||
# }
|
||
$n = $canon; # XXXX Simple try (need to compare word-for-word)
|
||
}
|
||
return ref $ini_n ? [$n, $ini_n->[1]] : $n;
|
||
}
|
||
|
||
sub normalize_piece ($$) {
|
||
_normalize_piece(shift, shift, 'improve opus', 'by opus');
|
||
}
|
||
|
||
sub opus_parser ($) {
|
||
my $tag = shift;
|
||
my $c = find_person $tag;
|
||
my $tbl = ($c and load_composer($tag, $c));
|
||
my $opus_rx = $tbl->{opus_rx} || $def_opus_rx;
|
||
my $opus_pre = $tbl->{opus_prefix} || 'Op.';
|
||
($opus_rx, $opus_pre, $c)
|
||
}
|
||
|
||
sub full_opus ($$;$$) {
|
||
my ($tag, $short, $opus_rx, $opus_pref) = (shift, shift, shift, shift);
|
||
($opus_rx, $opus_pref) = opus_parser($tag) unless $opus_rx;
|
||
|
||
$short = "$opus_pref $short" if $short =~ /^\d/ and not $short =~ /$opus_rx/;
|
||
$short =~ s/^($opus_rx)($post_opus_rex\d+)?/ normalize_opus($tag, $1, $2) /gie;
|
||
$short
|
||
}
|
||
|
||
# Currently used Title-* fields: RAW, Opus, Dates, Key, Name, Related-Name,
|
||
# Alternative-Name, Punct, Type, Count, For, Type-After-Name, In-Movements
|
||
# Related-On, Comment, Related-After, Name-By-First-Row
|
||
## [When new added, change also the "merging" logic in merge_info().]
|
||
sub normalize_mail_header_line ($$;$$) {
|
||
my ($tag, $in, $opus_rx, $opus_pref) = (shift, shift, shift, shift);
|
||
my ($t, $v) = $in =~ /^([-\w]+):\s*(.*)$/s or die;
|
||
$v = "($v)" if $t eq 'Title-Dates';
|
||
$v = full_opus $tag, $v, $opus_rx, $opus_pref
|
||
if $t eq 'Title-Opus' and $v =~ /(^\d|[\-\/])/;
|
||
$v = "; $v" if $t eq 'Title-Opus';
|
||
$v = qq("$v") if $t =~ /^Title(-Related)?-Name$/;
|
||
$v = qq(["$v"]) if $t =~ /^Title-Name-By-First-Row$/;
|
||
$v = qq(; "$v") if $t eq 'Title-Alternative-Name';
|
||
$v =~ s/^(in\s+)?/in /i if $t =~ 'Title-Key';
|
||
$v = "No. $v" if $t eq 'Title-No';
|
||
$v = "for $v" if $t eq 'Title-For';
|
||
$v = "on $v" if $t eq 'Title-Related-On';
|
||
$v = "(lyrics by $v)" if $t eq 'Title-Lyrics-By';
|
||
$v = ", $v" if $t eq 'Title-Type-After-Name';
|
||
$v;
|
||
}
|
||
|
||
## perl -wple "BEGIN {print q(# format = mail-header)} s/#\s*normalized\s*$//; $_ = qq(Title: $_) unless /^\s*(#|$)/; $_ = qq(\n$_) if $p and not /^##/; $_ .= qq(\n) unless $p = /^##/" Normalize::Text::Music_Fields-G_Gershwin.comp >Music_Fields-G_Gershwin.comp-mail
|
||
sub normalize_piece_mail_header ($$;$$) {
|
||
my ($tag, $in, $opus_rx, $opus_pref) = (shift, shift, shift, shift);
|
||
return $1 if $in =~ /^Title:\s*(.*?)\s*$/m;
|
||
my @pieces = map normalize_mail_header_line($tag, $_, $opus_rx, $opus_pref),
|
||
grep /^Title-[-\w]+:\s/, split /\n/, $in;
|
||
for my $i (1 .. @pieces - 1) {
|
||
$pieces[$i-1] .= ' '
|
||
unless $pieces[$i-1] =~ /[\(\[\{]$/ or $pieces[$i] =~ /^[\)\]\}.,;:?!]/;
|
||
}
|
||
return join '', @pieces;
|
||
}
|
||
|
||
sub shorten_opus ($$$$) { # $mp3, $str, $pre
|
||
my ($tag, $op, $pref, $rx) = (shift, shift, shift, shift);
|
||
my ($out, $cut) = ($op, '');
|
||
if ($out =~ s/^\Q$pref\E\s*(?=\d)//) {
|
||
if ($out =~ $rx) { # back up if shortened version causes confusion
|
||
$out = $op;
|
||
} else {
|
||
$cut = $pref;
|
||
}
|
||
}
|
||
my $out1 = $out;
|
||
if ($out =~ s/(\d[a-i]?),\s+No\.\s*(?=\d)/$1-/) {
|
||
my $o = full_opus($tag, $out, $rx, $pref);
|
||
if ($op ne $o or $out =~ /^$rx$/) { # check again
|
||
$out = $out1;
|
||
unless ($out eq $op) { # Extra sanity check
|
||
$o = full_opus($tag, $out, $rx, $pref);
|
||
$out = $op unless $op eq $o;
|
||
}
|
||
}
|
||
}
|
||
$out
|
||
}
|
||
|
||
my $main_instr = join '|', qw(Piano Violin Viola Cello Horn String Wind Harp
|
||
Instrument Clarinet Alto);
|
||
my $for_instr = join '|', qw(Mandolin Harpsichord chorus soprano alt bass
|
||
basses tenor mezzo-soprano \(mezzo\)soprano baritone contralto hand
|
||
soli soloists woodwinds celesta accordion instrumentalists large small
|
||
double violoncello clarinet oboe english french bassoon trombone organ
|
||
flute voice orchestra military band chamber symphonic symphony electric
|
||
percussion double-bass vibraphone pantomime instrumental ensemble tape
|
||
timpani bells keyboard guitar triple percussionist counter-tenor alto
|
||
counter-alto male female children's boys' mixed a capella cappella choir
|
||
basssoli chamberorchestra metronome triangle harmonium trumpet);
|
||
my $multiplets = join '|', qw(solo duo duet trio quartet quintet sextet septet octet);
|
||
my $pieces = join '|', qw(Serenada Serenade Romance Song Notturno Aria Mass
|
||
Allemande Chorus Allegretto Rondo Opera Fantasia Polonaise Contredanse
|
||
Prelude Andante Cadenza Bagatelle Cantata Aria Joke Waltz Waltzes Minuet
|
||
Ländler March Rondino Variations Equali Fugue Piece Symphony Sonata
|
||
Concerto Sonatina Dance Mignon Fantasy Scherzo Polka Moderato Fragment
|
||
Transcription Orchestration Suite Music Reduction Passacaglia Arrangement
|
||
accompaniment choral score Operetta Ballet oratorio Choruses Intermezzo
|
||
Overture Dialogue Epilogue Aphorism Monologue Gallop Interlude
|
||
Re-orchestration Reorchestration Cycle Potpourri Nocturne Capriccio
|
||
Mazurek Mazurka Impromptu Humoresque Ballade Ballads Gavotte Requiem
|
||
Fanfares Motet Rhapsodies Rhapsody Intermezzi Poem Marches Theme
|
||
Melody);
|
||
|
||
my $numb_rx = qr/one|two|three|four|five|six|seven|eight|nine/i;
|
||
|
||
my $count_rx = qr/ \d+
|
||
| (?:$numb_rx)(?:teen)?
|
||
| ten|eleven|twelve|thirteen|fifteen|eighteen
|
||
| (?:twenty|thirty|fourty|fifty|sixty|seventy|eighty|ninety)
|
||
(?: (?:\s+ | -) (?:$numb_rx) )? /ix;
|
||
|
||
#no utf8; # `use' is needed by 5.005
|
||
|
||
my $for_rx = qr/ (?:\s+|^)
|
||
for
|
||
(?: (?:\s+|(?<=\/)) \(?
|
||
(?:and|or|&|vocal\s+soloist|$main_instr|$for_instr|prepared\s+piano|magnetic\s+tape|stage\s+orchestra|jazz\s+ensemble|(?:vocal\s+)?(?:$multiplets)|$count_rx|[23456789]|[12345]\d|Große Fuge)
|
||
(?:s|\(s\))? \)?
|
||
[,\/]?
|
||
)+
|
||
/ix;
|
||
|
||
my $piece_rx = qr/ (?: (?:Transcription|Orchestration|Reduction|Arrangement|Suite|Instrumentation|Re-?orchestration)
|
||
\s+ of
|
||
(?: \s+ (?: $main_instr | the | $count_rx ) )?
|
||
\s+ )? # Mod
|
||
(?:
|
||
(?: $main_instr | Vocal | secular | sacred
|
||
| Double | Triple | Easy | Trio | Symphonic )
|
||
\s+ )? # Prefix
|
||
(?:Concerto\s+grosso | $multiplets
|
||
| Ecossaise?
|
||
| (?:[123456]-part\s+)? (?:riddle\s+)? Canon
|
||
| (?:sets\s+of\s+)? (?: chorale\s+preludes? | $pieces )
|
||
(?: s? \s* (?:\band\b|&) \s* (?:$pieces))?
|
||
| Incidental\s+music | electronic\s+composition
|
||
| chorale\s+prelude
|
||
| Musical\s+greetings? | choral\s+score | vocal\s+quartet
|
||
| (?:heroic|comic|tragic|historical)\s+opera
|
||
| scenic\s+composition | symphonic\s+poem ) # Main type
|
||
(?: s? \s+ in \s+ (?:$numb_rx) \s+ act )?
|
||
/ix;
|
||
|
||
#use utf8; # needed by 5.005
|
||
|
||
my $name_rx = qr/ (?: [A-Z]\w* \.? \s+)* [A-Z][-\'\w]+ /x;
|
||
|
||
my $rel_piece_rx = # Two Pieces for Erwin Dressel's Opera "Armer Columbus"
|
||
qr/ \b
|
||
(?:to|from|of|a\s+fter|for|on(?:\s+motives\s+of)?)
|
||
(?:
|
||
\s+ (?: \s+ music \s+ to)? (?: the | $name_rx\'s ) # Erwin Dressel's
|
||
(?: \s+ (?: (?:(?:silent|animated)\s+)? film | spectacle | comedy
|
||
| TV[-\s]+production | music\s+to\s+the\s+film
|
||
| play | (?:Chamber-?\s*)? opera | stage \s+ revue | novel))?)? \b
|
||
/ix;
|
||
|
||
|
||
sub strip_known_from_end ($$$) {
|
||
my ($tag, $in, $try_key, @tail) = (shift, shift, shift);
|
||
# E.g., when the second name is based on the first line of lyrics:
|
||
unshift @tail, "Title-Lyrics-By: $1" if $in =~ s/\s+\(lyrics\s+by\s+([^()]+)\)$//;
|
||
unshift @tail, "Title-Alternative-Name: $4"
|
||
while $in =~ s/^(.*?".*?".*)(\s*[.:,;])?\s+(?(2)|(?=\())(\()?"([^\"]+)"(?(3)\)|)$/$1/;
|
||
|
||
# Too much recognized as this if ???
|
||
while ( $in =~ s/ \s* ( $rel_piece_rx | (?!$) [.:,;]? )
|
||
(?: \s+
|
||
( (\[)? ["\x{201E}]([^\"\x{201C}\x{201E}]+)["\x{201C}] (?(3) \] | )
|
||
| \(["\x{201E}]([^\"\x{201C}\x{201E}]+)["\x{201C}]\) )) $
|
||
//xo ) {
|
||
if (length $1 <= 1) {
|
||
unshift @tail, "Title-Name: $+";
|
||
} else {
|
||
unshift @tail, "Title-Related-Name: $+" if $2;
|
||
unshift @tail, "Title-Related-How: $1";
|
||
}
|
||
}
|
||
unshift @tail, "Title-Related-By: after $1"
|
||
if $in =~ s/ \s* after \s+ ($name_rx) $//xo;
|
||
|
||
unshift @tail, "Title-Related-On: $+" # Variation and Fugue
|
||
if $in =~ s/ ( \b variations? (?: \s+ and \s+ $piece_rx)? (?:$for_rx)? )
|
||
\s+ on \s+ # on a Hungarian melody
|
||
(an? \s+ (?: (?: $name_rx | original ) \s+)? $piece_rx
|
||
(?: \s+ by \s+ $name_rx)? )$/$1/xio; # XXXX Why $+ needed?
|
||
|
||
unshift @tail, "Title-In-Movements: $1"
|
||
if $in =~ s/\s*(in\s+(a\s+single|$numb_rx|\d)\s+(movement|episode)s?)$//;
|
||
|
||
unshift @tail, "Title-Key: " . normalize_signature($tag, "$2", "$3", "$4")
|
||
if $in =~ s/\s*$signature_rex$//;
|
||
if ($in =~ s/\s*([.,;:])?\s+No\.\s*(\d+[a-d]?(\.\d+)?)$//i) {
|
||
unshift @tail, "Title-No: $2";
|
||
unshift @tail, "Title-Punct: $1" if $1;
|
||
}
|
||
|
||
unshift @tail, "Title-Key: " . normalize_signature($tag, "$2", "$3", "$4")
|
||
if $try_key and $in =~ s/[:;,]?\s*$signature_rex$//;
|
||
|
||
my $f;
|
||
($f = $1) =~ s/^\s*for\s*//, unshift @tail, "Title-For: $f"
|
||
if $in =~ s/($for_rx)$//io; # XXXX: foo arranged for piano ???
|
||
|
||
if ($in =~ s/\s*([.,;:])?\s+No.\s*(\d+[a-d]?(\.\d+)?)$//i) { # Repeat
|
||
unshift @tail, "Title-No: $2";
|
||
unshift @tail, "Title-Punct: $1" if $1;
|
||
}
|
||
|
||
($in, @tail);
|
||
}
|
||
|
||
sub parse_piece ($$$$$$$); # Predeclaration for recursive call without ()
|
||
sub parse_piece ($$$$$$$) {
|
||
my ($after_name, $at_end, $at_start, $tag, $in, $opus_pref, $opus_rx, @tail)
|
||
= (shift, shift, shift, shift, shift, shift, shift);
|
||
if ($at_end) {
|
||
unshift @tail, "Title-Dates: $2"
|
||
if $in =~ s/(.*\S)\s*\(([^()]*\b\d{4}\b[^()]*)\)$/$1/ # $1 makes greedy
|
||
or $at_end and not $at_start and
|
||
$in =~ s/^()\s*\(([^()]*\b\d{4}\b[^()]*)\)$/$1/; # $1 makes greedy
|
||
unshift @tail, "Title-Opus: " . shorten_opus($tag, "$2", $opus_pref, $opus_rx)
|
||
while $in =~ s/(.*);\s+($opus_rx)\s*$/$1/;
|
||
unshift @tail, "Title-Key: " . normalize_signature($tag, "$2", "$3", "$4")
|
||
if $in =~ s/\s*$signature_rex$//;
|
||
}
|
||
($in, my @r) = strip_known_from_end($tag, $in, 'look for key');
|
||
unshift @tail, @r;
|
||
|
||
# Now recognize comment as everything after a key (except, maybe, name)
|
||
if ($in =~ /^(.*\S)\s*$signature_rex\s*(?:"([^\"]+)"\s*)?(?:([.,:;])\s)?(.*)$/) {
|
||
$in = $1;
|
||
my $k = normalize_signature($tag, "$3", "$4", "$5");
|
||
my($n,$rest) = ($6, $8);
|
||
if (length $rest) {{ # Localize match
|
||
unshift @tail,
|
||
'Title-'. ($8 =~ /^[^\s\w]$/ ? 'Punct' : 'Comment'). ": $rest";
|
||
}}
|
||
unshift @tail, "Title-Punct: $7" if $7;
|
||
my $alt = ($in =~ /".*"/ ? '-Alternative' : '');
|
||
unshift @tail, "Title$alt-Name: $n" if defined $n and length $n;
|
||
unshift @tail, "Title-Key: $k";
|
||
}
|
||
|
||
# Now repeat looking for known fields
|
||
($in, @r) = strip_known_from_end($tag, $in, not 'look for key');
|
||
unshift @tail, @r;
|
||
|
||
if ($at_start) { # and (@tail or not $at_end)
|
||
unshift @tail, "Title-Type: $1" if $in =~ s/^($piece_rx s?)\s*$//iox;
|
||
unshift @tail, "Title-Count: $1" , "Title-Type: $2"
|
||
if $in =~ s/^($count_rx)\s+( $piece_rx s?)\s*$//iox;
|
||
unshift @tail, "Title-Count: $1"
|
||
if $in =~ s/^($count_rx)\s*$//iox;
|
||
}
|
||
if (not @tail and $at_start and $at_end) {
|
||
unshift @tail, "Title: $in";
|
||
} elsif (not length $in) { # Do nothing
|
||
} elsif ($in =~ /^\s*[-,:;.()\[\]{}]\s*$/) {
|
||
unshift @tail, "Title-Punct: $in";
|
||
} elsif ($after_name and $in =~ /^(by|after)((\s+and)?\s+[A-Z][-\'\w]+)+\s*$/) {
|
||
unshift @tail, "Title-Related-By: $in";
|
||
} elsif ($after_name and $in =~ /^([-,;:])\s+($piece_rx s?)\s*$/iox) {
|
||
unshift @tail, "Title-Type-After-Name: $2";
|
||
} elsif ($at_start and $in =~ /^"([^\"]+)"\s*$/iox) {
|
||
unshift @tail, "Title-Name: $1";
|
||
} else {
|
||
if ($at_start and $in =~ /^"([^\"]+)"[,.;:]\s*(\S.*?)\s*$/) {
|
||
my $name = $1; # Pretend we are at start:
|
||
my @rest = parse_piece 'after_name', ($at_end and not @tail), 'start',
|
||
$tag, "$2", $opus_pref, $opus_rx;
|
||
unshift @rest, "Title-Punct: ,"
|
||
unless $rest[0] =~ s/^Title-Type:/Title-Type-After-Name:/;
|
||
return("Title-Name: $name", @rest, @tail)
|
||
unless (join "\n", '', @rest) =~ /\nTitle-RAW:/;
|
||
}
|
||
unshift @tail, "Title-RAW: $in";
|
||
}
|
||
@tail;
|
||
}
|
||
|
||
my %html_esc = qw( amp & lt < gt > );
|
||
|
||
sub naive_format ($$$) { # Used to find glaring errors in conversion only
|
||
my ($tag, $in, $opus_rx, $opus, @out) = (shift,shift,shift);
|
||
$in =~ s/^($opus_rx)\n/$1: /;
|
||
my @in = split /\s*\n\s*/, $in;
|
||
if ($in[0] =~ s/^($opus_rx)[:,]\s*/Title-RAW: /) {
|
||
($opus = $1) =~ s/^Opus\b/Op./;
|
||
}
|
||
for my $l (@in) {
|
||
if ($l =~ s/^Title-Bold:\s*//) {
|
||
push @out, qq("$l");
|
||
} elsif ($l =~ s/^Title-Opus:\s*//) {
|
||
push @out, '; ' . full_opus $tag, "$l";
|
||
} elsif ($l =~ s/^Title-Dates:\s*//) {
|
||
push @out, "($l)";
|
||
} elsif ($l =~ s/^X-\w[-\w]*:\s*//) { # Do nothing
|
||
} elsif ($l =~ s/^Title-(RAW|Comment):\s*//) {
|
||
push @out, $l if length $l;
|
||
} else {
|
||
warn "Naive formatting: Unknown line format `$l'"
|
||
}
|
||
}
|
||
if (defined $opus) {
|
||
my @year;
|
||
@year = $1 if @out and $out[-1] =~ s/\s*(\([^()]*\b\d{4}\b[^()]*\))$//;
|
||
pop @out unless @out and length $out[-1];
|
||
push @out, "; $opus", @year;
|
||
}
|
||
for my $n (1..$#out) {
|
||
$out[$n] =~ s/^(?![.,;:])/ /;
|
||
}
|
||
join '', @out
|
||
}
|
||
|
||
# Convert from line-format to mail-header format:
|
||
## perl -MNormalize::Text::Music_Fields -wlne "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print q(# format = mail-header)} print Normalize::Text::Music_Fields::emit_as_mail_header($tag,$_, 0,$pre)" gershwin Music_Fields-G_Gershwin.comp-line >Music_Fields-G_Gershwin.comp-mail1
|
||
# (inverse transformation:) Dump pieces listed in mail-header format
|
||
## perl -MNormalize::Text::Music_Fields -wle "print for Normalize::Text::Music_Fields::read_composer_file(shift, shift)" gershwin Music_Fields-G_Gershwin.comp-mail > o
|
||
#
|
||
## perl -MNormalize::Text::Music_Fields -00wnle "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print q(# format = mail-header)} print Normalize::Text::Music_Fields::emit_as_mail_header($tag,$_, q(bold,xml,opus),$pre)" shostakovich o-xslt-better >Music_Fields-D_Shostakovich.comp-mail1
|
||
## perl -MNormalize::Text::Music_Fields -wnle "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print qq(# format = mail-header\n)} print Normalize::Text::Music_Fields::emit_as_mail_header($tag,$_, q(opus), $pre)" schnittke o-schnittke-better >Music_Fields-A_Schnittke.comp-mail2
|
||
sub emit_as_mail_header ($$$$) { # $mp3, $str, $has_bold_parts_etc, $pre [R/W]
|
||
my ($tag, $in, $preformatted) = (shift, shift, shift);
|
||
$in =~ s/#\s*normalized\s*$//;
|
||
#return "\n" if $in =~ /^\s*$/;
|
||
my @out;
|
||
unless ($in =~ /^\s*(#|$)/) {
|
||
return "\n\n" if $preformatted and $in =~ /^<\?xml\b/;
|
||
my $ini = my $ini_raw = $in;
|
||
$in =~ s/&(amp|lt|gt);/$html_esc{$1}/g if $preformatted =~ /\bxml\b/;
|
||
$in =~ s/&#x([\da-f]+);/chr hex $1/gei if $preformatted =~ /\bxml\b/;
|
||
|
||
my ($opus_rx, $opus_pre) = opus_parser($tag);
|
||
|
||
my $have_op = ($in =~ /^$opus_rx:/);
|
||
# When $use_only_opus, all the text but Opus-No is ignored; bad for update
|
||
my $use_only_opus = ($preformatted =~ /\bonly_by_opus\b/);
|
||
$in = _normalize_piece($tag, $in, !$have_op, $use_only_opus)
|
||
unless $preformatted =~ /\bbold\b/;
|
||
|
||
$ini = naive_format($tag, $in, $opus_rx) if $preformatted =~ /\b(opus|bold)\b/;
|
||
my @op;
|
||
my $prefix = ($preformatted =~ /\bbold\b/ ? 'Title-RAW: ' : '');
|
||
if ($in =~ s/^($opus_rx)(?:[:,](?:[ \t]+|(?=\n))|\n\s*)/$prefix/) {
|
||
my $op = $1;
|
||
my $o_pre = $opus_pre;
|
||
$o_pre = 'Opus' if $op =~ /^Opus\b/;
|
||
@op = "Title-Opus: " . shorten_opus($tag, $op, $o_pre, $opus_rx);
|
||
} elsif ($preformatted =~ /\bopus\b/) {
|
||
warn "Expected to start with `Opus NUMBER: ': <<<$in>>>";
|
||
}
|
||
if ($preformatted =~ /\bbold\b/) {
|
||
my @parts = split /\s*\n\s*/, $in;
|
||
my ($after_for, $after_name);
|
||
for my $n (0..$#parts) {
|
||
my $p = $parts[$n];
|
||
$p =~ s/\s+$//;
|
||
if ($p =~ s/^Title-Bold:\s*//) {
|
||
my $rel = $after_for ? '-Related' : '';
|
||
push @out, "Title$rel-Name: $p";
|
||
$after_for = 0, $after_name = 1;
|
||
next;
|
||
} elsif ($p =~ /^Title-RAW:\s*$/) { # Do nothing
|
||
next;
|
||
} elsif ($after_for =
|
||
($n != $#parts and $parts[$n+1] =~ /^Title-Bold:\s*/
|
||
and $parts[$n] =~ /^Title-RAW:\s*/
|
||
# Title-RAW: Two Pieces for Erwin Dressel's Opera "Armer Columbus"
|
||
and $p =~ s/ \s* ( $rel_piece_rx \s*$ )//ixo)) {
|
||
my $how = $1;
|
||
$p =~ s/^Title-RAW:\s+//
|
||
or warn "Expected to start with Title-RAW: <<<$p>>>";
|
||
push @out,
|
||
parse_piece $after_name,!'end', !$n, $tag, $p, $opus_pre, $opus_rx;
|
||
push @out, "Title-Related-How: $how";
|
||
} elsif ($p =~ s/^Title-Opus:\s+// ) {
|
||
push @out, 'Title-Opus: ' . full_opus $tag, $p, $opus_rx, $opus_pre;
|
||
$after_name = 0;
|
||
} elsif ($p =~ /^(Title-(Opus|Comment|Dates)|X-Title-Opus-Alt):\s+/ ) { # Keep intact
|
||
push @out, $p;
|
||
$after_name = 0;
|
||
} else {
|
||
$p =~ s/^Title-RAW:\s+// or warn "Expected to start with `Title-RAW: ': <<<$p>>>";
|
||
push @out, parse_piece $after_name, $n==$#parts, !$n, $tag, $p, $opus_pre, $opus_rx;
|
||
$after_name = 0;
|
||
}
|
||
}
|
||
} else {
|
||
@out = parse_piece 0, 'at_end', 'at_start', $tag, $in, $opus_pre, $opus_rx;
|
||
}
|
||
my @y;
|
||
unshift @y, pop @out while $out[-1] =~ /^Title-Dates:\s/;
|
||
push @out, @op, @y;
|
||
$out[0] =~ s/^Title:/Title-RAW:/ if @out > 1; # Opus 1: foo
|
||
$in = join "\n", @out, ($preformatted =~ /\bbold\b/ ? ('','') : ()); # \n\n
|
||
|
||
my $res = normalize_piece_mail_header($tag, $in, $opus_rx, $opus_pre);
|
||
warn "# Mismatch:\n# in = $ini\n# out = $res\n#rawin= $ini_raw\n" unless $res eq $ini;
|
||
}
|
||
$in = "\n$in" if $in !~ /^\s*##/ and $_[0] and not $preformatted =~ /\bbold\b/;
|
||
$in .= qq(\n) unless $preformatted =~ /\bbold\b/ or $_[0] = ($in =~ /^##/);
|
||
$in; # Caller appends extra \n
|
||
}
|
||
|
||
## perl -MNormalize::Text::Music_Fields -wnle "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print qq(# format = mail-header\n)} next unless s/^\s*\+\+\s*//; print Normalize::Text::Music_Fields::merge_info($tag,$_, q(opus))" brahms o-brahms-op-no1-xslt
|
||
sub merge_info ($$$;$$) { # $update not fully implemented
|
||
my ($tag, $in, $preformatted, $soft, $update) = (shift, shift, shift, shift, shift);
|
||
my $parsed = emit_as_mail_header($tag, $in, $preformatted, my $pre);
|
||
my $op_n = ($parsed =~ /^Title-Opus: (.*)/m and $1);
|
||
die "Can't find opus number in `$in'" unless defined $op_n;
|
||
my $op_no = full_opus $tag, $op_n;
|
||
|
||
$parsed =~ s/^Title-Punct:\s*-\nTitle-Name:/Title-Name-By-First-Row:/;
|
||
$soft ||= qr(^(?!)); # Never match
|
||
warn "Opus [$op_n]: Type `$1' interpreted as Title-Name\n"
|
||
if $op_n =~ $soft and $parsed =~ s/^Title-Type:/Title-Name:/m
|
||
and $parsed =~ /^Title-Name:\s*(.*)/;
|
||
warn("Too many fields in `$parsed', skipping"), return ''
|
||
if $parsed =~ /^(?=.)(?!Title-(?:Opus|RAW|Name(?:-By-First-Row)?|Key|Dates):)/m;
|
||
|
||
my $name = normalize_piece $tag, $op_no; # expand opus+no to the full name
|
||
|
||
if ($name eq $op_no) { # No current information
|
||
my ($opus_rx, $opus_pre) = opus_parser($tag);
|
||
die "No subopus number in `$op_no' (from `$in')"
|
||
unless $op_no =~ /^($opus_rx)\s*[.,:;]\s*No/;
|
||
my $op = $1;
|
||
$name = normalize_piece $tag, $op; # Expands opus to the full name
|
||
$update = 0;
|
||
} elsif (not $update) {
|
||
die "Opus `$op_no' already known: `$name'";
|
||
}
|
||
|
||
my $parsed_op = emit_as_mail_header($tag, $name, 'only_by_opus', my $pre1);
|
||
warn("Prior knowledge not found for `$in'\n"),
|
||
return $parsed if $parsed_op =~ /^Title:/; # Not found, or not parsable
|
||
|
||
unless ($update) { # Handling "a group name"
|
||
$parsed_op =~ s/^Title-Count:.*\n//; # Four ballades for piano
|
||
if ($parsed_op =~ /^Title-Type:\s*(.*)\n/) { # Strip the plural
|
||
my $type = $1;
|
||
$type =~ s/^ Sets \s+ of \s+/Set of /x
|
||
or $type =~ s/^ ($piece_rx) (?:s | es) $/$1/x; # Strip the plural
|
||
$parsed_op =~ s/^.*/Title-Type: $type/;
|
||
}
|
||
$parsed_op =~ s/^Title-Opus:.*/Title-Opus: $op_n/m
|
||
or die "Can't find Opus: `$parsed_op'";
|
||
}
|
||
if ($parsed =~ /^Title-Dates:\s*(.*)/m) {
|
||
my $d = $1; # (?<!.) does as /^/m, but matches at end too
|
||
$parsed_op =~ s/(?<!.)(Title-Dates:.*\n|\Z)/Title-Dates: $d\n/ or die;
|
||
}
|
||
if ($parsed =~ /^Title-Key:\s*(.*)/m) {
|
||
my $k = $1;
|
||
die "Key mismatch: $k vs $1"
|
||
if $parsed_op =~ /^Title-Key:\s*(.*)/m and $1 ne $k;
|
||
# XXXX Where put the key? STD orders: Type/No/Key/For or Type/For/No/Key
|
||
# There is also (beeth) Type/For/Related-On/Key??? Type/For/Key???
|
||
$parsed_op =~ s/(?<!.)(?=Title-(?!(?:Type|For|Related-On|No):)|\Z)/Title-Key: $k\n/ or die;
|
||
}
|
||
if ($parsed =~ s/^Title-RAW:/Title-Name:/m) {
|
||
(my $n) = ($parsed =~ /^Title-Name:\s*(.*)/m);
|
||
warn "Title-RAW `$n' interpreted as Title-Name in `$in'\n";
|
||
}
|
||
if ($parsed =~ /^(Title-Name(?:[-\w]*):\s*.*)/m) { # pre: Type-After-Name, In-Movements
|
||
my $n = $1; # Related-On, Comment, Related-After
|
||
$parsed_op =~ s/(?<!.)(?=Title-(?:Type-After-Name|In-Movements|Related-On|Comment|Related-After|Opus|Dates):|\Z)/$n\n/ or die;
|
||
}
|
||
$parsed_op
|
||
}
|
||
|
||
for my $field (qw(album title title_track)) {
|
||
no strict 'refs';
|
||
*{"normalize_$field"} = \&normalize_piece;
|
||
}
|
||
|
||
# perl -Ii:/zax/bin -MNormalize::Text::Music_Fields -wle "BEGIN{binmode $_, ':encoding(cp866)' for \*STDIN, \*STDOUT, \*STDERR}print Normalize::Text::Music_Fields->check_persons"
|
||
sub check_persons ($) {
|
||
my $self = shift;
|
||
my %seen;
|
||
$seen{$_}++ for values %tr;
|
||
for my $l (keys %seen) {
|
||
my $s = short_person($self, $l);
|
||
my $ll = normalize_person($self, $s);
|
||
warn "`$l' => `$s' => `$ll'" unless $ll eq $l;
|
||
}
|
||
%seen = ();
|
||
$seen{$_}++ for values %short;
|
||
for my $s (values %seen) {
|
||
my $l = normalize_person($self, $s);
|
||
my $ss = short_person($self, $l);
|
||
warn "`$s' => `$l' => `$ss'" unless $ss eq $s;
|
||
}
|
||
}
|
||
|
||
my %aliases;
|
||
|
||
sub load_lists () {
|
||
my @dirs = get_path();
|
||
my @lists = map <$_/*.lst>, @dirs;
|
||
#warn "dirs=`@dirs', lists=`@lists'\n";
|
||
warn("panic: can't find name lists in `@dirs'"), return 0 unless @lists;
|
||
|
||
for my $f (@lists) {
|
||
local $/ = "\n";
|
||
open F, "< $f" or warn("Can't open `$f' for read: $!"), next;
|
||
my @in = <F>;
|
||
close F or warn("Can't close `$f' for read: $!"), next;
|
||
my $charset;
|
||
for (@in) {
|
||
next if /^\s*$/;
|
||
if ( /^ \s* \# \s* (?:charset|encoding) \s* = \s* ("?) (.*?) \1 \s* $/ix) {
|
||
$charset = $2;
|
||
require Encode;
|
||
next;
|
||
}
|
||
$_ = Encode::decode($charset, $_) if $charset; # Make empty to disable
|
||
s/^\s+//, s/\s+$//, s/\s+/ /g;
|
||
next if /^##/;
|
||
if (/^ \# \s* (alias|fix|shortname_for) \s+ (.*?) \s* => \s* (.*)/x) {
|
||
if ($1 eq 'alias') {
|
||
$aliases{$2} = [split /\s*,\s*/, $3];
|
||
} elsif ($1 eq 'fix') {
|
||
my ($old, $ok) = ($2, $3);
|
||
$tr{translate_dots $old} = $tr{translate_dots $ok} || $ok;
|
||
#print "translating `",translate_dots $old,"' to `",translate_dots $ok,"'\n";
|
||
} elsif ($1 eq 'shortname_for') {
|
||
my ($long, $short) = ($2, $3);
|
||
$tr{translate_dots $short} = $long;
|
||
($long) = strip_years($long);
|
||
$short{$long} = $short;
|
||
}
|
||
next;
|
||
}
|
||
if (/^ \# \s* fix_firstname \s+ (.*\s(\S+))$/x) {
|
||
$tr{translate_dots $1} = $tr{translate_dots $2};
|
||
next;
|
||
}
|
||
if (/^ \# \s* keep \s+ (.*?) \s* $/x) {
|
||
$tr{translate_dots $1} = $1;
|
||
next;
|
||
}
|
||
if (/^ \# \s* shortname \s+ (.*?) \s* $/x) {
|
||
my $in = $1;
|
||
my $full = __PACKAGE__->_translate_person($in, 0);
|
||
unless (defined $full and $full ne $in) {
|
||
my @parts = split /\s+/, $in;
|
||
$full = __PACKAGE__->_translate_person($parts[-1], 0);
|
||
warn("Can't find translation for `@parts'"), next
|
||
unless defined $full and $full ne $parts[-1];
|
||
# Add the normalization
|
||
my $f = __PACKAGE__->normalize_person($parts[-1]);
|
||
$tr{translate_dots $in} = $f;
|
||
}
|
||
$short{$full} = $in;
|
||
($full) = strip_years($full);
|
||
$short{$full} = $in;
|
||
next;
|
||
}
|
||
warn("Do not understand directive: `$_'"), next if /^#/;
|
||
#warn "Doing `$_'";
|
||
my ($pre, $post) = /^(.*?)\s*(\(.*\))?$/;
|
||
my @f = split ' ', $pre or warn("`$pre' won't split"), die;
|
||
my $last = pop @f;
|
||
my @last = $last;
|
||
|
||
# no utf8; # `use' is needed by 5.005
|
||
(my $ascii = $last) =~
|
||
tr( ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ\x80-\x9F)
|
||
( !cLXY|S"Ca<__R~o+23'mP.,1o>...?AAAAAAACEEEEIIIIDNOOOOOx0UUUUYpbaaaaaaaceeeeiiiidnooooo:ouuuuyPy_);
|
||
push @last, $ascii unless $ascii eq $last;
|
||
my $a = $aliases{$last[0]} ? $aliases{$last[0]} : [];
|
||
$a = [$a] unless ref $a;
|
||
push @last, @$a;
|
||
for my $last (@last) {
|
||
my @comp = (@f, $last);
|
||
$tr{"\L@comp"} ||= $_;
|
||
$tr{lc $last} ||= $_; # Two Bach's
|
||
if (@f) {
|
||
$tr{"\L$f[0] $last"} ||= $_; # With the first of pre-names only
|
||
my @ini = map substr($_, 0, 1), @f;
|
||
$tr{"\L$ini[0] $last"} ||= $_; # One initial
|
||
$tr{"\L@ini $last"} ||= $_; # All initials
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
#$tr{lc 'Tchaikovsky, Piotyr Ilyich'} = $tr{lc 'Tchaikovsky'};
|
||
|
||
sub prepare_tag_object_comp ($;$) {
|
||
my ($comp, $piece) = @_;
|
||
require MP3::Tag;
|
||
my $tag = MP3::Tag->new_fake('settable');
|
||
|
||
for my $elt ( qw( title track artist album comment year genre
|
||
title_track artist_collection person ) ) {
|
||
no strict 'refs';
|
||
MP3::Tag->config("translate_$elt", \&{"Normalize::Text::Music_Fields::normalize_$elt"})
|
||
if defined &{"Normalize::Text::Music_Fields::normalize_$elt"};
|
||
# This is needed to expand albums, since pieces file is named so...
|
||
MP3::Tag->config("short_person", \&Normalize::Text::Music_Fields::short_person)
|
||
if defined &Normalize::Text::Music_Fields::short_person;
|
||
}
|
||
$tag->config('parse_data', ['mi', $comp, '%a'], ($piece ? ['mi', $piece, '%l'] : () ));
|
||
$tag;
|
||
}
|
||
|
||
## perl -MNormalize::Text::Music_Fields -e Normalize::Text::Music_Fields::test_normalize_piece
|
||
sub test_normalize_piece {
|
||
for (split /\n/, <<EOS) {
|
||
beethoven # 28
|
||
beethoven wind in C
|
||
beethoven tattoo
|
||
beethoven WoO 20
|
||
beethoven sonata in F#
|
||
beethoven piano in F#
|
||
beethoven op78
|
||
beethoven Op. 10-2
|
||
beethoven Op. 10, #2
|
||
beethoven sonata #22
|
||
beethoven WoO 205-1
|
||
beethoven WoO 205, No 1
|
||
beethoven WoO 205, No. 1
|
||
beethoven WoO 205, no 1
|
||
beethoven WoO 205;#1
|
||
beethoven WoO 205, no1
|
||
beethoven WoO 205 #1
|
||
beethoven WoO 205#1
|
||
beethoven WoO 205. #1
|
||
- beethoven WoO 205,-1
|
||
- beethoven WoO 205, -1
|
||
- beethoven WoO 205 -1
|
||
- beethoven WoO 205 1
|
||
- beethoven WoO 205;1
|
||
EOS
|
||
my $match = (s/^-\s*// ? '-' : '+');
|
||
s/^(\w+)\s+//;
|
||
my $tag = prepare_tag_object_comp("$1", $_);
|
||
print "$match ", find_person($tag), " ", $tag->album, "\n";
|
||
}
|
||
}
|
||
|
||
for my $elt ( qw( title track artist album comment year genre
|
||
title_track artist_collection person ) ) {
|
||
no strict 'refs'; # backward compatibility layer:
|
||
*{"translate_$elt"} = \&{"normalize_$elt"} if defined &{"normalize_$elt"};
|
||
}
|
||
|
||
1;
|
||
|
||
=head1 NAME
|
||
|
||
Normalize::Text::Music_Fields - normalize names of people's and (musical) works.
|
||
|
||
=head1 SYNOPSIS
|
||
|
||
$name = $obj->Normalize::Text::Music_Fields::normalize_person($name);
|
||
$work = $obj->Normalize::Text::Music_Fields::normalize_piece($work);
|
||
# $obj should have methods `name_for_field_normalization', 'shorted_person'
|
||
|
||
=head1 DESCRIPTION
|
||
|
||
Databases of names and of works-per-name are taken from plain-text
|
||
files (optionally in mail-header format). Names are stored in F<*.lst> files.
|
||
Works are stored in F<.comp> files named after the shortened name
|
||
of the composer.
|
||
|
||
The directories of these files are looked in the environment variable
|
||
C<MUSIC_FIELDS_PATH> (if defined, split the same way as C<PATH>), or in
|
||
C<$ENV{HOME}/.music_fields>, and C<-> (and C<-> is replaced by the directory
|
||
named as the module file with F<.pm> dropped). At runtime, one can
|
||
replace the list by calling function Normalize::Text::Music_Fields::set_path()
|
||
with the list of directories as the argument.
|
||
|
||
(Since parsed files are cached, replacing the directory list should be done
|
||
as early as possible.)
|
||
|
||
Files may be managed with utility subroutines provided with the module:
|
||
|
||
# Translate from one-per-line to mail-header format:
|
||
perl -wple "BEGIN {print q(# format = mail-header)} s/#\s*normalized\s*$//; $_ = qq(Title: $_) unless /^\s*(#|$)/; $_ = qq(\n$_) if $p and not /^##/; $_ .= qq(\n) unless $p = /^##/" Normalize::Text::Music_Fields-G_Gershwin.comp >Music_Fields-G_Gershwin.comp-mail
|
||
|
||
# (inverse transformation:) Dump pieces listed in mail-header format
|
||
perl -MNormalize::Text::Music_Fields -wle "print for Normalize::Text::Music_Fields::read_composer_file(shift, shift)" gershwin Music_Fields-G_Gershwin.comp-mail > o
|
||
|
||
# Normalize data in 1-line-per piece format
|
||
perl -MNormalize::Text::Music_Fields -wle "Normalize::Text::Music_Fields::prepare_tag_object_comp(shift)->Normalize::Text::Music_Fields::normalize_file_lines(shift)"
|
||
|
||
# Create a mail-header file from a semi-processed (with "bold" fields)
|
||
# mail-header file (with xml escapes, preceded by opus number)
|
||
perl -MNormalize::Text::Music_Fields -00wnle "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print q(# format = mail-header)} print Normalize::Text::Music_Fields::emit_as_mail_header($tag,$_, q(bold,xml,opus),$pre)" shostakovich o-xslt-better >Music_Fields-D_Shostakovich.comp-mail1
|
||
|
||
# Likewise, from work-per-line with opus-numbers:
|
||
perl -MNormalize::Text::Music_Fields -wnle "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print qq(# format = mail-header\n)} print Normalize::Text::Music_Fields::emit_as_mail_header($tag,$_, q(opus), $pre)" schnittke o-schnittke-better >Music_Fields-A_Schnittke.comp-mail2
|
||
|
||
# A primitive tool for merging additional info into the database:
|
||
perl -MNormalize::Text::Music_Fields -wnle "BEGIN {$tag = Normalize::Text::Music_Fields::prepare_tag_object_comp(shift @ARGV); print qq(# format = mail-header\n)} next unless s/^\s*\+\+\s*//; print Normalize::Text::Music_Fields::merge_info($tag,$_, q(opus,xml), qr(^(58|70|76|116|118|119)($|-)))" brahms o-brahms-op-no1-xslt
|
||
|
||
# Minimal consistency check of persons database.
|
||
perl -MNormalize::Text::Music_Fields -wle "BEGIN{binmode $_, ':encoding(cp866)' for \*STDIN, \*STDOUT, \*STDERR} print Normalize::Text::Music_Fields->check_persons"
|
||
|
||
# Minimal testing code:
|
||
perl -MNormalize::Text::Music_Fields -e Normalize::Text::Music_Fields::test_normalize_piece
|
||
|
||
It may be easier to type these examples if one uses C<manage_M_N_F.pm>, which
|
||
exports the mentioned subroutines to the main namespace (available in
|
||
F<examples> directory of a distribution of C<MP3::Tag>). E.g., the last
|
||
example becomes:
|
||
|
||
perl -Mmanage_M_N_F -e test_normalize_piece
|
||
|
||
|
||
=cut
|
||
|