#!/usr/bin/perl -w # Parse lyrics from LilyPond files # [JNZ] Modified 19-Mar-2008 use strict; use open ":utf8"; use encoding "utf8"; # Parser state table our $state_notoken = 0; # No token (default) our $state_notokenpush = 1; # No token, but with unpushed previous our $state_string = 2; # String (including multiline) our $state_commentstart = 3; # Start of a comment (not stored in @tokens) our $state_singlecomment = 4; # Single-line comment (not stored) our $state_multicomment = 5; # Multiline comment (not stored) our $state_multicommentend = 6; # End of multiline comment (not stored) our $state_alphabetic = 7; # Alphabetic token our $state_numeric = 8; # Numeric token our $state_backslash = 9; # Backslashed word (eg, "\version") our $state_doubled = 10; # Doubled character our @tokens = (); # Tokens in a given file our @names = (); # Associated filename our @lines = (); # Associated file line number our @cols = (); # Associated file column number our $state = $state_notoken; while (<>) { chomp; my $col = 1; # Always start from column 1 # There's almost certainly a better way to do the following... Split # the input into characters, then make them up into tokens again. # Note that this code does NOT (yet) parse Scheme code correctly! my @chars = split(//, $_); my $token = ""; my $tok_line = $.; my $tok_col = 1; my $c; if ($state == $state_string) { $token = pop @tokens; pop @names; $tok_line = pop @lines; $tok_col = pop @cols; $token .= "\n"; } while (defined($c = shift @chars)) { if ($state == $state_notoken) { # No current token if ($c =~ /\s/) { next; } elsif ($c =~ /%/) { $state = $state_commentstart; next; } elsif ($c =~ /[[:alpha:]\x{2019}]/) { # Note the hack: U+2019 RIGHT SINGLE QUOTATION MARK is # treated as an alphabetic character $state = $state_alphabetic; $tok_line = $.; $tok_col = $col; redo; } elsif ($c =~ /[[:digit:]]/) { $state = $state_numeric; $tok_line = $.; $tok_col = $col; redo; } elsif ($c =~ /"/) { $state = $state_string; $token = $c; $tok_line = $.; $tok_col = $col; next; } elsif ($c =~ /\\/) { $state = $state_backslash; $token = $c; $tok_line = $.; $tok_col = $col; next; } elsif ($c =~ /[-_<>]/) { $state = $state_doubled; $token = $c; $tok_line = $.; $tok_col = $col; next; } else { push @tokens, $c; push @names, $ARGV; push @lines, $.; push @cols, $col; next; } } elsif ($state == $state_notokenpush) { # Unpushed token push @tokens, $token; push @names, $ARGV; push @lines, $tok_line; push @cols, $tok_col; $token = ""; $state = $state_notoken; redo; } elsif ($state == $state_string) { # String token if (($c =~ /"/) and ($token !~ /\\$/)) { # Ending quote? $state = $state_notokenpush; } $token .= $c; next; } elsif ($state == $state_commentstart) { # Start of comment if ($c =~ /{/) { $state = $state_multicomment; } else { $state = $state_singlecomment; } next; } elsif ($state == $state_singlecomment) { # Single-line comment next; } elsif ($state == $state_multicomment) { # Multiline comment if ($c =~ /%/) { $state = $state_multicommentend; } next; } elsif ($state == $state_multicommentend) { # End of multiline comment if ($c =~ /}/) { $state = $state_notoken; } else { $state = $state_multicomment; } next; } elsif ($state == $state_alphabetic) { # Alphabetic token if ($c !~ /[[:alpha:]\x{2019}]/) { $state = $state_notokenpush; redo; } else { $token .= $c; } } elsif ($state == $state_numeric) { # Numeric token (including ".") if ($c !~ /[.[:digit:]]/) { $state = $state_notokenpush; redo; } else { $token .= $c; } } elsif ($state == $state_backslash) { # Backslashed word token if ($c !~ /[[:alpha:]]/) { if ($token =~ /^\\$/) { $token .= $c; } $state = $state_notokenpush; redo; } else { $token .= $c; } } elsif ($state == $state_doubled) { # Doubled character if ($c eq $token) { $token .= $c; push @tokens, $token; push @names, $ARGV; push @lines, $.; push @cols, $tok_col; $token = ""; $state = $state_notoken; next; } else { $state = $state_notokenpush; redo; } } else { die "$0: Unknown state $state ($ARGV:$.)\n"; } } continue { $col++; } # End of the current line has been reached if ($state != $state_notoken) { if (($state != $state_commentstart) and ($state != $state_singlecomment) and ($state != $state_multicomment) and ($state != $state_multicommentend)) { push @tokens, $token; push @names, $ARGV; push @lines, $tok_line; push @cols, $tok_col; } $state = $state_notoken unless (($state == $state_string) or ($state == $state_multicomment)); } } continue { if (eof) { close ARGV; # Reset line numbering in $. $state = $state_notoken; # Comments and strings don't cross file boundaries } } # All files now parsed # Now we can start looking for lyrics our @lyric_words = (); our @lyric_index = (); my $in_lyrics = 0; # Are we in a \lyricmode block? my $in_markup = 0; # Are in in a \markup block? my $lyr_nest = 0; # Brace nesting level for \lyricmode my $mark_nest = 0; # Brace nesting level for \markup sub ourwarn (@) { my $i = shift; warn "$0: $names[$i]:$lines[$i]:$cols[$i]: WARNING: @_\n"; } for (my $i = 0; $i <= $#tokens; $i++) { my $token = $tokens[$i]; if ($in_markup) { if ($token eq "{") { $mark_nest++; } elsif ($token eq "}") { if ($mark_nest < 1) { ourwarn $i, "unmatched braces"; } elsif ($mark_nest == 1) { $in_markup = 0; } else { $mark_nest--; } } else { # Ignore \markup } } elsif ($in_lyrics) { if ($token eq "{") { $lyr_nest++; } elsif ($token eq "}") { if ($lyr_nest < 1) { ourwarn $i, "unmatched braces"; } elsif ($lyr_nest == 1) { $in_lyrics = 0; } else { $lyr_nest--; } } elsif ($token =~ /^[[:alpha:]"\x{2019}]/) { # Alphabetic or string if (($#lyric_words >= 0) and ($lyric_words[$#lyric_words] =~ / --$/)) { my $word = pop @lyric_words; $word .= " " . $token; push @lyric_words, $word; } else { push @lyric_words, $token; push @lyric_index, $i; } } elsif ($token eq "--") { my $word = pop @lyric_words; $word .= " --"; push @lyric_words, $word; } elsif ($token =~ /^\d/) { # Ignore digit tokens } elsif (($token eq "_") or ($token eq "__") or ($token eq "~")) { # Ignore lyric spaces } elsif ($token eq "|") { # Ignore beat bar line } elsif ($token =~ /^[.,!?:;()\x{AB}\x{BB}\x{2012}\x{2013}\x{2014}\x{201C}\x{201D}\x{201E}\x{201F}\x{2026}]/) { # Ignore punctuation # \x{AB} = LEFT-POINTING DOUBLE ANGLE QUOTATION MARK # \x{BB} = RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK # \x{2012} = FIGURE DASH # \x{2013} = EN DASH # \x{2014} = EM DASH # \x{201C} - \x{201F} = various DOUBLE QUOTATION MARKs # \x{2026} = HORIZONTAL ELLIPSIS } elsif ($token eq "\\markup") { $in_markup = 1; $mark_nest = 0; } elsif ($token eq "\\set") { if ($tokens[$i+2] eq "=") { if ($tokens[$i+3] =~ /^"/) { # \set XXX = "string" $i += 3; } elsif ($tokens[$i+3] eq "#") { # \set XXX = #YYY if ($tokens[$i+4] =~ /^\d/) { $i += 4; } elsif ($tokens[$i+4] eq "#") { # \set XXX = ##B if ($tokens[$i+5] =~ /^[tf]$/) { $i += 5; } else { ourwarn $i+5, "unknown boolean:", $tokens[$i+5]; } } else { ourwarn $i+4, "unknown Scheme expression:", $tokens[$i+4]; } } else { ourwarn $i+3, "unknown \\set expression:", $tokens[$i+3]; } } else { ourwarn $i+2, "unknown \\set type:", $tokens[$i+2]; } } elsif ($token eq "\\unset") { $i += 1; } elsif ($token eq "\\override") { if ($tokens[$i+2] eq "#") { if ($tokens[$i+3] eq "'") { # \override XXX #'YYY while ($tokens[$i+5] eq "-") { $i += 2; # Allow hyphenated names } if ($tokens[$i+5] eq "=") { if ($tokens[$i+6] eq "#") { # \override XXX #'YYY = #ZZZ if ($tokens[$i+7] =~ /^\d/) { $i += 7; } elsif ($tokens[$i+7] eq "#") { if ($tokens[$i+8] =~ /^[tf]$/) { $i += 8; } else { ourwarn $i+8, "unknown boolean:", $tokens[$i+8]; } } else { ourwarn $i+7, "unknown Scheme expression:", $tokens[$i+7]; } } else { ourwarn $i+6, "unknown \\override expression:", $tokens[$i+6]; } } else { ourwarn $i+5, "unknown \\override expression:", $tokens[$i+5]; } } else { ourwarn $i+3, "unknown \\override expression:", $tokens[$i+3]; } } else { ourwarn $i+2, "unknown \\override type:", $tokens[$i+2]; } } elsif ($token eq "\\revert") { if ($tokens[$i+2] eq "#") { if ($tokens[$i+3] eq "'") { # \override XXX #'YYY while ($tokens[$i+5] eq "-") { $i += 2; # Allow hyphenated names } $i += 4; } else { ourwarn $i+3, "unknown \\override expression:", $tokens[$i+3]; } } else { ourwarn $i+2, "unknown \\override type:", $tokens[$i+2]; } } elsif ($token eq "\\skip") { $i += 1; } elsif ($token eq "\\repeat") { $i += 2; } elsif ($token eq "\\alternative") { # Do nothing... } elsif ($token =~ /^\\/) { ourwarn $i, "unknown expression:", $token; } elsif ($token eq "-") { ourwarn $i, "unexpected single hyphen" } elsif ($token eq "'") { ourwarn $i, "unexpected single quote" } else { ourwarn $i, "unknown token:", $token; } } elsif ($token eq "\\lyricmode") { $in_lyrics = 1; $lyr_nest = 0; $mark_nest = 0; } } # Print out all lyric words as a colon-separated set of fields for (my $j = 0; $j <= $#lyric_words; $j++) { my $lw = $lyric_words[$j]; my $i = $lyric_index[$j]; my $lwfull = $lw; $lwfull =~ s/ -- //g; $lwfull =~ s/[".,!?:;()\x{AB}\x{BB}\x{2012}\x{2013}\x{2014}\x{201C}\x{201D}\x{201E}\x{201F}\x{2026}]//g; $lwfull =~ s/^\s+//g; $lwfull =~ s/\s+$//g; if ($lwfull =~ /[^-[:alpha:]\x{2019}\s]/g) { ourwarn $i, "unprocessed character in word:", $lwfull; } print "$lwfull:$lw:$names[$i]:$lines[$i]:$cols[$i]\n"; }