#!/usr/bin/perl -w

# Parse lyrics from LilyPond files
# [JNZ] Modified 30-Jun-2018

use v5.26;
use strict;

use locale;
use utf8;
use open ":utf8";


# Parser state table

our $state_notoken		= 0;	# No token (default)
our $state_notokenpush		= 1;	# No token, but with unpushed previous
our $state_string		= 2;	# String (including multiline)
our $state_commentstart		= 3;	# Start of a comment (not stored in @tokens)
our $state_singlecomment	= 4;	# Single-line comment (not stored)
our $state_multicomment		= 5;	# Multiline comment (not stored)
our $state_multicommentend	= 6;	# End of multiline comment (not stored)
our $state_alphabetic		= 7;	# Alphabetic token
our $state_numeric		= 8;	# Numeric token
our $state_backslash		= 9;	# Backslashed word (eg, "\version")
our $state_doubled		= 10;	# Doubled character


our @tokens = ();			# Tokens in a given file
our @names = ();			# Associated filename
our @lines = ();			# Associated file line number
our @cols = ();				# Associated file column number

our $state = $state_notoken;

while (<>) {
    chomp;

    my $col = 1;			# Always start from column 1

    # There's almost certainly a better way to do the following...  Split
    # the input into characters, then make them up into tokens again.

    # Note that this code does NOT (yet) parse Scheme code correctly!

    my @chars = split(//, $_);
    my $token = "";
    my $tok_line = $.;
    my $tok_col = 1;
    my $c;

    if ($state == $state_string) {
	$token = pop @tokens;
	pop @names;
	$tok_line = pop @lines;
	$tok_col = pop @cols;
	$token .= "\n";
    }

    while (defined($c = shift @chars)) {
	if ($state == $state_notoken) {		# No current token
	    if ($c =~ /\s/) {
		next;
	    } elsif ($c =~ /%/) {
		$state = $state_commentstart;
		next;
	    } elsif ($c =~ /[[:alpha:]\x{2019}]/) {
		# Note the hack: U+2019 RIGHT SINGLE QUOTATION MARK is
		# treated as an alphabetic character
		$state = $state_alphabetic;
		$tok_line = $.;
		$tok_col = $col;
		redo;
	    } elsif ($c =~ /[[:digit:]]/) {
		$state = $state_numeric;
		$tok_line = $.;
		$tok_col = $col;
		redo;
	    } elsif ($c =~ /"/) {
		$state = $state_string;
		$token = $c;
		$tok_line = $.;
		$tok_col = $col;
		next;
	    } elsif ($c =~ /\\/) {
		$state = $state_backslash;
		$token = $c;
		$tok_line = $.;
		$tok_col = $col;
		next;
	    } elsif ($c =~ /[-_<>]/) {
		$state = $state_doubled;
		$token = $c;
		$tok_line = $.;
		$tok_col = $col;
		next;
	    } else {
		push @tokens, $c;
		push @names, $ARGV;
		push @lines, $.;
		push @cols, $col;
		next;
	    }
	}

	elsif ($state == $state_notokenpush) {	# Unpushed token
	    push @tokens, $token;
	    push @names, $ARGV;
	    push @lines, $tok_line;
	    push @cols, $tok_col;
	    $token = "";
	    $state = $state_notoken;
	    redo;
	}

	elsif ($state == $state_string) {	# String token
	    if (($c =~ /"/) and ($token !~ /\\$/)) { # Ending quote?
		$state = $state_notokenpush;
	    }
	    $token .= $c;
	    next;
	}

	elsif ($state == $state_commentstart) {	# Start of comment
	    if ($c =~ /{/) {
		$state = $state_multicomment;
	    } else {
		$state = $state_singlecomment;
	    }
	    next;
	}

	elsif ($state == $state_singlecomment) { # Single-line comment
	    next;
	}

	elsif ($state == $state_multicomment) {	# Multiline comment
	    if ($c =~ /%/) {
		$state = $state_multicommentend;
	    }
	    next;
	}

	elsif ($state == $state_multicommentend) { # End of multiline comment
	    if ($c =~ /}/) {
		$state = $state_notoken;
	    } else {
		$state = $state_multicomment;
	    }
	    next;
	}

	elsif ($state == $state_alphabetic) {	# Alphabetic token
	    if ($c !~ /[[:alpha:]\x{2019}]/) {
		$state = $state_notokenpush;
		redo;
	    } else {
		$token .= $c;
	    }
	}

	elsif ($state == $state_numeric) {	# Numeric token (including ".")
	    if ($c !~ /[.[:digit:]]/) {
		$state = $state_notokenpush;
		redo;
	    } else {
		$token .= $c;
	    }
	}

	elsif ($state == $state_backslash) {	# Backslashed word token
	    if ($c !~ /[[:alpha:]]/) {
		if ($token =~ /^\\$/) {
		    $token .= $c;
		}
		$state = $state_notokenpush;
		redo;
	    } else {
		$token .= $c;
	    }
	}

	elsif ($state == $state_doubled) {	# Doubled character
	    if ($c eq $token) {
		$token .= $c;
		push @tokens, $token;
		push @names, $ARGV;
		push @lines, $.;
		push @cols, $tok_col;
		$token = "";
		$state = $state_notoken;
		next;
	    } else {
		$state = $state_notokenpush;
		redo;
	    }
	}

	else {
	    die "$0: Unknown state $state ($ARGV:$.)\n";
	}
    } continue {
	$col++;
    }

    # End of the current line has been reached

    if ($state != $state_notoken) {
	if (($state != $state_commentstart)
	    and ($state != $state_singlecomment)
	    and ($state != $state_multicomment)
	    and ($state != $state_multicommentend)) {

	    push @tokens, $token;
	    push @names, $ARGV;
	    push @lines, $tok_line;
	    push @cols, $tok_col;
	}
	$state = $state_notoken
	    unless (($state == $state_string)
		    or ($state == $state_multicomment));
    }

} continue {
    if (eof) {
	close ARGV;			# Reset line numbering in $.
	$state = $state_notoken;	# Comments and strings don't cross file boundaries
    }
}


# All files now parsed


# Now we can start looking for lyrics

our @lyric_words = ();
our @lyric_index = ();

my $in_lyrics = 0;			# Are we in a \lyricmode block?
my $in_markup = 0;			# Are in in a \markup block?
my $lyr_nest = 0;			# Brace nesting level for \lyricmode
my $mark_nest = 0;			# Brace nesting level for \markup

sub ourwarn (@) {
    my $i = shift;
    warn "$0: $names[$i]:$lines[$i]:$cols[$i]: WARNING: @_\n";
}

for (my $i = 0; $i <= $#tokens; $i++) {
    my $token = $tokens[$i];

    if ($in_markup) {
	if ($token eq "{") {
	    $mark_nest++;
	}

	elsif ($token eq "}") {
	    if ($mark_nest < 1) {
		ourwarn $i, "unmatched braces";
	    } elsif ($mark_nest == 1) {
		$in_markup = 0;
	    } else {
		$mark_nest--;
	    }
	}

	else {
	    # Ignore \markup
	}
    }

    elsif ($in_lyrics) {
	if ($token eq "{") {
	    $lyr_nest++;
	}

	elsif ($token eq "}") {
	    if ($lyr_nest < 1) {
		ourwarn $i, "unmatched braces";
	    } elsif ($lyr_nest == 1) {
		$in_lyrics = 0;
	    } else {
		$lyr_nest--;
	    }
	}

	elsif ($token =~ /^[[:alpha:]"\x{2019}]/) {	# Alphabetic or string
	    if (($#lyric_words >= 0)
		and ($lyric_words[$#lyric_words] =~ / --$/)) {
		my $word = pop @lyric_words;
		$word .= " " . $token;
		push @lyric_words, $word;
	    } else {
		push @lyric_words, $token;
		push @lyric_index, $i;
	    }
	}

	elsif ($token eq "--") {
	    my $word = pop @lyric_words;
	    $word .= " --";
	    push @lyric_words, $word;
	}

	elsif ($token =~ /^\d/) {
	    # Ignore digit tokens
	}

	elsif (($token eq "_") or ($token eq "__") or ($token eq "~")) {
	    # Ignore lyric spaces
	}

	elsif ($token eq "|") {
	    # Ignore beat bar line
	}

	elsif ($token =~ /^[.,!?:;()\x{AB}\x{BB}\x{2012}\x{2013}\x{2014}\x{201C}\x{201D}\x{201E}\x{201F}\x{2026}]/) {
	    # Ignore punctuation
	    #   \x{AB}   = LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
	    #   \x{BB}   = RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
	    #   \x{2012} = FIGURE DASH
	    #   \x{2013} = EN DASH
	    #   \x{2014} = EM DASH
	    #   \x{201C} - \x{201F} = various DOUBLE QUOTATION MARKs
	    #   \x{2026} = HORIZONTAL ELLIPSIS
	}

	elsif ($token eq "\\markup") {
	    $in_markup = 1;
	    $mark_nest = 0;
	}

	elsif ($token eq "\\set") {
	    if ($tokens[$i+2] eq "=") {
		if ($tokens[$i+3] =~ /^"/) { # \set XXX = "string"
		    $i += 3;
		} elsif ($tokens[$i+3] eq "#") { # \set XXX = #YYY
		    if ($tokens[$i+4] =~ /^\d/) {
			$i += 4;
		    } elsif ($tokens[$i+4] eq "#") { # \set XXX = ##B
			if ($tokens[$i+5] =~ /^[tf]$/) {
			    $i += 5;
			} else {
			    ourwarn $i+5, "unknown boolean:", $tokens[$i+5];
			}
		    } else {
			ourwarn $i+4, "unknown Scheme expression:",
			    $tokens[$i+4];
		    }
		} else {
		    ourwarn $i+3, "unknown \\set expression:", $tokens[$i+3];
		}
	    } else {
		ourwarn $i+2, "unknown \\set type:", $tokens[$i+2];
	    }
	}

	elsif ($token eq "\\unset") {
	    $i += 1;
	}

	elsif ($token eq "\\override") {
	    if ($tokens[$i+2] eq "#") {
		if ($tokens[$i+3] eq "'") { # \override XXX #'YYY
		    while ($tokens[$i+5] eq "-") {
			$i += 2;	# Allow hyphenated names
		    }
		    if ($tokens[$i+5] eq "=") {
			if ($tokens[$i+6] eq "#") { # \override XXX #'YYY = #ZZZ
			    if ($tokens[$i+7] =~ /^\d/) {
				$i += 7;
			    } elsif ($tokens[$i+7] eq "#") {
				if ($tokens[$i+8] =~ /^[tf]$/) {
				    $i += 8;
				} else {
				    ourwarn $i+8, "unknown boolean:", $tokens[$i+8];
				}
			    } else {
				ourwarn $i+7, "unknown Scheme expression:",
				    $tokens[$i+7];
			    }
			} else {
			    ourwarn $i+6, "unknown \\override expression:",
				$tokens[$i+6];
			}
		    } else {
			ourwarn $i+5, "unknown \\override expression:",
			    $tokens[$i+5];
		    }
		} else {
		    ourwarn $i+3, "unknown \\override expression:",
			$tokens[$i+3];
		}
	    } else {
		ourwarn $i+2, "unknown \\override type:", $tokens[$i+2];
	    }
	}

	elsif ($token eq "\\revert") {
	    if ($tokens[$i+2] eq "#") {
		if ($tokens[$i+3] eq "'") { # \override XXX #'YYY
		    while ($tokens[$i+5] eq "-") {
			$i += 2;	# Allow hyphenated names
		    }
		    $i += 4;
		} else {
		    ourwarn $i+3, "unknown \\override expression:",
			$tokens[$i+3];
		}
	    } else {
		ourwarn $i+2, "unknown \\override type:", $tokens[$i+2];
	    }
	}

	elsif ($token eq "\\skip") {
	    $i += 1;
	}

	elsif ($token eq "\\repeat") {
	    $i += 2;
	}

	elsif ($token eq "\\alternative") {
	    # Do nothing...
	}

	elsif ($token =~ /^\\/) {
	    ourwarn $i, "unknown expression:", $token;
	}

	elsif ($token eq "-") {
	    ourwarn $i, "unexpected single hyphen"
	}

	elsif ($token eq "'") {
	    ourwarn $i, "unexpected single quote"
	}

	else {
	    ourwarn $i, "unknown token:", $token;
	}
    }

    elsif ($token eq "\\lyricmode") {
	$in_lyrics = 1;
	$lyr_nest = 0;
	$mark_nest = 0;
    }
}


# Print out all lyric words as a colon-separated set of fields

for (my $j = 0; $j <= $#lyric_words; $j++) {
    my $lw = $lyric_words[$j];
    my $i  = $lyric_index[$j];

    my $lwfull = $lw;
    $lwfull =~ s/ -- //g;
    $lwfull =~ s/[".,!?:;()\x{AB}\x{BB}\x{2012}\x{2013}\x{2014}\x{201C}\x{201D}\x{201E}\x{201F}\x{2026}]//g;
    $lwfull =~ s/^\s+//g;
    $lwfull =~ s/\s+$//g;

    if ($lwfull =~ /[^-[:alpha:]\x{2019}\s]/g) {
	ourwarn $i, "unprocessed character in word:", $lwfull;
    }

    print "$lwfull:$lw:$names[$i]:$lines[$i]:$cols[$i]\n";
}