#!/usr/bin/perl -w
# Convert standard input to strictly lower-case ASCII filename characters
# [JNZ] Modified 29-Mar-2011

use strict;                     # Enforce better programming habits
use warnings;
use locale;                     # Allow locale-specific sorting, etc.
use utf8;                       # This script uses UTF-8 character set
use open ":locale";             # Use locale for standard input/output
use charnames ":full";
use IO::Handle;
use Encode qw(decode_utf8);     # Standard Encode package

# Return the one-character string whose code point is the given integer.
# Used below to build the transliteration table keys from numeric code
# points (the original script used it for the 0x80..0xFF range).
sub u {
    my ($codepoint) = @_;
    return pack("U", $codepoint);
}

# Single-character transliterations: code point => ASCII replacement.
# An empty replacement means the character is dropped silently.
# Input is lower-cased before this table is applied, so only small
# letters are listed.
my %ASCII_FOR_CODEPOINT = (
    # --- Latin-1 Supplement ---
    0x00DF => 'ss',     # Latin small letter Sharp S
    0x00E0 => 'a',      # Latin small letter A with grave
    0x00E1 => 'a',      # Latin small letter A with acute
    0x00E2 => 'a',      # Latin small letter A with circumflex
    0x00E3 => 'a',      # Latin small letter A with tilde
    0x00E4 => 'a',      # Latin small letter A with diaeresis
    0x00E5 => 'a',      # Latin small letter A with ring above
    0x00E6 => 'ae',     # Latin small letter AE
    0x00E7 => 'c',      # Latin small letter C with cedilla
    0x00E8 => 'e',      # Latin small letter E with grave
    0x00E9 => 'e',      # Latin small letter E with acute
    0x00EA => 'e',      # Latin small letter E with circumflex
    0x00EB => 'e',      # Latin small letter E with diaeresis
    0x00EC => 'i',      # Latin small letter I with grave
    0x00ED => 'i',      # Latin small letter I with acute
    0x00EE => 'i',      # Latin small letter I with circumflex
    0x00EF => 'i',      # Latin small letter I with diaeresis
    0x00F0 => 'eth',    # Latin small letter ETH
    0x00F1 => 'n',      # Latin small letter N with tilde
    0x00F2 => 'o',      # Latin small letter O with grave
    0x00F3 => 'o',      # Latin small letter O with acute
    0x00F4 => 'o',      # Latin small letter O with circumflex
    0x00F5 => 'o',      # Latin small letter O with tilde
    0x00F6 => 'o',      # Latin small letter O with diaeresis
    0x00F8 => 'o',      # Latin small letter O with stroke
    0x00F9 => 'u',      # Latin small letter U with grave
    0x00FA => 'u',      # Latin small letter U with acute
    0x00FB => 'u',      # Latin small letter U with circumflex
    0x00FC => 'u',      # Latin small letter U with diaeresis
    0x00FD => 'y',      # Latin small letter Y with acute
    0x00FE => 'th',     # Latin small letter THORN
    0x00FF => 'y',      # Latin small letter Y with diaeresis

    # --- Latin Extended-A ---
    0x0101 => 'a',      # Latin small letter A with macron
    0x0103 => 'a',      # Latin small letter A with breve
    0x0105 => 'a',      # Latin small letter A with ogonek
    0x0107 => 'c',      # Latin small letter C with acute
    0x0109 => 'c',      # Latin small letter C with circumflex
    0x010B => 'c',      # Latin small letter C with dot above
    0x010D => 'c',      # Latin small letter C with caron
    0x010F => 'd',      # Latin small letter D with caron
    0x0111 => 'd',      # Latin small letter D with stroke
    0x0113 => 'e',      # Latin small letter E with macron
    0x0115 => 'e',      # Latin small letter E with breve
    0x0117 => 'e',      # Latin small letter E with dot above
    0x0119 => 'e',      # Latin small letter E with ogonek
    0x011B => 'e',      # Latin small letter E with caron
    0x011D => 'g',      # Latin small letter G with circumflex
    0x011F => 'g',      # Latin small letter G with breve
    0x0121 => 'g',      # Latin small letter G with dot above
    0x0123 => 'g',      # Latin small letter G with cedilla
    0x0125 => 'h',      # Latin small letter H with circumflex
    0x0127 => 'h',      # Latin small letter H with stroke
    0x0129 => 'i',      # Latin small letter I with tilde
    0x012B => 'i',      # Latin small letter I with macron
    0x012D => 'i',      # Latin small letter I with breve
    0x012F => 'i',      # Latin small letter I with ogonek
    0x0131 => 'i',      # Latin small letter dotless I
    0x0133 => 'ij',     # Latin small ligature IJ
    0x0135 => 'j',      # Latin small letter J with circumflex
    0x0137 => 'k',      # Latin small letter K with cedilla
    0x0138 => 'kr',     # Latin small letter KRA
    0x013A => 'l',      # Latin small letter L with acute
    0x013C => 'l',      # Latin small letter L with cedilla
    0x013E => 'l',      # Latin small letter L with caron
    0x0140 => 'l',      # Latin small letter L with middle dot
    0x0142 => 'l',      # Latin small letter L with stroke
    0x0144 => 'n',      # Latin small letter N with acute
    0x0146 => 'n',      # Latin small letter N with cedilla
    0x0148 => 'n',      # Latin small letter N with caron
    0x0149 => 'n',      # Latin small letter N preceded by apostrophe
    0x014B => 'ng',     # Latin small letter ENG
    0x014D => 'o',      # Latin small letter O with macron
    0x014F => 'o',      # Latin small letter O with breve
    0x0151 => 'o',      # Latin small letter O with double acute
    0x0153 => 'oe',     # Latin small ligature OE
    0x0155 => 'r',      # Latin small letter R with acute
    0x0157 => 'r',      # Latin small letter R with cedilla
    0x0159 => 'r',      # Latin small letter R with caron
    0x015B => 's',      # Latin small letter S with acute
    0x015D => 's',      # Latin small letter S with circumflex
    0x015F => 's',      # Latin small letter S with cedilla
    0x0161 => 's',      # Latin small letter S with caron
    0x0163 => 't',      # Latin small letter T with cedilla
    0x0165 => 't',      # Latin small letter T with caron
    0x0167 => 't',      # Latin small letter T with stroke
    0x0169 => 'u',      # Latin small letter U with tilde
    0x016B => 'u',      # Latin small letter U with macron
    0x016D => 'u',      # Latin small letter U with breve
    0x016F => 'u',      # Latin small letter U with ring above
    0x0171 => 'u',      # Latin small letter U with double acute
    0x0173 => 'u',      # Latin small letter U with ogonek
    0x0175 => 'w',      # Latin small letter W with circumflex
    0x0177 => 'y',      # Latin small letter Y with circumflex
    0x017A => 'z',      # Latin small letter Z with acute
    0x017C => 'z',      # Latin small letter Z with dot above
    0x017E => 'z',      # Latin small letter Z with caron
    0x017F => 's',      # Latin small letter long S

    # --- Cyrillic ---
    0x0430 => 'a',      # Cyrillic small letter A
    0x0431 => 'b',      # Cyrillic small letter BE
    0x0432 => 'v',      # Cyrillic small letter VE
    0x0433 => 'g',      # Cyrillic small letter GHE
    0x0434 => 'd',      # Cyrillic small letter DE
    0x0435 => 'ye',     # Cyrillic small letter IE (when not after a consonant)
    0x0436 => 'zh',     # Cyrillic small letter ZHE
    0x0437 => 'z',      # Cyrillic small letter ZE
    0x0438 => 'i',      # Cyrillic small letter I
    0x0439 => 'y',      # Cyrillic small letter Short I
    0x043A => 'k',      # Cyrillic small letter KA
    0x043B => 'l',      # Cyrillic small letter EL
    0x043C => 'm',      # Cyrillic small letter EM
    0x043D => 'n',      # Cyrillic small letter EN
    0x043E => 'o',      # Cyrillic small letter O
    0x043F => 'p',      # Cyrillic small letter PE
    0x0440 => 'r',      # Cyrillic small letter ER
    0x0441 => 's',      # Cyrillic small letter ES
    0x0442 => 't',      # Cyrillic small letter TE
    0x0443 => 'u',      # Cyrillic small letter U
    0x0444 => 'f',      # Cyrillic small letter EF
    0x0445 => 'kh',     # Cyrillic small letter HA
    0x0446 => 'ts',     # Cyrillic small letter TSE
    0x0447 => 'ch',     # Cyrillic small letter CHE
    0x0448 => 'sh',     # Cyrillic small letter SHA
    0x0449 => 'shch',   # Cyrillic small letter SHCHA
    0x044A => '',       # Cyrillic small letter hard sign
    0x044B => 'y',      # Cyrillic small letter YERU
    0x044C => '',       # Cyrillic small letter soft sign
    0x044D => 'e',      # Cyrillic small letter E
    0x044E => 'yu',     # Cyrillic small letter YU
    0x044F => 'ya',     # Cyrillic small letter YA
    0x0451 => 'yo',     # Cyrillic small letter IO
    0x0454 => 'ie',     # Cyrillic small letter Ukrainian IE
    0x0456 => 'i',      # Cyrillic small letter Ukrainian I
    0x0457 => 'yi',     # Cyrillic small letter YI
    0x0491 => 'g',      # Cyrillic small letter GHE with upturn

    # --- Hyphens and dashes ---
    0x00AD => '-',      # Soft hyphen
    0x2010 => '-',      # Hyphen
    0x2011 => '-',      # Non-breaking hyphen
    0x2012 => '-',      # Figure dash
    0x2013 => '-',      # En dash
    0x2014 => '-',      # Em dash
    0x2015 => '-',      # Horizontal bar

    # --- Quotation marks and ellipsis (dropped) ---
    0x00AB => '',       # Left-pointing double angle quotation mark
    0x00BB => '',       # Right-pointing double angle quotation mark
    0x2018 => '',       # Left single quotation mark
    0x2019 => '',       # Right single quotation mark
    0x201C => '',       # Left double quotation mark
    0x201D => '',       # Right double quotation mark
    0x2026 => '',       # Horizontal ellipsis
);

# Cyrillic IE is written "e" directly after one of these consonants and
# "ye" everywhere else (start of word, after a vowel, after a hard/soft
# sign, ...).  EL, EM, EN, ER, EF and HA were missing from the original
# list, which made e.g. "нет" come out as "nyet" while "без" gave "bez".
my $CYRILLIC_IE = 0x0435;
my @CONSONANTS_BEFORE_PLAIN_E = (
    0x0431,     # BE
    0x0432,     # VE
    0x0433,     # GHE
    0x0434,     # DE
    0x0436,     # ZHE
    0x0437,     # ZE
    0x043A,     # KA
    0x043B,     # EL
    0x043C,     # EM
    0x043D,     # EN
    0x043F,     # PE
    0x0440,     # ER
    0x0441,     # ES
    0x0442,     # TE
    0x0444,     # EF
    0x0445,     # HA
    0x0446,     # TSE
    0x0447,     # CHE
    0x0448,     # SHA
    0x0449,     # SHCHA
);

# Final lookup table keyed by the actual character sequence to replace.
my %REPLACEMENT_FOR;
for my $codepoint (keys %ASCII_FOR_CODEPOINT) {
    $REPLACEMENT_FOR{ u($codepoint) } = $ASCII_FOR_CODEPOINT{$codepoint};
}
for my $consonant (@CONSONANTS_BEFORE_PLAIN_E) {
    # consonant + IE  =>  <consonant transliteration> . "e"
    $REPLACEMENT_FOR{ u($consonant) . u($CYRILLIC_IE) }
        = $ASCII_FOR_CODEPOINT{$consonant} . 'e';
}

# One alternation matching every table key.  Longer keys come first so
# that the two-letter "consonant + IE" sequences win over the
# single-letter entries for the same consonant.
my $TRANSLIT_RE = do {
    my $alternatives = join '|',
        map  { quotemeta($_) }
        sort { length($b) <=> length($a) }
        keys %REPLACEMENT_FOR;
    qr/($alternatives)/;
};

# Transform a string into its lowercase ASCII filename equivalent.
#
# Arguments:
#   $in   - the string to convert
#   $fn   - name of the file the string came from, used only in warnings;
#           undef suppresses the location prefix, "-" prints the line
#           number without a file name
#   $line - line number of the string within $fn, used only in warnings
#
# Returns the converted string, containing only the characters
# "-", "0".."9" and "a".."z", with runs of dashes collapsed to one dash.
#
# Emits one warning (via warn) for every non-ASCII character that is left
# after transliteration; such characters are then removed.
sub transform {
    my ($in, $fn, $line) = @_;

    $in =~ tr/ _:/-/;                           # Spaces, underscores, colons -> dash
    $in =~ s/(\p{UppercaseLetter})/\l$1/g;      # Make lowercase
    $in =~ s/$TRANSLIT_RE/$REPLACEMENT_FOR{$1}/g;   # Table-driven transliteration

    # Location prefix for warnings; computed once, not per character.
    my $where = !defined($fn) ? ""
              : $fn eq "-"    ? ":$line: "
              :                 "$fn:$line: ";

    while ($in =~ /(\P{IsASCII})/g) {
        # $0 is passed as an argument, never interpolated into the format,
        # so a "%" in the script path cannot corrupt the message.
        warn sprintf("%s: %sString contains non-ASCII character: \"%s\" (U+%04X)\n",
                     $0, $where, $1, ord($1));
    }

    $in =~ tr/-0-9a-z//dc;      # Remove everything but dash, digits, a-z
    $in =~ tr/-//s;             # Collapse sequences of dashes
    return $in;
}

# Main program: convert every input line (files named on the command line,
# or standard input) and print the result, one per line.
sub main {
    while (my $input = <>) {
        chomp $input;
        print transform($input, $ARGV, $.) . "\n";
    }
    continue {
        # Reset $. at the end of each file so warnings report the line
        # number within the current file rather than a running total.
        close ARGV if eof;
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (require/do) to reuse transform() without consuming standard input.
main() unless caller;