#!/usr/bin/perl -wp # This filter attempts to translate between various encodings # of the Cyrillic character set, or at least the 66 characters # making up the Russian alphabet subset thereof. I want to emphasize # that this focus wasn't an act of prejudice on my part; I just # had a need for something that could handle Russian, but I would # hope that someone concerned with any of the many other languages # using a Cyrillic or Cyrillic-derived alphabet would be able to # extend or modify what I've done here, or at least use this program # as a starting point for something else. # # I could not have written this program without information obtained # from Roman Czyborra's very helpful page at http://www.czyborra.com. use strict; use vars '$Translator', '%Reg_translate'; BEGIN { # Early on I thought I would need this, then realized I didn't. I'll # leave it here just so anyone adding encodings can double check the # ordering. # my @Azbuka = qw(A BE VE GHE DE IE IO ZHE ZE I SHORT_I KA EL EM EN O PE ER # ES TE U EF HA TSE CHE SHA SHCHA HARD_SIGN YERU SOFT_SIGN E YU YA # a be ve ghe de ie io zhe ze i short_i ka el em en o pe er # es te u ef ha tse che sha shcha hard_sign yeru soft_sign e yu ya); # The default UCFORMAT here is obviously just an ASCII representation of # the unicode values, not native unicode. But it's what a common # utf8-to-ascii filter outputs; if you have native unicode support, # the appropriate values should be pretty obvious. my ($UC_format, $UC_format_regexp); if (defined($ENV{'UCFORMAT'}) && defined($ENV{'UCFORMAT_REGEXP'})) { ($UC_format, $UC_format_regexp) = @ENV{'UCFORMAT','UCFORMAT_REGEXP'}; } else { ($UC_format, $UC_format_regexp) = ('', ''); } (my $basename = $0) =~ s!.*/!!; my @Raw_unicode = qw(0410 0411 0412 0413 0414 0415 0401 0416 0417 0418 0419 041A 041B 041C 041D 041E 041F 0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042A 042B 042C 042D 042E 042F 0430 0431 0432 0433 0434 0435 0451 0436 0437 0438 0439 043A 043B 043C 043D 043E 043F 0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044A 044B 044C 044D 044E 044F); my %Table; # The koi8 family of encodings; here I am only concerned with the # 66 Russian letters, so I don't take into account the many koi8 variants. $Table{koi8} = [split(' ', "\xe1 \xe2 \xf7 \xe7 \xe4 \xe5 \xb3 \xf6 \xfa \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf2 \xf3 \xf4 \xf5 \xe6 \xe8 \xe3 \xfe \xfb \xfd \xff \xf9 \xf8 \xfc \xe0 \xf1 \xc1 \xc2 \xd7 \xc7 \xc4 \xc5 \xa3 \xd6 \xda \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd2 \xd3 \xd4 \xd5 \xc6 \xc8 \xc3 \xde \xdb \xdd \xdf \xd9 \xd8 \xdc \xc0 \xd1")]; # This is iso-8859-5. I got tired of typing 8859. $Table{iso5} = [split(' ', "\xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xa1 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xf1 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef")]; # Microsoft codepage 1251. $Table{cp1251} = [split(' ', "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xa8 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xb8 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff")]; # Bulgarian-MIK. $Table{mik} = [split(' ', "\x80 \x81 \x82 \x83 \x84 \x85 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf")]; # Mac-Ukrainian. $Table{mac} = [split(' ', "\x80 \x81 \x82 \x83 \x84 \x85 \xdd \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xde \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff")]; # Microsoft codepage 866. $Table{cp866} = [split(' ', "\x80 \x81 \x82 \x83 \x84 \x85 \xf0 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xf1 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef")]; # Next three are unicode related: (sort-of-)raw, html-transport-encoded, # and utf8 encoded. $Table{uni} = [map {sprintf($UC_format, hex($_))} @Raw_unicode]; $Table{html} = [map {'&#' . hex($_) . ';'} @Raw_unicode]; $Table{utf8} = [map { my $U = hex($_); $U < 2** 7 ? $U : ($U < 2**11 ? pack('C2', $U>> 6|0xc0, $U &0x3f|0x80) : # ($U < 2**16 ? pack('C3', $U>>12|0xe0, $U>> 6&0x3f|0x80, $U&0x3f|0x80) : # ($U < 2**21 ? pack('C4', $U>>18|0xf0, $U>>12&0x3f|0x80, # $U>>6&0x3f|0x80, $U&0x3f|0x80) : die "Unicode character $_ ($U) out of (Cyrillic) range, aborting"); } @Raw_unicode]; my %Reg_table = (uni => $UC_format_regexp, html => '&#[0-9]{4};', utf8 => '[\xC0-\xDF][\x80-\xBF]', # utf8 => '(?:[\xC0-\xDF][\x80-\xBF])|(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])|(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])', ); my $Source = shift @ARGV; my $Target = shift @ARGV; die "$basename: need source and target encodings at the command line.\n" unless (defined($Source) && defined($Target)); die "$basename: Unsupported source encoding $Source \t(try one of @{[keys %Table]})\n" unless (exists($Table{$Source})); die "$basename: Unsupported target encoding $Target \t(try one of @{[keys %Table]})\n" unless (exists($Table{$Target})); if ($Source eq $Target) { $Translator = ''; # I used to die here, but then I thought a filter # should be friendlier. } else { if ($Source eq 'mik') { # mik has no 'IO', so I need to strip 'IO' from both lists. (If mik # is the target, then I just use cyrillic E, "stripping the # diaresis" so to speak.) A similar trick could in principle be # used for a codeset lacking a capital hard sign (but I don't know # what to suggest if such a set is the target!). splice(@{$Table{$Source}}, 6, 1); splice(@{$Table{$Source}}, 39, 1); splice(@{$Table{$Target}}, 6, 1); splice(@{$Table{$Target}}, 39, 1); } if (exists $Reg_table{$Source} || exists $Reg_table{$Target}) { @Reg_translate{@{$Table{$Source}}} = @{$Table{$Target}}; my $Regexp = (exists $Reg_table{$Source}) ? $Reg_table{$Source} : '.'; $Translator = "s/($Regexp)/exists \$Reg_translate{\$1} ? \$Reg_translate{\$1} : \$1/eg;"; } else { $Translator = 'tr[' . join('', @{$Table{$Source}}) . '][' . join('', @{$Table{$Target}}) . '];'; } } # warn "DEBUG: translator is\n*$Translator*\n"; } eval $Translator;