#! /usr/bin/perl use utf8; use strict; use warnings; use Encode; use Getopt::Std; use Unicode::Normalize; use HTML::Entities; my ($Trans_IA1, @Trans_IA2, $Trans_UA1, %Trans_UA2, %Diacr); my ($intext, $code1, $code2, $text, $target_code, $nxtc, $nxtce, $nxtco, $init_bom); our ($opt_h, $opt_b, $opt_c, $opt_E, $opt_A, $opt_X, $opt_T, $opt_B, $opt_a, $opt_i, $opt_w); if (!getopts('hbc:EAXTB:aiw')) { die "Use -h option for help (or -hi options for ISO-8859-1 output).\n"; }; initialize_tables(); # open the big "if -h" if ($opt_h) { $text = < ──────────────── Code to be used for interpreting the bytes that are not part of a valid UTF-8 character (see also option -b). can be any of – a number between 1 and 19 for specifying code ISO-8859- for characters not in the range U+0080 to U+009F, and code Windows-1252 for characters in that range. If is 2, Windows-1250 instead of Windows-1252 is used for the latter purpose. Default is 1. – another number for specifying code Windows- – the name “mac” for specifying code Macintosh's Roman code – any other name for specifying a code name known to Perl's Encode module Option -b ───────── If option -b is set, no attempt is made to identify UTF-8 characters on input. Rather, the code specified in option -c (or the default -c1) is used throughout the input text. Options -E and -A ───────────────── If one or both of these options are set, character entities, that is, special character sequences beginning with “&” and ending with “;”, are replaced by the character they denote. The “;” is mandatory even though it is not always required in SGML. With option -E, the replacement is made for characters outside the range U+0020 to U+007E, with option -A, for characters within that range. This allows special handling of printable ASCII characters that could have a special meaning in a markup language like HTML. Unrecognized entities are left untouched. The following entities are recognized: – “&#nnn;” with a decimal number “nnn” (any number of digits) denoting the Unicode value of the character – “&#xnnn;” with a hexadecimal number “nnn” (any number of digits) denoting the Unicode value of the character – character entities defined in the HTML 4.1 standard – character entities of the form “&cdiacr;” where “c” is any ASCII letter (or “ü” as a matter of convenience for writing Pinyin) and “diacr” is the name of a diacritic in the style in which character entities are defined in the HTML 4.1 standard. The following values of “diacr” may be used: grave (grave accent), acute (acute accent), circ (circumflex accent), tilde, macr (macron), breve, dot (dot above), uml (diaeresis), ring (ring above), dblacute (double acute accent), caron, dotb (dot below), umlb (diaeresis below), ringb (ring below), cedil (cedilla), ogon (ogonek), macrb (macron below). Example: “ş” (ş with cedilla). – a few character entities for other characters: “&dotli;” (dotless ı), “&Lstroke;” (Ł with stroke),“&lstroke;” (ł with stroke), “&longs;” (long ſ), “&Zhook;” (Ȥ with hook), “&zhook;” (ȥ with hook). Option -X ───────── If this option is set, character sequences that look like the byte sequence of the representation of an UTF-8 character in another code (e.g. “ü” where “ü” would be correct) are transformed back. Such sequences occur from recoding something into UTF-8 which is already in UTF-8 (albeit not with this tool unless the option -b is set). Option -T ───────── If this option is set, characters not representable in the output code are transliterated to other characters in an irreversible manner, in order to produce intelligible output. In particular, Latin characters with diacritics are replaced by the base letters. In addition, box drawing characters and various forms of bullets and asterisks are replaced by other characters with a similar look. The effect can be seen by comparing the effect of “$0 -h” and “$0 -hi | $0 -b”. -T is assumed when -h is set. Option -B ───────────────── Option B can have values 0 or 1 and specifies whether a Byte Order Mark (BOM, the character U+FEFF) is to be set at the beginning of the file. Using a BOM can be a means to indicate that a file is in UTF-8 code; it can also be a fatal error when the first bytes of the file contents are interpreted (magic number, shebang; although the Unix “file” command has no problems with it). It is recommended to use “-B 1” (ensure BOM) for a plain text file to be used under Windows, to use “-B 0” (remove BOM) if output is not UTF-8 and option -T is not set, and otherwise to use this option only for a good reason. Other values for the option ar treated like 1 for UTF-8 output and 0 for other output. Default is no change of the file contents regarding BOM. Options -a, -i, and -w ────────────────────── Use ASCII, ISO-8859-1, or Windows-1252 for output instead of UTF-8. Characters not contained in the output code are written as hexadecimal character entities, e.g. “ı” for U+0131 (dotless ı). See also the -T option. FINIS $opt_T = 1; } else { # input decoding undef $/; $intext = ; if ($opt_c) { # input code specified if ($opt_c =~ /^(\d|1\d)$/) { $code1 = "iso-8859-$opt_c"; if ($opt_c eq '2') { $code2 = 'cp1250'; } else { $code2 = 'cp1252'; }; if ($opt_c eq '1' && !$opt_b) { $code1 = ''; }; } elsif ($opt_c =~ /^\d+$/) { $code1 = "cp$opt_c"; } elsif ($opt_c =~ /^mac$/i) { $code1 = 'MacRoman'; } else { $code1 = $opt_c; }; }; if ($code1) { if ($opt_b) { $text = decode($code1, $intext); $intext =''; } else { $text = ''; while ($intext) { $text .= decode('utf8', $intext, Encode::FB_QUIET); if ($intext) { $nxtc = substr($intext, 0, 1); substr($intext, 0, 1) = ''; if ($code2 && (ord($nxtc) >= 128) && (ord($nxtc) < 160)) { $text .= decode($code2, $nxtc); } else { $text .= decode($code1, $nxtc); }; }; }; }; } else { if ($opt_b) { $text = decode('cp1252', $intext); $intext =''; } else { $text = ''; while ($intext) { $text .= decode('utf8', $intext, Encode::FB_QUIET); if ($intext) { $text .= decode('cp1252', substr($intext, 0, 1)); substr($intext, 0, 1) = ''; }; }; }; }; undef $intext; # close the big "if -h" }; # processing of intermediate text # decode entities if ($opt_E || $opt_A) { $text =~ s/\&(\#([0-9]+|x[0-9A-Fa-f]+)|[A-Za-zÜü][A-Za-z0-9.-]*);/entity_value($1)/ge; }; # double UTF-8 encoding if ($opt_X) { $text =~ s/[À-ß][€-¿]|[à-ï][€-¿]{2}/decode('utf8', encode('iso-8859-1', $&))/ge; }; # byte order mark if (defined $opt_B) { if ($opt_B !~ /^[01]$/) { if ($opt_a || $opt_i || $opt_w) { $opt_B = 0; } else { $opt_B = 1; }; }; $text =~ /^[\x{feff}]*/; $init_bom = length $&; if ($init_bom != $opt_B) { substr($text, 0, $init_bom) = chr(0xfeff) x $opt_B; }; }; # output transliteration and encoding if ($opt_a) { $target_code = 'ascii'; } elsif ($opt_i) { $target_code = 'iso-8859-1'; } elsif ($opt_w) { $target_code = 'cp1252'; } else { $target_code = 'utf8'; }; if ($opt_T) { if ($opt_a) { eval ("\$text =~ " . $Trans_IA1); eval ("\$text =~ " . $Trans_UA1); } elsif ($opt_i) { eval ("\$text =~ " . $Trans_UA1); }; while (length $text) { print (encode($target_code, $text, Encode::FB_QUIET)); last unless length($text); $nxtco = ord($text); $nxtc = chr($nxtco); $nxtce = substitute($nxtco); $text =~ s/$nxtc/$nxtce/g; }; } else { print (encode($target_code, $text, Encode::FB_XMLCREF)); }; sub substitute { # replace a character by one that occurs in more codes # whether the replacement is necessary is not checked: the procedure must not be called if unnecessary my $char_codepoint = $_[0]; my $character = chr($char_codepoint); my $decomposed = NFKD($character); my ($result, $test); if ($char_codepoint == 0 || ($char_codepoint >= 0x200b && $char_codepoint <= 0x200f) || $char_codepoint == 0xfeff) { return ''; } elsif ($char_codepoint == 0xa0 || ($char_codepoint >= 0x2000 && $char_codepoint <= 0x200a)) { return ' '; }; $result = $character; eval "\$result =~ " . $Trans_IA1; if ($result ne $character) { return $result; }; if (defined $Trans_IA2[$char_codepoint]) { return $Trans_IA2[$char_codepoint]; }; eval "\$result =~ " . $Trans_UA1; if ($result ne $character) { if ($target_code eq 'ascii' && $result eq 'Ø') { $result = '0'; }; return $result; }; if (defined $Trans_UA2{$character}) { if ($target_code eq 'ascii' && $result eq '¬=') { $result = '!='; }; return $Trans_UA2{$character}; }; $decomposed =~ tr=⁄=/=; if ($decomposed =~ /^ ./) { $decomposed = ''; }; while (length($decomposed)) { # spell($decomposed); $result = NFC($decomposed); $test = $result; encode($target_code, $test, Encode::FB_QUIET); if (! length($test)) { return $result; }; chop $decomposed; }; return (sprintf('&#x%x;', $_[0])); }; sub entity_value { my $x = $_[0]; my (%cache, $cc); my ($result, $num, $diacr); if (! defined($cache{'dotli'})) { $cache{'dotli'} = 'ı'; $cache{'Lstroke'} = 'Ł'; $cache{'lstroke'} = 'ł'; $cache{'longs'} = 'ſ'; $cache{'Zhook'} = 'Ȥ'; $cache{'zhook'} = 'ȥ'; }; if ($cc = $cache{$x}) { return $cc; }; if ($x =~ /^[#]x/) { $num = hex($'); } elsif ($x =~ /^[#]/) { $num = $'; } else { $result = decode_entities("\&$x;"); if (length($result) == 1) { $num = ord($result); } else { $num = -1; }; }; if ($num >= 0) { if ($num < 0x20 || $num > 0x7e) { if ($opt_E) { $result = chr($num); } else { $result = "\&$x;"; }; } else { if ($opt_A) { $result = chr($num); } else { $result = "\&$x;"; }; }; $cache{$x} = $result; return $result; }; if ($opt_E && ($x =~ /^[A-Za-zÜü]/)) { if ($diacr = $Diacr{$'}) { $result = NFC("$&$diacr"); $cache{$x} = $result; return $result; }; }; $result = "\&$x;"; $cache{$x} = $result; return $result; }; sub spell { print "+++++ "; foreach my $c (split(//, $_[0])) { printf("%4.4x ", ord($c)); }; print "\n"; }; sub initialize_tables { # ISO to one ASCII character my $Trans_IA1_from = ' ¢¤¦¨µ·Ð×ð­'; my $Trans_IA1_to = ' c$|"u*D*d-'; $Trans_IA1 = "tr/$Trans_IA1_from/$Trans_IA1_to/;"; # ISO to more than one ASCII character $Trans_IA2[0xa9] = '(c)'; $Trans_IA2[0xae] = '(R)'; $Trans_IA2[0xb1] = '+/-'; $Trans_IA2[0xc4] = 'Ae'; $Trans_IA2[0xc5] = 'Aa'; $Trans_IA2[0xc6] = 'AE'; $Trans_IA2[0xd6] = 'Oe'; $Trans_IA2[0xd8] = 'Oe'; $Trans_IA2[0xdc] = 'Ue'; $Trans_IA2[0xde] = 'Th'; $Trans_IA2[0xdf] = 'ss'; $Trans_IA2[0xe4] = 'ae'; $Trans_IA2[0xe5] = 'aa'; $Trans_IA2[0xe6] = 'ae'; $Trans_IA2[0xf6] = 'oe'; $Trans_IA2[0xf8] = 'oe'; $Trans_IA2[0xfc] = 'ue'; $Trans_IA2[0xfe] = 'th'; # non-ISO to one ASCII/ISO character my $Trans_UA1_from = q(‘’‚‛‹›ʹʼˈ′“”„‟«»ˮʺ″ǃ∗✱•∙■□▪▫○◌◍◎●◦∘⁄∕∶〈〉˄ˆ⌃ˍˋ‵ǀ∣❘˜∼│┃┆┇┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╎╏║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╪╫╬╵╷╹╻╽╾╿∅ĐđĦħıĸŁłʼnŦŧƀƁƓƘƙƵƶǤǥȤȥ∖‐‑‒–—―─━┄┅┈┉┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻╌╍═╤╥╦╧╨╩╶╸╺╼╴); my $Trans_UA1_to = q(''''''''''"""""""""!*****o..o.++*oo//:<>^^^_``|||~~||||||++++++++++++++++||||||||||||||||++++++++++++++++|||++++++++++++||||||+++|||||||ØDdHhiqLlnTtbBGKkZzGgZz\\\-); $Trans_UA1 = "tr,$Trans_UA1_from,$Trans_UA1_to,;"; # non-ISO to zero or more than one ASCII/ISO character $Trans_UA2{'€'} = 'EUR'; $Trans_UA2{'Ŋ'} = 'Ng'; $Trans_UA2{'ŋ'} = 'ng'; $Trans_UA2{'Œ'} = 'Oe'; $Trans_UA2{'œ'} = 'oe'; $Trans_UA2{'←'} = '<-'; $Trans_UA2{'→'} = '->'; $Trans_UA2{'↔'} = '<->'; $Trans_UA2{'⇐'} = '<='; $Trans_UA2{'⇒'} = '=>'; $Trans_UA2{'⇔'} = '<=>'; $Trans_UA2{'∞'} = 'oo'; $Trans_UA2{'≠'} = '¬='; $Trans_UA2{'≤'} = '<='; $Trans_UA2{'≥'} = '>='; $Trans_UA2{'≦'} = '<='; $Trans_UA2{'≧'} = '>='; $Trans_UA2{'…'} = '...'; # diacritics $Diacr{'grave'} = chr(0x300); $Diacr{'acute'} = chr(0x301); $Diacr{'circ'} = chr(0x302); $Diacr{'tilde'} = chr(0x303); $Diacr{'macr'} = chr(0x304); $Diacr{'breve'} = chr(0x306); $Diacr{'dot'} = chr(0x307); $Diacr{'uml'} = chr(0x308); $Diacr{'ring'} = chr(0x30a); $Diacr{'dblacute'} = chr(0x30b); $Diacr{'caron'} = chr(0x30c); $Diacr{'dotb'} = chr(0x323); $Diacr{'umlb'} = chr(0x324); $Diacr{'ringb'} = chr(0x325); $Diacr{'cedil'} = chr(0x327); $Diacr{'ogon'} = chr(0x328); $Diacr{'macrb'} = chr(0x331); };