#! /usr/bin/perl use utf8; use strict; use warnings; use Encode; use Getopt::Std; use Unicode::Normalize; use HTML::Entities; my ($Trans_IA1, @Trans_IA2, $Trans_UA1, %Trans_UA2, %Diacr); my ($intext, $code1, $code2, $text, $target_code, $nxtc, $nxtce, $nxtco, $init_bom); our ($opt_h, $opt_b, $opt_c, $opt_E, $opt_A, $opt_X, $opt_T, $opt_B, $opt_a, $opt_i, $opt_w); if (!getopts('hbc:EAXTB:aiw')) { die "Use -h option for help (or -hi options for ISO-8859-1 output).\n"; }; initialize_tables(); # open the big "if -h" if ($opt_h) { $text = < ──────────────── Code to be used for interpreting the bytes that are not part of a valid UTF-8 character (see also option -b).
 can be any of

 – a number between 1 and 19 for specifying code ISO-8859- for
   characters not in the range U+0080 to U+009F, and code Windows-1252 for
   characters in that range. If  is 2, Windows-1250 instead of
   Windows-1252 is used for the latter purpose. Default is 1.

 – another number for specifying code Windows-

 – the name “mac” for specifying code Macintosh's Roman code

 – any other name for specifying a code name known to Perl's Encode module


Option -b
─────────

If option -b is set, no attempt is made to identify UTF-8 characters on
input. Rather, the code specified in option -c (or the default -c1) is used
throughout the input text.


Options -E and -A
─────────────────

If one or both of these options are set, character entities, that is, special
character sequences beginning with “&” and ending with “;”, are replaced by
the character they denote. The “;” is mandatory even though it is not always
required in SGML.  With option -E, the replacement is made for characters
outside the range U+0020 to U+007E, with option -A, for characters within that
range. This allows special handling of printable ASCII characters that could
have a special meaning in a markup language like HTML.

Unrecognized entities are left untouched. The following entities are
recognized:

 – “&#nnn;” with a decimal number “nnn” (any number of digits) denoting the
   Unicode value of the character

 – “&#xnnn;” with a hexadecimal number “nnn” (any number of digits) denoting the
   Unicode value of the character

 – character entities defined in the HTML 4.1 standard

 – character entities of the form “&cdiacr;” where “c” is any ASCII letter (or
   “ü” as a matter of convenience for writing Pinyin) and “diacr” is the name
   of a diacritic in the style in which character entities are defined in the
   HTML 4.1 standard. The following values of “diacr” may be used: grave
   (grave accent), acute (acute accent), circ (circumflex accent), tilde, macr
   (macron), breve, dot (dot above), uml (diaeresis), ring (ring above),
   dblacute (double acute accent), caron, dotb (dot below), umlb (diaeresis
   below), ringb (ring below), cedil (cedilla), ogon (ogonek), macrb (macron
   below). Example: “ş” (ş with cedilla).

 – a few character entities for other characters: “&dotli;” (dotless ı),
   “&Lstroke;” (Ł with stroke),“&lstroke;” (ł with stroke),
   “&longs;” (long ſ), “&Zhook;” (Ȥ with hook), “&zhook;” (ȥ with hook).


Option -X
─────────

If this option is set, character sequences that look like the byte sequence of
the representation of an UTF-8 character in another code (e.g. “Ã¼” where “ü”
would be correct) are transformed back.  Such sequences occur from recoding
something into UTF-8 which is already in UTF-8 (albeit not with this tool
unless the option -b is set).


Option -T
─────────

If this option is set, characters not representable in the output code are
transliterated to other characters in an irreversible manner, in order to
produce intelligible output. In particular, Latin characters with diacritics
are replaced by the base letters. In addition, box drawing characters and
various forms of bullets and asterisks are replaced by other characters with a
similar look.  The effect can be seen by comparing the effect of
“$0 -h” and “$0 -hi | $0 -b”.

-T is assumed when -h is set.


Option -B 
─────────────────

Option B can have values 0 or 1 and specifies whether a Byte Order Mark (BOM,
the character U+FEFF) is to be set at the beginning of the file. Using a BOM
can be a means to indicate that a file is in UTF-8 code; it can also be a
fatal error when the first bytes of the file contents are interpreted (magic
number, shebang; although the Unix “file” command has no problems with it).
It is recommended to use “-B 1” (ensure BOM) for a plain text file to be used
under Windows, to use “-B 0” (remove BOM) if output is not UTF-8 and option -T
is not set, and otherwise to use this option only for a good reason.

Other values for the option ar treated like 1 for UTF-8 output and 0 for other
output. Default is no change of the file contents regarding BOM.


Options -a, -i, and -w
──────────────────────

Use ASCII, ISO-8859-1, or Windows-1252 for output instead of
UTF-8. Characters not contained in the output code are written as hexadecimal
character entities, e.g. “ı” for U+0131 (dotless ı). See also the -T
option.

FINIS
  $opt_T = 1;
} else {


# input decoding

undef $/;
$intext = ;

if ($opt_c) { # input code specified
  if ($opt_c =~ /^(\d|1\d)$/) {
    $code1 = "iso-8859-$opt_c";
    if ($opt_c eq '2') {
      $code2 = 'cp1250';
    } else {
      $code2 = 'cp1252';
    };
    if ($opt_c eq '1' && !$opt_b) {
      $code1 = '';
    };
  } elsif ($opt_c =~ /^\d+$/) {
    $code1 = "cp$opt_c";
  } elsif ($opt_c =~ /^mac$/i) {
    $code1 = 'MacRoman';
  } else {
    $code1 = $opt_c;
  };
};

if ($code1) {
  if ($opt_b) {
    $text = decode($code1, $intext);
    $intext ='';
  } else {
    $text = '';
    while ($intext) {
      $text .= decode('utf8', $intext, Encode::FB_QUIET);
      if ($intext) {
        $nxtc = substr($intext, 0, 1);
        substr($intext, 0, 1) = '';
        if ($code2 && (ord($nxtc) >= 128) && (ord($nxtc) < 160)) {
          $text .= decode($code2, $nxtc);
        } else {
          $text .= decode($code1, $nxtc);
        };
      };
    };
  };
} else {
  if ($opt_b) {
    $text = decode('cp1252', $intext);
    $intext ='';
  } else {
    $text = '';
    while ($intext) {
      $text .= decode('utf8', $intext, Encode::FB_QUIET);
      if ($intext) {
	$text .= decode('cp1252', substr($intext, 0, 1));
	substr($intext, 0, 1) = '';
      };
    };
  };
};
undef $intext;


# close the big "if -h"

};


# processing of intermediate text

# decode entities 

if ($opt_E  || $opt_A) {
  $text =~ s/\&(\#([0-9]+|x[0-9A-Fa-f]+)|[A-Za-zÜü][A-Za-z0-9.-]*);/entity_value($1)/ge;
};


# double UTF-8 encoding

if ($opt_X) {
  $text =~ s/[À-ß][-¿]|[à-ï][-¿]{2}/decode('utf8', encode('iso-8859-1', $&))/ge;
};


# byte order mark

if (defined $opt_B) {
  if ($opt_B !~ /^[01]$/) {
    if ($opt_a || $opt_i || $opt_w) {
      $opt_B = 0;
    } else {
      $opt_B = 1;
    };
  };
  $text =~ /^[\x{feff}]*/;
  $init_bom = length $&;
  if ($init_bom != $opt_B) {
    substr($text, 0, $init_bom) = chr(0xfeff) x $opt_B;
  };
};


# output transliteration and encoding

if ($opt_a) {
  $target_code = 'ascii';
} elsif ($opt_i) {
  $target_code = 'iso-8859-1';
} elsif ($opt_w) {
  $target_code = 'cp1252';
} else {
  $target_code = 'utf8';
};

if ($opt_T) {
  if ($opt_a) {
    eval ("\$text =~ " . $Trans_IA1);
    eval ("\$text =~ " . $Trans_UA1);
  } elsif ($opt_i) {
    eval ("\$text =~ " . $Trans_UA1);
  };
  while (length $text) {
    print (encode($target_code, $text, Encode::FB_QUIET));
    last unless length($text);
    $nxtco = ord($text);
    $nxtc = chr($nxtco);
    $nxtce = substitute($nxtco);
    $text =~ s/$nxtc/$nxtce/g;
  };
} else {
  print (encode($target_code, $text, Encode::FB_XMLCREF));
};


sub substitute {
  # replace a character by one that occurs in more codes
  # whether the replacement is necessary is not checked: the procedure must not be called if unnecessary 

  my $char_codepoint = $_[0]; 
  my $character = chr($char_codepoint);
  my $decomposed = NFKD($character);
  my ($result, $test);

  if ($char_codepoint == 0 || ($char_codepoint >= 0x200b && $char_codepoint <= 0x200f) || $char_codepoint == 0xfeff) {
    return '';
  } elsif ($char_codepoint == 0xa0 || ($char_codepoint >= 0x2000 && $char_codepoint <= 0x200a)) {
    return ' ';
  };

  $result = $character;
  eval "\$result =~ " . $Trans_IA1;

  if ($result ne $character) {
    return $result;
  };

  if (defined $Trans_IA2[$char_codepoint]) {
    return $Trans_IA2[$char_codepoint];
  };

  eval "\$result =~ " . $Trans_UA1;

  if ($result ne $character) {
    if ($target_code eq 'ascii' && $result eq 'Ø') {
      $result = '0';
    };
    return $result;
  };

  if (defined $Trans_UA2{$character}) {
    if ($target_code eq 'ascii' && $result eq '¬=') {
      $result = '!=';
    };
    return $Trans_UA2{$character};
  };

  $decomposed =~ tr=⁄=/=;
  if ($decomposed =~ /^ ./) {
    $decomposed = '';
  };

  while (length($decomposed)) { # spell($decomposed);
    $result = NFC($decomposed);
    $test = $result;
    encode($target_code, $test, Encode::FB_QUIET);
    if (! length($test)) {
      return $result;
    };
    chop $decomposed;
  };
  return (sprintf('&#x%x;', $_[0]));
};


sub entity_value {
  my $x = $_[0];
  my (%cache, $cc);
  my ($result, $num, $diacr);

  if (! defined($cache{'dotli'})) {
    $cache{'dotli'} = 'ı';
    $cache{'Lstroke'} = 'Ł';
    $cache{'lstroke'} = 'ł';
    $cache{'longs'} = 'ſ';
    $cache{'Zhook'} = 'Ȥ';
    $cache{'zhook'} = 'ȥ';
  };

  if ($cc = $cache{$x}) {
    return $cc;
  };

  if ($x =~ /^[#]x/) {
    $num = hex($'); 
  } elsif ($x =~ /^[#]/) {
    $num = $'; 
  } else {
    $result = decode_entities("\&$x;");
    if (length($result) == 1) {
      $num = ord($result);
    } else {
      $num = -1;
    };
  };

  if ($num >= 0) {
    if ($num < 0x20 || $num > 0x7e) {
      if ($opt_E) {
        $result = chr($num);
      } else {
        $result = "\&$x;";
      };
    } else {
      if ($opt_A) {
        $result = chr($num);
      } else {
        $result = "\&$x;";
      };
    };
    $cache{$x} = $result;
    return $result;
  };

  if ($opt_E && ($x =~ /^[A-Za-zÜü]/)) {
    if ($diacr = $Diacr{$'}) {
      $result = NFC("$&$diacr");
      $cache{$x} = $result;
      return $result;
    };
  };

  $result = "\&$x;";
  $cache{$x} = $result;
  return $result;
};


sub spell {
  print "+++++ ";
  foreach my $c (split(//, $_[0])) {
    printf("%4.4x ", ord($c));
  };
  print "\n";
};


sub initialize_tables {

# ISO to one ASCII character

  my $Trans_IA1_from = ' ¢¤¦¨µ·Ð×ð';
  my $Trans_IA1_to   = ' c$|"u*D*d-';
  $Trans_IA1 = "tr/$Trans_IA1_from/$Trans_IA1_to/;";

# ISO to more than one ASCII character

  $Trans_IA2[0xa9] = '(c)';
  $Trans_IA2[0xae] = '(R)';
  $Trans_IA2[0xb1] = '+/-';
  $Trans_IA2[0xc4] = 'Ae';
  $Trans_IA2[0xc5] = 'Aa';
  $Trans_IA2[0xc6] = 'AE';
  $Trans_IA2[0xd6] = 'Oe';
  $Trans_IA2[0xd8] = 'Oe';
  $Trans_IA2[0xdc] = 'Ue';
  $Trans_IA2[0xde] = 'Th';
  $Trans_IA2[0xdf] = 'ss';
  $Trans_IA2[0xe4] = 'ae';
  $Trans_IA2[0xe5] = 'aa';
  $Trans_IA2[0xe6] = 'ae';
  $Trans_IA2[0xf6] = 'oe';
  $Trans_IA2[0xf8] = 'oe';
  $Trans_IA2[0xfc] = 'ue';
  $Trans_IA2[0xfe] = 'th';

# non-ISO to one ASCII/ISO character

  my $Trans_UA1_from = q(‘’‚‛‹›ʹʼˈ′“”„‟«»ˮʺ″ǃ∗✱•∙■□▪▫○◌◍◎●◦∘⁄∕∶〈〉˄ˆ⌃ˍˋ‵ǀ∣❘˜∼│┃┆┇┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╎╏║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╪╫╬╵╷╹╻╽╾╿∅ĐđĦħıĸŁłŉŦŧƀƁƓƘƙƵƶǤǥȤȥ∖‐‑‒–—―─━┄┅┈┉┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻╌╍═╤╥╦╧╨╩╶╸╺╼╴);
  my $Trans_UA1_to   = q(''''''''''"""""""""!*****o..o.++*oo//:<>^^^_``|||~~||||||++++++++++++++++||||||||||||||||++++++++++++++++|||++++++++++++||||||+++|||||||ØDdHhiqLlnTtbBGKkZzGgZz\\\-);
  $Trans_UA1 = "tr,$Trans_UA1_from,$Trans_UA1_to,;";

# non-ISO to zero or more than one ASCII/ISO character

  $Trans_UA2{'€'} = 'EUR';
  $Trans_UA2{'Ŋ'} = 'Ng';
  $Trans_UA2{'ŋ'} = 'ng';
  $Trans_UA2{'Œ'} = 'Oe';
  $Trans_UA2{'œ'} = 'oe';
  $Trans_UA2{'←'} = '<-';
  $Trans_UA2{'→'} = '->';
  $Trans_UA2{'↔'} = '<->';
  $Trans_UA2{'⇐'} = '<=';
  $Trans_UA2{'⇒'} = '=>';
  $Trans_UA2{'⇔'} = '<=>';
  $Trans_UA2{'∞'} = 'oo';
  $Trans_UA2{'≠'} = '¬=';
  $Trans_UA2{'≤'} = '<=';
  $Trans_UA2{'≥'} = '>=';
  $Trans_UA2{'≦'} = '<=';
  $Trans_UA2{'≧'} = '>=';
  $Trans_UA2{'…'} = '...';

# diacritics

  $Diacr{'grave'} = chr(0x300);
  $Diacr{'acute'} = chr(0x301);
  $Diacr{'circ'} = chr(0x302);
  $Diacr{'tilde'} = chr(0x303);
  $Diacr{'macr'} = chr(0x304);
  $Diacr{'breve'} = chr(0x306);
  $Diacr{'dot'} = chr(0x307);
  $Diacr{'uml'} = chr(0x308);
  $Diacr{'ring'} = chr(0x30a);
  $Diacr{'dblacute'} = chr(0x30b);
  $Diacr{'caron'} = chr(0x30c);
  $Diacr{'dotb'} = chr(0x323);
  $Diacr{'umlb'} = chr(0x324);
  $Diacr{'ringb'} = chr(0x325);
  $Diacr{'cedil'} = chr(0x327);
  $Diacr{'ogon'} = chr(0x328);
  $Diacr{'macrb'} = chr(0x331);

};