script – bad-utf8

#!/usr/bin/env perl
# $Id: script-bad-utf8.html,v 1.2 2020/08/17 17:44:19 tom Exp $
# -----------------------------------------------------------------------------
# Copyright 2020 by Thomas E. Dickey
#
#                         All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name(s) of the above copyright
# holders shall not be used in advertising or otherwise to promote the
# sale, use or other dealings in this Software without prior written
# authorization.
# -----------------------------------------------------------------------------
# Use the cursor-position response to make a copy of Markus Kuhn's sample of
# bad UTF-8 which is adjusted for a given terminal.   Also, update a CSV file
# which shows the number of differences against the target file.
#
# NOTE: when running this script, it helps to first reset the terminal, since
# most terminal emulators other than xterm have fragile/stateful error handling,
# and will produce inconsistent results.
#
# Further reading
# http://unicode.org/mail-arch/unicode-ml/Archives-Old/UML015/0145.html
# "UTF-8 stress test"
# by Markus Kuhn
#
# https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
# https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
# "UTF-8 decoder capability and stress test"
# by Markus Kuhn
#
# https://hsivonen.fi/broken-utf-8/
# "How Many REPLACEMENT CHARACTERs?"
# by Henri Sivonen
#
# http://www.unicode.org/L2/L2019/19192-review-docs.pdf
# "Review of Unicode 2018 Henri Sivonen docs"
# by Markus Scherer

# TODO: sort cases as version-strings

use strict;
use warnings;
use Term::ReadKey;
use Getopt::Std;
use Text::CSV;

our ( $opt_d, $opt_i, $opt_m, $opt_n, $opt_q, $opt_r, $opt_s, $opt_t );

our $destdir    = "/users/ftp/httpdocs/xterm/bad-utf8";
our $rootname   = "UTF-8-test";
our $sourcefile = "$rootname.txt";

our @sourcefile;

our $tty;
our $crlf = "\r\n";

# This is a hash of testcases into the data for the rows.
# The data for each row hashes column header (terminal name) to adjustments.
our %csv_file;

our ( $oldx, $oldy );
our ( $maxx, $maxy );

sub abspath($) {
    my $path = shift;
    $path = "$destdir/$path" unless ( $path =~ /^(\.){0,2}\// );
    return $path;
}

sub filesize($) {
    my $filename = shift;
    my (
        $dev,  $ino,   $mode,  $nlink, $uid,     $gid, $rdev,
        $size, $atime, $mtime, $ctime, $blksize, $blocks
    ) = stat($filename);
    return $size;
}

sub readfile($) {
    my $filename = shift;
    my $encoding = ":raw :bytes";
    open( my $fh, "< $encoding", $filename ) || do {
        print STDERR "Can't open $filename: $!\n";
        exit;
    };
    binmode($fh);
    read $fh, my $bytes, &filesize($filename);
    close($fh);
    @sourcefile = split /\n/, $bytes;
}

sub writefile($$) {
    my $filename = $_[0];
    my @data     = @{ $_[1] };
    my $encoding = ":raw :bytes";
    open( my $fh, "> $encoding", $filename ) || do {
        print STDERR "Can't open $filename: $!\n";
        exit;
    };
    binmode($fh);
    use bytes;
    for my $n ( 0 .. $#data ) {
        print $fh $data[$n] . "\n";
    }
    no bytes;
    close($fh);
}

sub get_reply($) {
    my $command = $_[0];
    my $reply   = "";

    syswrite $tty, $command;
    while (1) {
        my $test = ReadKey 0.10;
        last if not defined $test;

        $reply .= $test;
    }
    return $reply;
}

sub move($$) {
    my $y = $_[0] + 1;
    my $x = $_[1] + 1;
    syswrite $tty, "\033[$y;$x" . "H";
}

sub clear() {
    syswrite $tty, "\033[J";
}

sub whereami() {
    my @coords = ( 0, 0 );
    my $reply  = &get_reply("\033[6n");
    my $y      = 0;
    my $x      = 0;
    if ( $reply =~ /^\033\[\d+;\d+R$/ ) {
        $reply =~ s/^\033\[//;
        $reply =~ s/R//;
        my @coords = split /;/, $reply;
        $y = $coords[0];
        $x = $coords[1];
    }
    return ( $y, $x );
}

sub newline() {
    if ($opt_q) {
        &move( $oldy - 1, $oldx - 1 );
        &clear;
    }
    else {
        syswrite $tty, $crlf;
    }
}

sub csv_file() {
    return &abspath("$rootname.csv");
}

sub read_csv() {
    my $csv = Text::CSV->new( { binary => 1 } )   # should set binary attribute.
      or die "Cannot use CSV: " . Text::CSV->error_diag();
    if ( open my $fh, "<", &csv_file ) {
        my $first = 1;
        my @head;
        while ( my $row = $csv->getline($fh) ) {
            if ($first) {
                @head  = @$row;
                $first = 0;
            }
            else {
                my @data = @$row;
                next if ( $data[0] !~ /^\d+(\.\d+)*$/ );
                my %data;
                for my $c ( 0 .. $#data ) {
                    $data{ $head[$c] } = $data[$c];
                }
                $csv_file{ $data[0] } = \%data;
            }
        }
        $csv->eof or $csv->error_diag();
        close $fh;
    }
}

sub write_csv() {
    my $csv = Text::CSV->new( { binary => 1 } )   # should set binary attribute.
      or die "Cannot use CSV: " . Text::CSV->error_diag();
    if ( open my $fh, ">", &csv_file ) {
        my $first = 1;
        my @head;
        my @sums;
        for my $data ( sort keys %csv_file ) {
            my %data = %{ $csv_file{$data} };
            if ($first) {
                $first = 0;

                # "(tests)" will sort on the left
                @head = ( sort keys %data );
                $csv->print( $fh, $_ ) for \@head;
                print $fh "\n";
            }
            my @data;
            for my $h ( 0 .. $#head ) {
                $data[$h] = $data{ $head[$h] };
                $sums[$h] += $data{ $head[$h] } if ( $h > 0 );
            }
            $csv->print( $fh, $_ ) for \@data;
            print $fh "\n";
        }
        $sums[0] = "totals";
        $csv->print( $fh, $_ ) for \@sums;
        print $fh "\n";
        close $fh;
    }
}

sub analyze($) {
    my $target = shift;

    my @targetfile;
    my %mismatches;

    my $GOAL = 80;
    my $quit = 0;

    open $tty, ">", "/dev/tty" or die "Cannot open /dev/tty ";
    binmode($tty);
    ReadMode 'ultra-raw';

    ( $oldy, $oldx ) = &whereami;
    &move( 9999, 9999 );
    ( $maxy, $maxx ) = &whereami;
    &move( $oldy - 1, $oldx - 1 );

    my $testcase = "";

    for my $row ( 0 .. $#sourcefile ) {
        $targetfile[$row] = $sourcefile[$row];
        if ( $sourcefile[$row] =~ /^\d+(\.\d+){0,2}\b/ ) {
            $testcase = $sourcefile[$row];
            $testcase =~ s/[^\d\.].*$//;
            $mismatches{$testcase} = 0;
        }
        syswrite $tty, $sourcefile[$row];
        use bytes;

        my $actual = length( $sourcefile[$row] );
        my $marker = index( $sourcefile[$row], "|" );
        if ( $actual <= 0 or $marker < $actual - 1 ) {
            if (    $actual == 79
                and substr( $sourcefile[$row], $actual - 1 ) eq " "
                and ( $testcase eq "2.1.1" or $testcase eq "2.2.1" ) )
            {

                # Markus' first version in September 1999 was okay, but he
                # omitted the marker on all subsequent versions:  the last
                # character is a blank rather than the vertical bar.  Here is a
                # simple workaround.
                $sourcefile[$row] =
                  substr( $sourcefile[$row], 0, $actual - 1 ) . "|";
                $targetfile[$row] = $sourcefile[$row];
            }
            else {
                &newline;
                next;
            }
        }

        my ( $nowy, $nowx ) = &whereami;

        # try to handle wrapping, but best results are on a wide terminal
        $nowx += $maxx if ( $nowx < ( $maxx / 4 ) );

        syswrite $tty, "$nowx:$actual" if ($opt_d);

        if ( $nowx != $GOAL ) {
            if ($opt_m) {
                my $mods = ( $GOAL - $nowx );
                $mods = -$mods if ( $mods < 0 );
                $mismatches{$testcase} += $mods;
            }
            else {
                $mismatches{$testcase}++;
            }
            &newline;
            while ( length( $targetfile[$row] ) < $GOAL ) {
                my $part =
                  substr( $targetfile[$row], 0,
                    length( $targetfile[$row] ) - 1 )
                  . " |";
                $targetfile[$row] = $part;
            }
            my $endbar = "";
            my $aligns = "|";
            my $limits = length( $targetfile[$row] );
            while ( length($aligns) < $limits ) {
                my $check =
                  substr( $targetfile[$row], $limits - length($aligns) );
                last if ( $check ne $aligns );
                $endbar = $aligns;
                $aligns = " " . $aligns;
            }
            my $have = length($endbar);
            if ( $nowx > $GOAL ) {
                my $want = 1 + ( $nowx - $GOAL );
                my $diff = 0;
                if ( $want > $have ) {
                    $want = $have;
                }
                $targetfile[$row] =
                  substr( $targetfile[$row], 0, $actual - $want ) . "|";
            }
            else {
                my $want = ( $GOAL - $nowx );
                my $diff = 0;
                $diff = $want - 1;
                $want = 1;
                my $spaces = sprintf( "%*s|", $diff + $want, " " );
                $targetfile[$row] =
                  substr( $targetfile[$row], 0, $actual - $want ) . $spaces;
            }
            if ($opt_r) {
                &move( $nowy - 2, 0 ) if ( $nowy == $maxy );
                &move( $nowy - 1, 0 ) if ( $nowy != $maxy );
                &clear;
            }
            syswrite $tty, $targetfile[$row];
            syswrite $tty, "NEW" if ($opt_d);
        }

        no bytes;
        if ($opt_i) {
            my $key = ReadKey 0;
            if ( $key eq "q" ) {
                $quit = 1;
                last;
            }
            &newline;
        }
        else {
            &newline;
        }
    }

    ReadMode 'restore';
    syswrite $tty, $crlf;

    return if ($quit);

    # Record the adjustments
    my $targetfile = &abspath("$rootname:$target.txt");
    &writefile( $targetfile, \@targetfile );

    # Record the information to allow comparing different terminals
    &read_csv;
    for my $testcase ( keys %mismatches ) {
        my %data;
        %data = %{ $csv_file{$testcase} } if ( defined $csv_file{$testcase} );
        $data{"(tests)"}     = $testcase;
        $data{$target}       = $mismatches{$testcase};
        $csv_file{$testcase} = \%data;
    }
    &write_csv;
}

sub main::HELP_MESSAGE() {
    printf STDERR <<EOF
Usage: $0 [options] target

Options:

-d         debugging shows details in margin (needs wide terminal)
-i         interactive, wait after each line (q exits)
-m         show the amount of modification rather than the number of failures
-n         do not update the target file
-q         quiet (do not scroll; just process on a single line)
-r         replace current line rather than adding changed-line
-s source  use a different source than
           $sourcefile
-t dir     use different directory for source/analysis files than
           $destdir
EOF
      ;
    exit;
}

$Getopt::Std::STANDARD_HELP_VERSION = 1;
&getopts('dimnqrs:t:') || &main::HELP_MESSAGE;
&main::HELP_MESSAGE if ( $opt_q and $opt_r );

$destdir    = $opt_t if ($opt_t);
$sourcefile = $opt_s if ($opt_s);
$sourcefile = &abspath($sourcefile);

$rootname = $sourcefile;
$rootname =~ s,\.[^./]*$,,;
$rootname =~ s,^.*/,,;
$rootname .= "-m" if ($opt_m);

&readfile($sourcefile);

if ( -t 0 and -t 2 ) {
    printf "** Interactive$crlf";
    autoflush STDOUT 1;
    &main::HELP_MESSAGE unless ( $#ARGV == 0 );
    &analyze( $ARGV[0] );
}
else {
    printf "** Non-Interactive\n";
    for my $n ( 0 .. $#sourcefile ) {
        printf( "%d:%d:%s\n",
            $n + 1, length( $sourcefile[$n] ),
            $sourcefile[$n] );
    }
}

1;